diff --git a/.circleci/config.yml b/.circleci/config.yml index 7296e07ca396e7..3a4bae2984534b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,9 +7,11 @@ jobs: steps: - checkout - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest ftfy spacy + - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ --runslow + - run: python -m pytest -sv tests/ --cov + - run: codecov build_py2: working_directory: ~/pytorch-pretrained-BERT docker: @@ -17,10 +19,11 @@ jobs: steps: - checkout - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest spacy - - run: sudo pip install ftfy==4.4.3 + - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ --runslow + - run: python -m pytest -sv tests/ --cov + - run: codecov workflows: version: 2 build_and_test: diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000000000..fe05dda9a8f2c2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,8 @@ +[run] +source=pytorch_pretrained_bert +[report] +exclude_lines = + pragma: no cover + raise + except + register_parameter \ No newline at end of file diff --git a/README.md b/README.md index fde35d23ea9d72..2c67dc65e89140 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ This package comprises the following classes that can be imported in Python and - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file): - - `OpenAIGPTAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. + - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files): - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files. @@ -309,6 +309,28 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] assert predicted_token == '.' ``` +And how to use `OpenAIGPTDoubleHeadsModel` + +```python +# Load pre-trained model (weights) +model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') +model.eval() + +# Prepare tokenized input +text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" +text2 = "Who was Jim Henson ? 
Jim Henson was a mysterious young man" +tokenized_text1 = tokenizer.tokenize(text1) +tokenized_text2 = tokenizer.tokenize(text2) +indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) +indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) +tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) +mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + +# Predict hidden states features for each layer +with torch.no_grad(): + lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) +``` + ### Transformer-XL Here is a quick-start example using `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLModelLMHeadModel` class with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes. @@ -456,6 +478,29 @@ predicted_index = torch.argmax(predictions_2[0, -1, :]).item() predicted_token = tokenizer.decode([predicted_index]) ``` +And how to use `GPT2DoubleHeadsModel` + +```python +# Load pre-trained model (weights) +model = GPT2DoubleHeadsModel.from_pretrained('gpt2') +model.eval() + +# Prepare tokenized input +text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" +text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" +tokenized_text1 = tokenizer.tokenize(text1) +tokenized_text2 = tokenizer.tokenize(text2) +indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) +indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) +tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) +mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + +# Predict hidden states features for each layer +with torch.no_grad(): + lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids) +``` + + ## Doc Here is a detailed documentation of the classes in the package and how to use them: @@ -471,10 +516,12 @@ Here is a detailed documentation of the classes in the package and how to use th ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump -To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as +### `from_pretrained()` method + +To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method: ```python -model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None) +model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs) ``` where @@ -491,9 +538,14 @@ where - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters - - `openai-gpt`: OpenAI English model, 12-layer, 768-hidden, 12-heads, 110M parameters - - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters + - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance 
Evaluation](https://deepset.ai/german-bert) + - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) + - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) + - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869* + - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters + - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters + - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters - a path or url to a pretrained model archive containing: @@ -501,7 +553,12 @@ where - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`) If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`). + - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information). +- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint +- `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models +- `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) + `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository. 
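To make the `from_pretrained()` argument list above concrete, here is a minimal sketch (not part of the diff itself) of loading a tokenizer and a classification model with an explicit `cache_dir` and the `num_labels` keyword mentioned above; the rank value is a stand-in for `args.local_rank` in a distributed training script:

```python
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

# Hypothetical per-process cache directory so that several distributed workers
# do not try to download the same weights into the same folder at once.
local_rank = 0  # stand-in for args.local_rank
cache_dir = './pretrained_model_{}'.format(local_rank)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True,
                                          cache_dir=cache_dir)
# num_labels is forwarded through *inputs/**kwargs to the task-specific class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      cache_dir=cache_dir,
                                                      num_labels=2)
model.eval()
```

The same pattern applies to the OpenAI GPT, GPT-2 and Transformer-XL classes listed above.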
@@ -527,6 +584,22 @@ model = GPT2Model.from_pretrained('gpt2') ``` +#### Cache directory + +`pytorch_pretrained_bert` saves the pretrained weights in a cache directory which is located at (in this order of priority): + +- `cache_dir` optional argument to the `from_pretrained()` method (see above), +- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`, +- PyTorch cache home + `/pytorch_pretrained_bert/` + where PyTorch cache home is defined by (in this order): + - shell environment variable `ENV_TORCH_HOME` + - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/` + - default: `~/.cache/torch/` + +Usually, if you don't set any specific environment variable, the `pytorch_pretrained_bert` cache will be at `~/.cache/torch/pytorch_pretrained_bert/`. + +You can always safely delete the `pytorch_pretrained_bert` cache, but the pretrained model weights and vocabulary files will then have to be re-downloaded from our S3. + ### Serialization best-practices This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL). @@ -536,6 +609,15 @@ There are three types of files you need to save to be able to reload a fine-tune - the configuration file of the model which is saved as a JSON file, and - the vocabulary (and the merges for the BPE-based models GPT and GPT-2). +The *default filenames* of these files are as follows: + +- the model weights file: `pytorch_model.bin`, +- the configuration file: `config.json`, +- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary), +- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`. + +**If you save a model using these *default filenames*, you can then re-load the model and tokenizer using the `from_pretrained()` method.** + Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards: ```python @@ -627,6 +709,13 @@ These configuration classes contains a few utilities to load and save configurat `BertModel` is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large). +Instantiation: +The model can be instantiated with the following arguments: + +- `config`: a `BertConfig` class instance with the configuration to build a new model. +- `output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False +- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False + The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: @@ -635,6 +724,7 @@ We detail them here. This model takes as *inputs*: - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
- `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked. This model *outputs* a tuple composed of: @@ -752,6 +842,13 @@ where total_tokens_embeddings can be obtained as config.total_tokens_embeddings `total_tokens_embeddings = config.vocab_size + config.n_special` You should use the associate indices to index the embeddings. +Instantiation: +The model can be instantiated with the following arguments: + +- `config`: a `OpenAIConfig` class instance with the configuration to build a new model. +- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False +- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False + The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: @@ -762,9 +859,10 @@ We detail them here. This model takes as *inputs*: - `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids You can use it to add a third type of embedding to each input token in the sequence (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked. This model *outputs*: -- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) +- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) #### 10. `OpenAIGPTLMHeadModel` @@ -844,6 +942,13 @@ all_hidden_states = lower_hidden_states + [hidden_states] `GPT2Model` is the OpenAI GPT-2 Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks. +Instantiation: +The model can be instantiated with the following arguments: + +- `config`: a `GPT2Config` class instance with the configuration to build a new model. +- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False +- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False + The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: @@ -855,9 +960,10 @@ We detail them here. 
This model takes as *inputs*: You can use it to add a third type of embedding to each input token in the sequence (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. - `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the `presents` output of the model, cf. below). +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked. This model *outputs*: -- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) +- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example). #### 15. `GPT2LMHeadModel` @@ -984,19 +1090,48 @@ The optimizer accepts the following arguments: - `warmup` : portion of `t_total` for the warmup, `-1` means no warmup. Default : `-1` - `t_total` : total number of training steps for the learning rate schedule, `-1` means constant learning rate. Default : `-1` -- `schedule` : schedule to use for the warmup (see above). Default : `'warmup_linear'` +- `schedule` : schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` - `b1` : Adams b1. Default : `0.9` - `b2` : Adams b2. Default : `0.999` - `e` : Adams epsilon. Default : `1e-6` - `weight_decay:` Weight decay. Default : `0.01` - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0` -#### `OpenAIGPTAdam` - -`OpenAIGPTAdam` is similar to `BertAdam`. -The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as in the regular Adam optimizer. - -`OpenAIGPTAdam` accepts the same arguments as `BertAdam`. +#### `OpenAIAdam` + +`OpenAIAdam` is similar to `BertAdam`. +The differences with `BertAdam` is that `OpenAIAdam` compensate for bias as in the regular Adam optimizer. + +`OpenAIAdam` accepts the same arguments as `BertAdam`. + +#### Learning Rate Schedules +The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`. +All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction. +When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`, +the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used. +An overview of the implemented schedules: +- `ConstantLR`: always returns learning rate 1. 
+- `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + ![](docs/imgs/warmup_constant_schedule.png) +- `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + ![](docs/imgs/warmup_linear_schedule.png) +- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + ![](docs/imgs/warmup_cosine_schedule.png) +- `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying learning rate (with hard restarts). + ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png) +- `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + Note that the total number of all warmup steps over all cycles together is equal to `warmup` * `cycles` + ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png) ## Examples @@ -1004,7 +1139,7 @@ The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as i |-|-| | [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models | | [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py`, `run_squad.py` and `run_lm_finetuning.py` | -| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#Fine-tuning-with-OpenAI-GPT-Transformer-XL-and-GPT-2) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` | +| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#openai-gpt-transformer-xl-and-gpt-2-running-the-examples) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` | | [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`| ### Training large models: introduction, tools and examples @@ -1136,6 +1271,46 @@ python run_classifier.py \ --fp16 ``` +**Distributed training** +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC: + +```bash +python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name MRPC --do_train --do_eval --do_lower_case --data_dir $GLUE_DIR/MRPC/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir /tmp/mrpc_output/ +``` + +Training with these hyper-parameters gave us the following results: +```bash + acc = 0.8823529411764706 + acc_and_f1 = 
0.901702786377709 + eval_loss = 0.3418912578906332 + f1 = 0.9210526315789473 + global_step = 174 + loss = 0.07231863956341798 +``` + +Here is an example on MNLI: + +```bash +python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --do_lower_case --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir +``` + +```bash +***** Eval results ***** + acc = 0.8679706601466992 + eval_loss = 0.4911287787382479 + global_step = 18408 + loss = 0.04755385363816904 + +***** Eval results ***** + acc = 0.8747965825874695 + eval_loss = 0.45516540421714036 + global_step = 18408 + loss = 0.04755385363816904 +``` + +This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model + + #### SQuAD This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. @@ -1166,9 +1341,52 @@ python run_squad.py \ Training with the previous hyper-parameters gave us the following results: ```bash +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json {"f1": 88.52381567990474, "exact_match": 81.22043519394512} ``` +**distributed training** + +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 \ + run_squad.py \ + --bert_model bert-large-uncased-whole-word-masking \ + --do_train \ + --do_predict \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir ../models/wwm_uncased_finetuned_squad/ \ + --train_batch_size 24 \ + --gradient_accumulation_steps 12 +``` + +Training with these hyper-parameters gave us the following results: +```bash +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json +{"exact_match": 86.91579943235573, "f1": 93.1532499015869} +``` + +This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`. 
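The fine-tuned checkpoint named above can be re-loaded with `from_pretrained()` like any other shortcut. Here is a minimal sketch (not part of the diff; it assumes the vocabulary is also available under this shortcut name, otherwise point the tokenizer at `bert-large-uncased-whole-word-masking`):

```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForQuestionAnswering

model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained(model_name)
model.eval()

# Toy question/context pair packed as [CLS] question [SEP] context [SEP]
tokens = tokenizer.tokenize("[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]")
first_sep = tokens.index('[SEP]')
segment_ids = [0] * (first_sep + 1) + [1] * (len(tokens) - first_sep - 1)

input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
token_type_ids = torch.tensor([segment_ids])

# start_logits/end_logits score every token as a possible answer start/end.
with torch.no_grad():
    start_logits, end_logits = model(input_ids, token_type_ids)
```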
+ +And here is the model provided as `bert-large-cased-whole-word-masking-finetuned-squad`: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 run_squad.py --bert_model bert-large-cased-whole-word-masking --do_train --do_predict --do_lower_case --train_file $SQUAD_DIR/train-v1.1.json --predict_file $SQUAD_DIR/dev-v1.1.json --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir ../models/wwm_cased_finetuned_squad/ --train_batch_size 24 --gradient_accumulation_steps 12 +``` + +Training with these hyper-parameters gave us the following results: +```bash +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_cased_finetuned_squad/predictions.json +{"exact_match": 84.18164616840113, "f1": 91.58645594850135} +``` + #### SWAG The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf) @@ -1325,6 +1543,42 @@ The results were similar to the above FP32 results (actually slightly higher): {"exact_match": 84.65468306527909, "f1": 91.238669287002} ``` +Here is an example with the recent `bert-large-uncased-whole-word-masking`: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 \ + run_squad.py \ + --bert_model bert-large-uncased-whole-word-masking \ + --do_train \ + --do_predict \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --train_batch_size 24 \ + --gradient_accumulation_steps 2 +``` + +## BERTology + +There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: + +- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341 + +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): + +- accessing all the hidden-states of BERT/GPT/GPT-2, +- accessing all the attention weights for each head of BERT/GPT/GPT-2, +- retrieving heads output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650. + +To help you understand and use these features, we have added a specific example script: [`bertology.py`](./examples/bertology.py), which extracts information from and prunes a model pre-trained on MRPC (a minimal usage sketch is also included below, after the Notebooks section). + ## Notebooks We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
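To round off the BERTology paragraph above, here is a minimal sketch (not part of the diff, mirroring the calls made in `bertology.py`; the toy sentence and the choice of heads are purely illustrative) of instantiating BERT with the new introspection options, masking some attention heads at run time and then pruning them permanently:

```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# output_attentions returns the attention weights of every layer,
# keep_multihead_output stores each head's output (and its gradient).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=2,
                                                      output_attentions=True,
                                                      keep_multihead_output=True)
model.eval()

tokens = tokenizer.tokenize("[CLS] jim henson was a puppeteer [SEP]")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# Soft-mask the first two heads of every layer (1.0 = keep, 0.0 = mask).
config = model.config
head_mask = torch.ones(config.num_hidden_layers, config.num_attention_heads)
head_mask[:, :2] = 0.0

with torch.no_grad():
    all_attentions, logits = model(input_ids, head_mask=head_mask)

# Permanently remove the same heads instead of masking them at run time.
model.bert.prune_heads({layer: [0, 1] for layer in range(config.num_hidden_layers)})
```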
diff --git a/docs/imgs/warmup_constant_schedule.png b/docs/imgs/warmup_constant_schedule.png new file mode 100644 index 00000000000000..e2448e9f2c7999 Binary files /dev/null and b/docs/imgs/warmup_constant_schedule.png differ diff --git a/docs/imgs/warmup_cosine_hard_restarts_schedule.png b/docs/imgs/warmup_cosine_hard_restarts_schedule.png new file mode 100644 index 00000000000000..be73605b9c080c Binary files /dev/null and b/docs/imgs/warmup_cosine_hard_restarts_schedule.png differ diff --git a/docs/imgs/warmup_cosine_schedule.png b/docs/imgs/warmup_cosine_schedule.png new file mode 100644 index 00000000000000..6d27926ab10e9d Binary files /dev/null and b/docs/imgs/warmup_cosine_schedule.png differ diff --git a/docs/imgs/warmup_cosine_warm_restarts_schedule.png b/docs/imgs/warmup_cosine_warm_restarts_schedule.png new file mode 100644 index 00000000000000..71b39bffd3dacc Binary files /dev/null and b/docs/imgs/warmup_cosine_warm_restarts_schedule.png differ diff --git a/docs/imgs/warmup_linear_schedule.png b/docs/imgs/warmup_linear_schedule.png new file mode 100644 index 00000000000000..4e1af31025fafb Binary files /dev/null and b/docs/imgs/warmup_linear_schedule.png differ diff --git a/examples/bertology.py b/examples/bertology.py new file mode 100644 index 00000000000000..4bb23b8f168324 --- /dev/null +++ b/examples/bertology.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +import os +import argparse +import logging +from datetime import timedelta, datetime +from tqdm import tqdm + +import numpy as np + +import torch +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset +from torch.utils.data.distributed import DistributedSampler +from torch.nn import CrossEntropyLoss, MSELoss + +from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer + +from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics + + +logger = logging.getLogger(__name__) + + +def entropy(p): + plogp = p * torch.log(p) + plogp[p == 0] = 0 + return -plogp.sum(dim=-1) + + +def print_1d_tensor(tensor, prefix=""): + if tensor.dtype != torch.long: + logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data)) + else: + logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data)) + + +def print_2d_tensor(tensor): + logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor)))) + for row in range(len(tensor)): + print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t") + + +def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None): + """ Example on how to use model outputs to compute: + - head attention entropy (activated by setting output_attentions=True when we created the model + - head importance scores according to http://arxiv.org/abs/1905.10650 + (activated by setting keep_multihead_output=True when we created the model) + """ + # Prepare our tensors + n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads + head_importance = torch.zeros(n_layers, n_heads).to(args.device) + attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) + preds = None + labels = None + tot_tokens = 0.0 + + for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): + batch = tuple(t.to(args.device) for t in batch) + input_ids, input_mask, segment_ids, label_ids = batch + + # Do a forward pass (not with torch.no_grad() since we need gradients for importance 
score - see below) + all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask) + + if compute_entropy: + # Update head attention entropy + for layer, attn in enumerate(all_attentions): + masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1) + attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach() + + if compute_importance: + # Update head importance scores with regards to our loss + # First, backpropagate to populate the gradients + if args.output_mode == "classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1)) + elif args.output_mode == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), label_ids.view(-1)) + loss.backward() + # Second, compute importance scores according to http://arxiv.org/abs/1905.10650 + multihead_outputs = model.bert.get_multihead_outputs() + for layer, mh_layer_output in enumerate(multihead_outputs): + dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output]) + head_importance[layer] += dot.abs().sum(-1).sum(0).detach() + + # Also store our logits/labels if we want to compute metrics afterwards + if preds is None: + preds = logits.detach().cpu().numpy() + labels = label_ids.detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0) + + tot_tokens += input_mask.float().detach().sum().data + + # Normalize + attn_entropy /= tot_tokens + head_importance /= tot_tokens + # Layerwise importance normalization + if not args.dont_normalize_importance_by_layer: + exponent = 2 + norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent) + head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20 + + if not args.dont_normalize_global_importance: + head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) + + return attn_entropy, head_importance, preds, labels + + +def run_model(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint') + parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.") + parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") + parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory") + + parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', help="Don't normalize importance score by layers") + parser.add_argument("--dont_normalize_global_importance", action='store_true', help="Don't normalize all importance scores between 0 and 1") + + parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.") + parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics" + "(stop masking when metric < threshold * original metric value).") + parser.add_argument("--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.") + parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") + + parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") + + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") + parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + args = parser.parse_args() + + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.wait_for_attach() + + # Setup devices and distributed training + if args.local_rank == -1 or args.no_cuda: + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + args.device = torch.device("cuda", args.local_rank) + n_gpu = 1 + torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend + + # Setup logging + logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1))) + + # Set seeds + np.random.seed(args.seed) + torch.random.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed(args.seed) + + # Prepare GLUE task + task_name = args.task_name.lower() + processor = processors[task_name]() + label_list = processor.get_labels() + args.output_mode = output_modes[task_name] + args.num_labels = len(label_list) + + # Prepare output directory + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) 
and not args.overwrite_output_dir: + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: + os.makedirs(args.output_dir) + + # Load model & tokenizer + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only one distributed process download model & vocab + tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + + # Load a model with all BERTology options on: + # output_attentions => will output attention weights + # keep_multihead_output => will store gradient of attention head outputs for head importance computation + # see: http://arxiv.org/abs/1905.10650 + model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, + num_labels=args.num_labels, + output_attentions=True, + keep_multihead_output=True) + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only one distributed process download model & vocab + model.to(args.device) + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) + model.eval() + + # Prepare dataset for the GLUE task + eval_examples = processor.get_dev_examples(args.data_dir) + cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( + list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name))) + try: + eval_features = torch.load(cached_eval_features_file) + except: + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode) + if args.local_rank in [-1, 0]: + logger.info("Saving eval features to cache file %s", cached_eval_features_file) + torch.save(eval_features, cached_eval_features_file) + + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == "classification" else torch.float) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + + if args.data_subset > 0: + eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data))))) + + eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + + # Print/save training arguments + print(args) + torch.save(args, os.path.join(args.output_dir, 'run_args.bin')) + + # Compute head entropy and importance score + attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader) + + # Print/save matrices + np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy()) + + logger.info("Attention entropies") + print_2d_tensor(attn_entropy) + logger.info("Head importance scores") + print_2d_tensor(head_importance) + logger.info("Head ranked by importance scores") + head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) + head_ranks[head_importance.view(-1).sort(descending=True)[1]] = 
torch.arange(head_importance.numel(), device=args.device) + head_ranks = head_ranks.view_as(head_importance) + print_2d_tensor(head_ranks) + + # Do masking if we want to + if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: + _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + original_score = compute_metrics(task_name, preds, labels)[args.metric_name] + logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) + + new_head_mask = torch.ones_like(head_importance) + num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount)) + + current_score = original_score + while current_score >= original_score * args.masking_threshold: + head_mask = new_head_mask.clone() # save current head mask + # heads from least important to most - keep only not-masked heads + head_importance[head_mask == 0.0] = float('Inf') + current_heads_to_mask = head_importance.view(-1).sort()[1] + + if len(current_heads_to_mask) <= num_to_mask: + break + + # mask heads + current_heads_to_mask = current_heads_to_mask[:num_to_mask] + logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist())) + new_head_mask = new_head_mask.view(-1) + new_head_mask[current_heads_to_mask] = 0.0 + new_head_mask = new_head_mask.view_as(head_mask) + print_2d_tensor(new_head_mask) + + # Compute metric and head importance again + _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + current_score = compute_metrics(task_name, preds, labels)[args.metric_name] + logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) + + logger.info("Final head mask") + print_2d_tensor(head_mask) + np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy()) + + # Try pruning and test time speedup + # Pruning is like masking but we actually remove the masked weights + before_time = datetime.now() + _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, + compute_entropy=False, compute_importance=False, head_mask=head_mask) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + score_masking = compute_metrics(task_name, preds, labels)[args.metric_name] + original_time = datetime.now() - before_time + + original_num_params = sum(p.numel() for p in model.parameters()) + heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask))) + assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item() + model.bert.prune_heads(heads_to_prune) + pruned_num_params = sum(p.numel() for p in model.parameters()) + + before_time = datetime.now() + _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, + compute_entropy=False, compute_importance=False, head_mask=None) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name] + new_time = datetime.now() - before_time + + logger.info("Pruning: original num of params: %.2e, 
after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100) + logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) + logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) + +if __name__ == '__main__': + run_model() diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 5c3051f5009718..2a5783c26116ac 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -1,5 +1,6 @@ from argparse import ArgumentParser from pathlib import Path +import os import torch import logging import json @@ -12,9 +13,10 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm +from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForPreTraining from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") @@ -268,7 +270,8 @@ def main(): optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, @@ -314,8 +317,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() @@ -325,8 +327,13 @@ def main(): # Save a trained model logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = args.output_dir / "pytorch_model.bin" - torch.save(model_to_save.state_dict(), str(output_model_file)) + + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) if __name__ == '__main__': diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index e6c3598a9fecac..8bed1e54d4b474 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -4,11 +4,11 @@ from tempfile import TemporaryDirectory import shelve -from random import random, randrange, randint, shuffle, choice, sample +from random import random, randrange, randint, shuffle, choice from pytorch_pretrained_bert.tokenization import BertTokenizer import numpy as np import json - +import collections class DocumentDatabase: def __init__(self, reduce_memory=False): @@ -98,42 +98,77 @@ def 
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): else: trunc_tokens.pop() +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) -def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): +def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list): """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but with several refactors to clean it up and remove a lot of unnecessary variables.""" cand_indices = [] for (i, token) in enumerate(tokens): if token == "[CLS]" or token == "[SEP]": continue - cand_indices.append(i) + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")): + cand_indices[-1].append(i) + else: + cand_indices.append([i]) num_to_mask = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) shuffle(cand_indices) - mask_indices = sorted(sample(cand_indices, num_to_mask)) - masked_token_labels = [] - for index in mask_indices: - # 80% of the time, replace with [MASK] - if random() < 0.8: - masked_token = "[MASK]" - else: - # 10% of the time, keep original - if random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word + masked_lms = [] + covered_indexes = set() + for index_set in cand_indices: + if len(masked_lms) >= num_to_mask: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_mask: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if random() < 0.8: + masked_token = "[MASK]" else: - masked_token = choice(vocab_list) - masked_token_labels.append(tokens[index]) - # Once we've saved the true label for that token, we can overwrite it with the masked version - tokens[index] = masked_token + # 10% of the time, keep original + if random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = choice(vocab_list) + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + tokens[index] = masked_token + + assert len(masked_lms) <= num_to_mask + masked_lms = sorted(masked_lms, key=lambda x: x.index) + mask_indices = [p.index for p in masked_lms] + masked_token_labels = [p.label for p in masked_lms] return tokens, mask_indices, masked_token_labels def create_instances_from_document( doc_database, doc_idx, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_list): + masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list): """This code is mostly a duplicate of the equivalent function from Google BERT's repo. However, we make some changes and improvements. 
Sampling is improved and no longer requires a loop in this function. Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence @@ -213,7 +248,7 @@ def create_instances_from_document( segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) + tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list) instance = { "tokens": tokens, @@ -235,9 +270,10 @@ def main(): parser.add_argument("--output_dir", type=Path, required=True) parser.add_argument("--bert_model", type=str, required=True, choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", - "bert-base-multilingual", "bert-base-chinese"]) + "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"]) parser.add_argument("--do_lower_case", action="store_true") - + parser.add_argument("--do_whole_word_mask", action="store_true", + help="Whether to use whole word masking rather than per-WordPiece masking.") parser.add_argument("--reduce_memory", action="store_true", help="Reduce memory usage for large datasets by keeping data on disc rather than in memory") @@ -284,7 +320,7 @@ def main(): doc_instances = create_instances_from_document( docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, - vocab_list=vocab_list) + whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list) doc_instances = [json.dumps(instance) for instance in doc_instances] for instance in doc_instances: epoch_file.write(instance + '\n') diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 0f8547333082fa..368d6825c73c6c 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -29,9 +29,10 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange +from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForPreTraining from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', @@ -534,34 +535,37 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in 
no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: @@ -601,7 +605,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() @@ -611,9 +615,12 @@ def main(): # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) def _truncate_seq_pair(tokens_a, tokens_b, max_length): diff --git a/examples/run_classifier.py b/examples/run_classifier.py index b90ac494e4fe37..5a359ad2622712 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -18,547 +18,36 @@ from __future__ import absolute_import, division, print_function import argparse -import csv import logging import os -import random import sys +import random +from tqdm import tqdm, trange import numpy as np + import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange - from torch.nn import CrossEntropyLoss, MSELoss -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME -from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig +from tensorboardX 
import SummaryWriter + +from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForSequenceClassification from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule -logger = logging.getLogger(__name__) +from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def 
get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - 
examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - 
text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, output_mode): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = {label : i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - segment_ids = [0] * len(tokens) - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - segment_ids += [1] * (len(tokens_b) + 1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - input_mask += padding - segment_ids += padding - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - logger.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def simple_accuracy(preds, labels): - return (preds == labels).mean() - - -def acc_and_f1(preds, labels): - acc = simple_accuracy(preds, labels) - f1 = f1_score(y_true=labels, y_pred=preds) - return { - "acc": acc, - "f1": f1, - "acc_and_f1": (acc + f1) / 2, - } - - -def pearson_and_spearman(preds, labels): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2, - } - - -def compute_metrics(task_name, preds, labels): - assert len(preds) == len(labels) - if task_name == "cola": - return {"mcc": matthews_corrcoef(labels, preds)} - elif task_name == "sst-2": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mrpc": - return acc_and_f1(preds, labels) - elif task_name == "sts-b": - return pearson_and_spearman(preds, labels) - elif task_name == "qqp": - return acc_and_f1(preds, labels) - elif task_name == "mnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mnli-mm": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "qnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "rte": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "wnli": - return {"acc": simple_accuracy(preds, labels)} - else: - raise KeyError(task_name) +logger = logging.getLogger(__name__) def main(): @@ -629,6 +118,9 @@ def main(): parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") + parser.add_argument('--overwrite_output_dir', + action='store_true', + help="Overwrite the content of the output directory") parser.add_argument("--local_rank", type=int, default=-1, @@ -660,31 +152,6 @@ def main(): ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 
ptvsd.wait_for_attach() - processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, - } - - output_modes = { - "cola": "classification", - "mnli": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", - } - if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() @@ -694,6 +161,7 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') + args.device = device logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', @@ -717,9 +185,9 @@ def main(): if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir): + if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) task_name = args.task_name.lower() @@ -733,74 +201,49 @@ def main(): label_list = processor.get_labels() num_labels = len(label_list) + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) + if args.local_rank == 0: + torch.distributed.barrier() - train_examples = None - num_train_optimization_steps = None - if args.do_train: - train_examples = processor.get_train_examples(args.data_dir) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - - # Prepare model - cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) - model = BertForSequenceClassification.from_pretrained(args.bert_model, - cache_dir=cache_dir, - num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - 
optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - global_step = 0 nb_tr_steps = 0 tr_loss = 0 + if args.do_train: - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode) - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter() + + # Prepare data loader + train_examples = processor.get_train_examples(args.data_dir) + cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format( + list(filter(None, args.bert_model.split('/'))).pop(), + str(args.max_seq_length), + str(task_name))) + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) + except: + train_features = convert_examples_to_features( + train_examples, label_list, args.max_seq_length, tokenizer, output_mode) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving train features into cached file %s", cached_train_features_file) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) @@ -817,16 +260,55 @@ def main(): train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + + # Prepare optimizer + + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + 
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch"): + for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes - logits = model(input_ids, segment_ids, input_mask, labels=None) + logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) if output_mode == "classification": loss_fct = CrossEntropyLoss() @@ -852,13 +334,18 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 + if args.local_rank in [-1, 0]: + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) + ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() + ### Example: if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self @@ -874,14 +361,34 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # Good practice: save your training arguments together with the trained model + output_args_file = os.path.join(args.output_dir, 'training_args.bin') + torch.save(args, output_args_file) else: model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) + model.to(device) + ### Evaluation if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( + list(filter(None, args.bert_model.split('/'))).pop(), + str(args.max_seq_length), + str(task_name))) + try: + with 
open(cached_eval_features_file, "rb") as reader: + eval_features = pickle.load(reader) + except: + eval_features = convert_examples_to_features( + eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving eval features into cached file %s", cached_eval_features_file) + with open(cached_eval_features_file, "wb") as writer: + pickle.dump(eval_features, writer) + + logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) @@ -896,13 +403,17 @@ def main(): eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) + if args.local_rank == -1: + eval_sampler = SequentialSampler(eval_data) + else: + eval_sampler = DistributedSampler(eval_data) # Note that this sampler samples randomly eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] + out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -911,7 +422,7 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, segment_ids, input_mask, labels=None) + logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) # create eval loss and other metric required by the task if output_mode == "classification": @@ -920,14 +431,17 @@ def main(): elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) - + eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) + out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append( + out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] @@ -935,8 +449,9 @@ def main(): preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, all_label_ids.numpy()) - loss = tr_loss/nb_tr_steps if args.do_train else None + result = compute_metrics(task_name, preds, out_label_ids) + + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step @@ -979,6 +494,7 @@ def main(): eval_loss = 0 nb_eval_steps = 0 preds = [] + out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -987,24 +503,28 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, segment_ids, input_mask, labels=None) - + logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) + loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) - + eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) + out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append( + out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = 
eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) - result = compute_metrics(task_name, preds, all_label_ids.numpy()) - loss = tr_loss/nb_tr_steps if args.do_train else None + result = compute_metrics(task_name, preds, out_label_ids) + + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step diff --git a/examples/run_classifier_dataset_utils.py b/examples/run_classifier_dataset_utils.py new file mode 100644 index 00000000000000..4924afaea25192 --- /dev/null +++ b/examples/run_classifier_dataset_utils.py @@ -0,0 +1,571 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT classification fine-tuning: utilities to work with GLUE tasks """ + +from __future__ import absolute_import, division, print_function + +import csv +import logging +import os +import sys + +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score + +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), + "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See 
base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), 
"train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label : i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . 
[SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} + +output_modes = { + "cola": "classification", + "mnli": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 9e9e2b255bafef..8f8208bbcd54cd 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -107,25 +107,23 @@ def run_model(): print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) - if args.unconditional: - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=None, - start_token=enc.encoder['<|endoftext|>'], - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:,1:].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) - if args.unconditional: - break + else: + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=None, + start_token=enc.encoder['<|endoftext|>'], + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:,1:].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) if __name__ == '__main__': run_model() diff --git a/examples/run_openai_gpt.py 
b/examples/run_openai_gpt.py index cb5aa8d9cbdd3b..ac5c4744916a67 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -83,8 +83,8 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d input_ids[i, 1, :len(with_cont2)] = with_cont2 mc_token_ids[i, 0] = len(with_cont1) - 1 mc_token_ids[i, 1] = len(with_cont2) - 1 - lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:] - lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:] + lm_labels[i, 0, :len(with_cont1)] = with_cont1 + lm_labels[i, 1, :len(with_cont2)] = with_cont2 mc_labels[i] = mc_label all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels) tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs)) @@ -183,19 +183,20 @@ def tokenize_and_encode(obj): eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size - optimizer = OpenAIAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - max_grad_norm=args.max_grad_norm, - weight_decay=args.weight_decay, - t_total=num_train_optimization_steps) + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs + optimizer = OpenAIAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + max_grad_norm=args.max_grad_norm, + weight_decay=args.weight_decay, + t_total=num_train_optimization_steps) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None diff --git a/examples/run_squad.py b/examples/run_squad.py index 410fd8529880ed..bf1763e884a463 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -18,10 +18,7 @@ from __future__ import absolute_import, division, print_function import argparse -import collections -import json import logging -import math import os import random import sys @@ -34,12 +31,14 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME -from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear -from pytorch_pretrained_bert.tokenization import (BasicTokenizer, - BertTokenizer, - whitespace_tokenize) +from tensorboardX import SummaryWriter + +from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForQuestionAnswering +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule +from pytorch_pretrained_bert.tokenization import BertTokenizer + +from run_squad_dataset_utils import read_squad_examples, 
convert_examples_to_features, RawResult, write_predictions if sys.version_info[0] == 2: import cPickle as pickle @@ -49,717 +48,6 @@ logger = logging.getLogger(__name__) -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. - """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. 
If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. 
- _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - start_position = None - end_position = None - if is_training and not example.is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. 
- doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if is_training and example.is_impossible: - start_position = 0 - end_position = 0 - if example_index < 20: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if is_training and example.is_impossible: - logger.info("impossible example") - if is_training and not example.is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - logger.info("start_position: %d" % (start_position)) - logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible)) - unique_id += 1 - - return features - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. 
- tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - - -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): - """Write final predictions to the json file and log-odds of null if needed.""" - logger.info("Writing predictions to: %s" % (output_prediction_file)) - logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min null score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, 
n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant - if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - # if we didn't include the empty option in the n-best, include it - if version_2_with_negative: - if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. 
- tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - def main(): parser = argparse.ArgumentParser() @@ -823,6 +111,9 @@ def main(): parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--overwrite_output_dir', + action='store_true', + help="Overwrite the content of the output directory") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" @@ -887,79 +178,37 @@ def main(): raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - train_examples = None - num_train_optimization_steps = None - if args.do_train: - train_examples = read_squad_examples( - input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - - # Prepare model - model = BertForQuestionAnswering.from_pretrained(args.bert_model, - cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) + model = BertForQuestionAnswering.from_pretrained(args.bert_model) + if args.local_rank == 0: + torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - global_step = 0 if args.do_train: + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter() + # Prepare data loader + train_examples = read_squad_examples( + input_file=args.train_file, 
is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) - train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) @@ -975,11 +224,7 @@ def main(): logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) - logger.info("***** Running training *****") - logger.info(" Num orig examples = %d", len(train_examples)) - logger.info(" Num split examples = %d", len(train_features)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) @@ -991,10 +236,58 @@ def main(): train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + # if args.local_rank != -1: + # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + global_step = 0 + + logger.info("***** Running training *****") + logger.info(" Num orig examples = %d", len(train_examples)) + logger.info(" Num split examples = %d", len(train_features)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch"): + for epoch in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", 
disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self @@ -1013,12 +306,15 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 + if args.local_rank in [-1, 0]: + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer @@ -1035,6 +331,10 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # Good practice: save your training arguments together with the trained model + output_args_file = os.path.join(args.output_dir, 'training_args.bin') + torch.save(args, output_args_file) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) diff --git a/examples/run_squad_dataset_utils.py b/examples/run_squad_dataset_utils.py new file mode 100644 index 00000000000000..4043ee57f81401 --- /dev/null +++ b/examples/run_squad_dataset_utils.py @@ -0,0 +1,740 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Load SQuAD dataset. """ + +from __future__ import absolute_import, division, print_function + +import json +import logging +import math +import collections +from io import open + +from pytorch_pretrained_bert.tokenization import BasicTokenizer, whitespace_tokenize + +logger = logging.getLogger(__name__) + + +class SquadExample(object): + """ + A single training/test example for the Squad dataset. + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (self.qas_id) + s += ", question_text: %s" % ( + self.question_text) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.end_position: + s += ", end_position: %d" % (self.end_position) + if self.is_impossible: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training, version_2_with_negative): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. 
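For illustration, here is a minimal sketch of the recoverability check described in the comment above, run on a toy paragraph; `str.split()` and `c.isspace()` stand in for `whitespace_tokenize` and the stricter whitespace test used in this file, and all strings are invented.

```python
# Toy paragraph and answer span (invented for the sketch).
paragraph_text = "Jim Henson was a puppeteer born in 1936."
answer_text = "a puppeteer"
answer_offset = paragraph_text.find(answer_text)          # character offset, as in SQuAD

# Whitespace-tokenize and remember which word each character falls into
# (same bookkeeping as char_to_word_offset above).
doc_tokens, char_to_word_offset = [], []
prev_is_whitespace = True
for c in paragraph_text:
    if c.isspace():
        prev_is_whitespace = True
    else:
        if prev_is_whitespace:
            doc_tokens.append(c)
        else:
            doc_tokens[-1] += c
        prev_is_whitespace = False
    char_to_word_offset.append(len(doc_tokens) - 1)

start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + len(answer_text) - 1]

# Keep the example only if the answer is exactly recoverable from the document.
actual_text = " ".join(doc_tokens[start_position:end_position + 1])
cleaned_answer_text = " ".join(answer_text.split())
assert actual_text.find(cleaned_answer_text) != -1        # "a puppeteer" is recovered
```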
+ actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
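As a worked example of the sliding-window chunking just described, the sketch below builds the same `_DocSpan` windows over a hypothetical 10-token document with `max_tokens_for_doc=6` and `doc_stride=3` (numbers chosen only for illustration).

```python
import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

all_doc_tokens = ["piece%d" % i for i in range(10)]   # toy word-piece sequence
max_tokens_for_doc, doc_stride = 6, 3                 # invented sizes for the sketch

doc_spans, start_offset = [], 0
while start_offset < len(all_doc_tokens):
    length = min(len(all_doc_tokens) - start_offset, max_tokens_for_doc)
    doc_spans.append(_DocSpan(start=start_offset, length=length))
    if start_offset + length == len(all_doc_tokens):
        break
    start_offset += min(length, doc_stride)

# Three overlapping windows: DocSpan(start=0, length=6), (3, 6), (6, 4)
print(doc_spans)
```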
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
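As a hedged illustration of the position bookkeeping that follows, the sketch below uses made-up numbers: a 5-token question, a window covering word pieces 40..167, and a gold answer at word pieces 50..52. When the answer lies inside the window its positions are shifted past the `[CLS]`/question/`[SEP]` prefix; otherwise both labels fall back to position 0.

```python
query_tokens_len = 5                           # hypothetical question length
doc_start, doc_end = 40, 167                   # doc_span.start, doc_span.start + length - 1
tok_start_position, tok_end_position = 50, 52  # gold answer in word-piece space

if not (doc_start <= tok_start_position and tok_end_position <= doc_end):
    # Answer falls outside this chunk: both labels point at [CLS] (index 0).
    start_position = end_position = 0
else:
    # Shift into the chunk, past "[CLS] <question tokens> [SEP]".
    doc_offset = query_tokens_len + 2
    start_position = tok_start_position - doc_start + doc_offset
    end_position = tok_end_position - doc_start + doc_offset

assert (start_position, end_position) == (17, 19)
```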
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + if example_index < 20: + logger.info("*** Example ***") + logger.info("unique_id: %s" % (unique_id)) + logger.info("example_index: %s" % (example_index)) + logger.info("doc_span_index: %s" % (doc_span_index)) + logger.info("tokens: %s" % " ".join(tokens)) + logger.info("token_to_orig_map: %s" % " ".join([ + "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + logger.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + ])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + logger.info("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + logger.info("start_position: %d" % (start_position)) + logger.info("end_position: %d" % (end_position)) + logger.info( + "answer: %s" % (answer_text)) + + features.append( + InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible)) + unique_id += 1 + + return features + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. 
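For the "(1895-1943)." case mentioned above, here is a minimal sketch of how `_improve_answer_span` tightens the span; it assumes the function is importable from this module, and a tiny regex tokenizer stands in for the real WordPiece/BasicTokenizer.

```python
import re

class _StubTokenizer(object):
    """Stand-in that splits punctuation the way the real tokenizer would."""
    def tokenize(self, text):
        return re.findall(r"\w+|[^\w\s]", text.lower())

# Sub-token sequence for "The leader was John Smith (1895-1943)."
doc_tokens = ["the", "leader", "was", "john", "smith", "(", "1895", "-", "1943", ")", "."]
# Whitespace tokenization left the answer "1895" inside "(1895-1943).",
# i.e. the initial span covers sub-tokens 5..10.
new_start, new_end = _improve_answer_span(doc_tokens, 5, 10, _StubTokenizer(), "1895")
assert (new_start, new_end) == (6, 6)    # tightened to just the year
```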
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, verbose_logging, + version_2_with_negative, null_score_diff_threshold): + """Write final predictions to the json file and log-odds of null if needed.""" + logger.info("Writing predictions to: %s" % (output_prediction_file)) + logger.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, 
n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest)==1: + nbest.insert(0, + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
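For SQuAD 2.0, the n-best list assembled here feeds the null-versus-span decision made a few lines below; a minimal sketch with invented logits:

```python
score_null = -1.3                              # start_logits[0] + end_logits[0] of the best null slice
best_start_logit, best_end_logit = 2.1, 1.7    # best non-null entry (invented numbers)
null_score_diff_threshold = 0.0

# Predict "" iff the null score beats the best span by more than the threshold.
score_diff = score_null - best_start_logit - best_end_logit   # -5.1
prediction = "" if score_diff > null_score_diff_threshold else "jim henson"
assert prediction == "jim henson"              # the span answer wins here
```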
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. 
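A short sketch of this heuristic on the "Steve Smith" example from the comment, assuming `get_final_text` is importable from this module:

```python
pred_text = "Steve Smith"     # de-tokenized WordPiece prediction
orig_text = "Steve Smith's"   # original whitespace-tokenized span

# BasicTokenizer splits the possessive into "Steve Smith ' s"; stripping spaces
# on both sides yields equal-length strings, so the character alignment succeeds
# and the extra "'s" is dropped while the original casing is preserved.
assert get_final_text(pred_text, orig_text, do_lower_case=False) == "Steve Smith"
```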
+ tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs diff --git a/examples/run_swag.py b/examples/run_swag.py index a6cfdbe311d578..28fd323c7373aa 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -34,7 +34,7 @@ from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule from pytorch_pretrained_bert.tokenization import BertTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -358,15 +358,6 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None - if args.do_train: - train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * 
args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), @@ -384,47 +375,13 @@ def main(): elif n_gpu > 1: model = torch.nn.DataParallel(model) - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.do_train: - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + # Prepare data loader - global_step = 0 - if args.do_train: + train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) @@ -436,6 +393,53 @@ def main(): train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + if args.local_rank != -1: + num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() + + # Prepare optimizer + + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise 
ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + global_step = 0 + + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 @@ -464,7 +468,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() @@ -537,7 +541,7 @@ def main(): result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, - 'loss': tr_loss/nb_tr_steps} + 'loss': tr_loss/global_step} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: diff --git a/hubconf.py b/hubconf.py new file mode 100644 index 00000000000000..f8336207802f3b --- /dev/null +++ b/hubconf.py @@ -0,0 +1,30 @@ +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] + +from hubconfs.bert_hubconf import ( + bertTokenizer, + bertModel, + bertForNextSentencePrediction, + bertForPreTraining, + bertForMaskedLM, + bertForSequenceClassification, + bertForMultipleChoice, + bertForQuestionAnswering, + bertForTokenClassification +) +from hubconfs.gpt_hubconf import ( + openAIGPTTokenizer, + openAIGPTModel, + openAIGPTLMHeadModel, + openAIGPTDoubleHeadsModel +) +from hubconfs.gpt2_hubconf import ( + gpt2Tokenizer, + gpt2Model, + gpt2LMHeadModel, + gpt2DoubleHeadsModel +) +from hubconfs.transformer_xl_hubconf import ( + transformerXLTokenizer, + transformerXLModel, + transformerXLLMHeadModel +) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py new file mode 100644 index 00000000000000..3769c2567f51cc --- /dev/null +++ b/hubconfs/bert_hubconf.py @@ -0,0 +1,360 @@ +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import ( + BertModel, + BertForNextSentencePrediction, + BertForMaskedLM, + BertForMultipleChoice, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) + +# A lot of models share the same param doc. Use a decorator +# to save typing +bert_docstring = """ + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + . 
`bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining + instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow + checkpoint + cache_dir: an optional path to a folder in which the pre-trained models + will be cached. + state_dict: an optional state dictionnary + (collections.OrderedDict object) to use instead of Google + pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def bertTokenizer(*args, **kwargs): + """ + Instantiate a BertTokenizer from a pre-trained/customized vocab file + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * bert-base-uncased + * bert-large-uncased + * bert-base-cased + * bert-large-cased + * bert-base-multilingual-uncased + * bert-base-multilingual-cased + * bert-base-chinese + Keyword args: + cache_dir: an optional path to a specific directory to download and cache + the pre-trained model weights. + Default: None + do_lower_case: Whether to lower case the input. + Only has an effect when do_wordpiece_only=False + Default: True + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + Default: True + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] + + Example: + >>> import torch + >>> sentence = 'Hello, World!' + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + >>> toks = tokenizer.tokenize(sentence) + ['Hello', '##,', 'World', '##!'] + >>> ids = tokenizer.convert_tokens_to_ids(toks) + [8667, 28136, 1291, 28125] + """ + tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(bert_docstring) +def bertModel(*args, **kwargs): + """ + BertModel is the basic BERT Transformer model with a layer of summed token, + position and sequence embeddings followed by a series of identical + self-attention blocks (12 for BERT-base, 24 for BERT-large). + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased') + >>> model.eval() + # Predict hidden states features for each layer + >>> with torch.no_grad(): + encoded_layers, _ = model(tokens_tensor, segments_tensors) + """ + model = BertModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForNextSentencePrediction(*args, **kwargs): + """ + BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence + classification head. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForNextSentencePrediction + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased') + >>> model.eval() + # Predict the next sentence classification logits + >>> with torch.no_grad(): + next_sent_classif_logits = model(tokens_tensor, segments_tensors) + """ + model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForPreTraining(*args, **kwargs): + """ + BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads + - the masked language modeling head, and + - the next sentence classification head. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForPreTraining + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased') + >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors) + """ + model = BertForPreTraining.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForMaskedLM(*args, **kwargs): + """ + BertForMaskedLM includes the BertModel Transformer followed by the + (possibly) pre-trained masked language modeling head. 
+ + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> masked_index = 8 + >>> tokenized_text[masked_index] = '[MASK]' + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForMaskedLM + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased') + >>> model.eval() + # Predict all tokens + >>> with torch.no_grad(): + predictions = model(tokens_tensor, segments_tensors) + >>> predicted_index = torch.argmax(predictions[0, masked_index]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + 'henson' + """ + model = BertForMaskedLM.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForSequenceClassification(*args, **kwargs): + """ + BertForSequenceClassification is a fine-tuning model that includes + BertModel and a sequence-level (sequence or pair of sequences) classifier + on top of the BertModel. Note that the classification head is only initialized + and has to be trained. + + The sequence-level classifier is a linear layer that takes as input the + last hidden state of the first character in the input sequence + (see Figures 3a and 3b in the BERT paper). + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForSequenceClassification + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2) + >>> model.eval() + # Predict the sequence classification logits + >>> with torch.no_grad(): + seq_classif_logits = model(tokens_tensor, segments_tensors) + # Or get the sequence classification loss + >>> labels = torch.tensor([1]) + >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss + """ + model = BertForSequenceClassification.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForMultipleChoice(*args, **kwargs): + """ + BertForMultipleChoice is a fine-tuning model that includes BertModel and a + linear layer on top of the BertModel. Note that the multiple choice head is + only initialized and has to be trained. + + Args: + num_choices: the number (>=2) of classes for the classifier. 
+ + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0) + >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0) + # Load bertForMultipleChoice + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2) + >>> model.eval() + # Predict the multiple choice logits + >>> with torch.no_grad(): + multiple_choice_logits = model(tokens_tensor, segments_tensors) + # Or get the multiple choice loss + >>> labels = torch.tensor([1]) + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss + """ + model = BertForMultipleChoice.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForQuestionAnswering(*args, **kwargs): + """ + BertForQuestionAnswering is a fine-tuning model that includes BertModel + with a token-level classifiers on top of the full sequence of last hidden + states. Note that the classification head is only initialized + and has to be trained. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForQuestionAnswering + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased') + >>> model.eval() + # Predict the start and end positions logits + >>> with torch.no_grad(): + start_logits, end_logits = model(tokens_tensor, segments_tensors) + # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions + >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) + # set model.train() before if training this loss + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) + """ + model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForTokenClassification(*args, **kwargs): + """ + BertForTokenClassification is a fine-tuning model that includes BertModel + and a token-level classifier on top of the BertModel. Note that the classification + head is only initialized and has to be trained. + + The token-level classifier is a linear layer that takes as input the last + hidden state of the sequence. + + Args: + num_labels: the number (>=2) of classes for the classifier. 
+ + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForTokenClassification + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2) + >>> model.eval() + # Predict the token classification logits + >>> with torch.no_grad(): + classif_logits = model(tokens_tensor, segments_tensors) + # Or get the token classification loss + >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]]) + >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss + """ + model = BertForTokenClassification.from_pretrained(*args, **kwargs) + return model diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py new file mode 100644 index 00000000000000..3ac8bc72ab57fc --- /dev/null +++ b/hubconfs/gpt2_hubconf.py @@ -0,0 +1,168 @@ +from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer +from pytorch_pretrained_bert.modeling_gpt2 import ( + GPT2Model, + GPT2LMHeadModel, + GPT2DoubleHeadsModel +) + +# A lot of models share the same param doc. Use a decorator +# to save typing +gpt2_docstring = """ + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `gpt2`, `gpt2-medium` + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . a TensorFlow checkpoint with trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific GPT-2 class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def gpt2Tokenizer(*args, **kwargs): + """ + Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file. + Peculiarities: + - Byte-level BPE + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * gpt2 + Keyword args: + special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) + Default: None + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + >>> text = "Who was Jim Henson ?" 
+ >>> indexed_tokens = tokenizer.encode(text) + """ + tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(gpt2_docstring) +def gpt2Model(*args, **kwargs): + """ + gpt2Model is the basic OpenAI GPT-2 Transformer model based on + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" + >>> text_2 = "Jim Henson was a puppeteer" + >>> indexed_tokens_1 = tokenizer.encode(text_1) + >>> indexed_tokens_2 = tokenizer.encode(text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load gpt2Model + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2') + >>> model.eval() + + # Predict hidden states features for each layer + # past can be used to reuse precomputed hidden state in subsequent predictions + >>> with torch.no_grad(): + hidden_states_1, past = model(tokens_tensor_1) + hidden_states_2, past = model(tokens_tensor_2, past=past) + """ + model = GPT2Model.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt2_docstring) +def gpt2LMHeadModel(*args, **kwargs): + """ + gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" + >>> text_2 = "Jim Henson was a puppeteer" + >>> indexed_tokens_1 = tokenizer.encode(text_1) + >>> indexed_tokens_2 = tokenizer.encode(text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load gpt2LMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2') + >>> model.eval() + + # Predict hidden states features for each layer + # past can be used to reuse precomputed hidden state in subsequent predictions + >>> with torch.no_grad(): + predictions_1, past = model(tokens_tensor_1) + predictions_2, past = model(tokens_tensor_2, past=past) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() + >>> predicted_token = tokenizer.decode([predicted_index]) + >>> assert predicted_token == ' who' + """ + model = GPT2LMHeadModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt2_docstring) +def gpt2DoubleHeadsModel(*args, **kwargs): + """ + gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the + tied (pre-trained) language modeling head and a multiple choice + classification head (only initialized, not pre-trained). + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + # Prepare tokenized input + >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> text2 = "Who was Jim Henson ? 

Jim Henson was a mysterious young man" + >>> tokenized_text1 = tokenizer.tokenize(text1) + >>> tokenized_text2 = tokenizer.tokenize(text2) + >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) + >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) + >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) + >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + + # Load gpt2DoubleHeadsModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids) + """ + model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs) + return model diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py new file mode 100644 index 00000000000000..f3d03888aec137 --- /dev/null +++ b/hubconfs/gpt_hubconf.py @@ -0,0 +1,186 @@ +from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer +from pytorch_pretrained_bert.modeling_openai import ( + OpenAIGPTModel, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel +) + +# Dependecies that are not specified in global hubconf.py +specific_dependencies = ['spacy', 'ftfy'] + +# A lot of models share the same param doc. Use a decorator +# to save typing +gpt_docstring = """ + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `openai-gpt` + - a path or url to a pretrained model archive containing: + . `openai_gpt_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance + - a path or url to a pretrained model archive containing: + . `openai-gpt-config.json` a configuration file for the model + . a series of NumPy files containing OpenAI TensorFlow trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) + to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific OpenAI-GPT class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def openAIGPTTokenizer(*args, **kwargs): + """ + Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file. 
+ Peculiarities: + - lower case all inputs + - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + - argument special_tokens and function set_special_tokens: + can be used to add additional symbols (ex: "__classify__") to a vocabulary. + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * openai-gpt + Keyword args: + special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) + Default: None + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483] + """ + tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTModel(*args, **kwargs): + """ + OpenAIGPTModel is the basic OpenAI GPT Transformer model based on + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + + # Load openAIGPTModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + hidden_states = model(tokens_tensor) + """ + model = OpenAIGPTModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTLMHeadModel(*args, **kwargs): + """ + OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + + # Load openAIGPTLMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + predictions = model(tokens_tensor) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions[0, -1, :]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + '.' 
+ """ + model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTDoubleHeadsModel(*args, **kwargs): + """ + OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the + tied (pre-trained) language modeling head and a multiple choice + classification head (only initialized, not pre-trained). + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" + >>> tokenized_text1 = tokenizer.tokenize(text1) + >>> tokenized_text2 = tokenizer.tokenize(text2) + >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) + >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) + >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) + >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + + # Load openAIGPTDoubleHeadsModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) + """ + model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs) + return model diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py new file mode 100644 index 00000000000000..d5c697547e7e6e --- /dev/null +++ b/hubconfs/transformer_xl_hubconf.py @@ -0,0 +1,130 @@ +from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer +from pytorch_pretrained_bert.modeling_transfo_xl import ( + TransfoXLModel, + TransfoXLLMHeadModel +) + +# A lot of models share the same param doc. Use a decorator +# to save typing +transformer_xl_docstring = """ + Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: + - you don't need to specify positioning embeddings indices + - the tokens in the vocabulary have to be sorted to decreasing frequency. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `transfo-xl-wt103` + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
+ state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific TransformerXL class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def transformerXLTokenizer(*args, **kwargs): + """ + Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * transfo-xl-wt103 + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') + + >>> text = "Who was Jim Henson ?" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + """ + tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(transformer_xl_docstring) +def transformerXLModel(*args, **kwargs): + """ + transformerXLModel is the basic Transformer XL model. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" + >>> text_2 = "Jim Henson was a puppeteer" + >>> tokenized_text_1 = tokenizer.tokenize(text_1) + >>> tokenized_text_2 = tokenizer.tokenize(text_2) + >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) + >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load transformerXLModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103') + >>> model.eval() + + # Predict hidden states features for each layer + # We can re-use the memory cells in a subsequent call to attend a longer context + >>> with torch.no_grad(): + hidden_states_1, mems_1 = model(tokens_tensor_1) + hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) + """ + model = TransfoXLModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(transformer_xl_docstring) +def transformerXLLMHeadModel(*args, **kwargs): + """ + transformerXLLMHeadModel is the Transformer XL model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" 

+ >>> text_2 = "Jim Henson was a puppeteer" + >>> tokenized_text_1 = tokenizer.tokenize(text_1) + >>> tokenized_text_2 = tokenizer.tokenize(text_2) + >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) + >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load transformerXLLMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103') + >>> model.eval() + + # Predict hidden states features for each layer + # We can re-use the memory cells in a subsequent call to attend a longer context + >>> with torch.no_grad(): + predictions_1, mems_1 = model(tokens_tensor_1) + predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + >>> assert predicted_token == 'who' + """ + model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs) + return model diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index 28d215d8bdfc5b..508988322ba298 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.1" +__version__ = "0.6.2" from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) @@ -15,7 +15,7 @@ from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl) from .modeling_gpt2 import (GPT2Config, GPT2Model, - GPT2LMHeadModel, GPT2DoubleHeadsModel, + GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead, load_tf_weights_in_gpt2) from .optimization import BertAdam diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 17bdd258eaecd8..605c8412353e34 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -22,6 +22,15 @@ from botocore.exceptions import ClientError from tqdm import tqdm +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') + try: from urllib.parse import urlparse except ImportError: @@ -29,11 +38,11 @@ try: from pathlib import Path - PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - Path.home() / '.pytorch_pretrained_bert')) + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) except (AttributeError, ImportError): PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) + default_cache_path) CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 55c6d9d611629e..c0ab2dd681b487 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -22,9 +22,6 @@ import logging import math import os -import shutil -import tarfile -import tempfile 
import sys from io import open @@ -37,17 +34,63 @@ logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", +} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", } BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' +def prune_linear_layer(layer, index, dim=0): + """ Prune a linear layer (a model parameters) to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + def load_tf_weights_in_bert(model, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model """ @@ -145,7 +188,8 @@ def __init__(self, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, - initializer_range=0.02): + initializer_range=0.02, + layer_norm_eps=1e-12): """Constructs BertConfig. Args: @@ -169,6 +213,7 @@ def __init__(self, `BertModel`. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. 
""" if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -188,6 +233,7 @@ def __init__(self, self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps else: raise ValueError("First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)") @@ -254,7 +300,7 @@ def __init__(self, config): # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): @@ -275,12 +321,16 @@ def forward(self, input_ids, token_type_ids=None): class BertSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -296,7 +346,7 @@ def transpose_for_scores(self, x): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - def forward(self, hidden_states, attention_mask): + def forward(self, hidden_states, attention_mask, head_mask=None): mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) @@ -318,10 +368,20 @@ def forward(self, hidden_states, attention_mask): # seem a bit unusual, but is taken from the original Transformer paper. 
attention_probs = self.dropout(attention_probs) + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + context_layer = torch.matmul(attention_probs, value_layer) + if self.keep_multihead_output: + self.multihead_output = context_layer + self.multihead_output.retain_grad() + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) + if self.output_attentions: + return attention_probs, context_layer return context_layer @@ -329,7 +389,7 @@ class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -340,14 +400,37 @@ def forward(self, hidden_states, input_tensor): class BertAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) + self.output_attentions = output_attentions + self.self = BertSelfAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.output = BertSelfOutput(config) - def forward(self, input_tensor, attention_mask): - self_output = self.self(input_tensor, attention_mask) + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # Update hyper params + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + + def forward(self, input_tensor, attention_mask, head_mask=None): + self_output = self.self(input_tensor, attention_mask, head_mask) + if self.output_attentions: + attentions, self_output = self_output attention_output = self.output(self_output, input_tensor) + if self.output_attentions: + return attentions, attention_output return attention_output @@ -370,7 +453,7 @@ class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -381,33 +464,47 @@ def forward(self, hidden_states, input_tensor): class BertLayer(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertLayer, self).__init__() - self.attention = BertAttention(config) + self.output_attentions = output_attentions + self.attention = BertAttention(config, 
output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) - def forward(self, hidden_states, attention_mask): - attention_output = self.attention(hidden_states, attention_mask) + def forward(self, hidden_states, attention_mask, head_mask=None): + attention_output = self.attention(hidden_states, attention_mask, head_mask) + if self.output_attentions: + attentions, attention_output = attention_output intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) + if self.output_attentions: + return attentions, layer_output return layer_output class BertEncoder(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertEncoder, self).__init__() - layer = BertLayer(config) + self.output_attentions = output_attentions + layer = BertLayer(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) - def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None): all_encoder_layers = [] - for layer_module in self.layer: - hidden_states = layer_module(hidden_states, attention_mask) + all_attentions = [] + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask, head_mask[i]) + if self.output_attentions: + attentions, hidden_states = hidden_states + all_attentions.append(attentions) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) if not output_all_encoded_layers: all_encoder_layers.append(hidden_states) + if self.output_attentions: + return all_attentions, all_encoder_layers return all_encoder_layers @@ -434,7 +531,7 @@ def __init__(self, config): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -523,8 +620,7 @@ def init_bert_weights(self, module): module.bias.data.zero_() @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, - from_tf=False, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -539,6 +635,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d . `bert-base-multilingual-uncased` . `bert-base-multilingual-cased` . `bert-base-chinese` + . `bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . 
`pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance @@ -551,56 +650,74 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] else: - archive_file = pretrained_model_name_or_path + if from_tf: + # Directly load from a TensorFlow checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, BERT_CONFIG_NAME) + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - archive_file)) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) return None - if resolved_archive_file == archive_file: - logger.info("loading archive file {}".format(archive_file)) + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), + config_file)) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) else: - logger.info("loading archive file {} from cache at {}".format( + logger.info("loading weights file {} from cache at {}".format( archive_file, resolved_archive_file)) - tempdir = None - if os.path.isdir(resolved_archive_file) or from_tf: - serialization_dir = resolved_archive_file - else: - # Extract archive to temp dir - tempdir = tempfile.mkdtemp() - logger.info("extracting archive file {} to temp dir {}".format( - resolved_archive_file, tempdir)) - with tarfile.open(resolved_archive_file, 'r:gz') as archive: - archive.extractall(tempdir) - serialization_dir = tempdir + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) # Load config - config_file = os.path.join(serialization_dir, CONFIG_NAME) - if not os.path.exists(config_file): - # Backward compatibility with old naming format - config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME) - config = BertConfig.from_json_file(config_file) + config = BertConfig.from_json_file(resolved_config_file) logger.info("Model config {}".format(config)) # Instantiate model. model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: - weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load(weights_path, map_location='cpu') - if tempdir: - # Clean up temp dir - shutil.rmtree(tempdir) + state_dict = torch.load(resolved_archive_file, map_location='cpu') if from_tf: # Directly load from a TensorFlow checkpoint - weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) return load_tf_weights_in_bert(model, weights_path) # Load from a PyTorch state_dict old_keys = [] @@ -653,7 +770,10 @@ class BertModel(BertPreTrainedModel): """BERT model ("Bidirectional Embedding Representations from a Transformer"). Params: - config: a BertConfig class instance with the configuration to build a new model + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -667,6 +787,9 @@ class BertModel(BertPreTrainedModel): input sequence length in the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
+ Outputs: Tuple of (encoded_layers, pooled_output) `encoded_layers`: controled by `output_all_encoded_layers` argument: @@ -693,14 +816,29 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertModel, self).__init__(config) + self.output_attentions = output_attentions self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) + self.encoder = BertEncoder(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.pooler = BertPooler(config) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. + Return: list (layers) of multihead module outputs with gradients + """ + return [layer.attention.self.multihead_output for layer in self.encoder.layer] + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None): if attention_mask is None: attention_mask = torch.ones_like(input_ids) if token_type_ids is None: @@ -721,14 +859,34 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + embedding_output = self.embeddings(input_ids, token_type_ids) encoded_layers = self.encoder(embedding_output, extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers) + output_all_encoded_layers=output_all_encoded_layers, + head_mask=head_mask) + if self.output_attentions: + all_attentions, encoded_layers = encoded_layers sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: encoded_layers = encoded_layers[-1] + if self.output_attentions: + return all_attentions, encoded_layers, pooled_output return encoded_layers, pooled_output @@ -739,7 +897,10 @@ class BertForPreTraining(BertPreTrainedModel): - the next sentence classification head. Params: - config: a BertConfig class instance with the configuration to build a new model. 
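The new `prune_heads` and `get_multihead_outputs` hooks are meant to be used together with the `keep_multihead_output` constructor flag. A rough sketch (head indices and inputs are made up; the attention-level `prune_heads` this relies on is added elsewhere in this patch):

```python
import torch
from pytorch_pretrained_bert import BertModel

model = BertModel.from_pretrained('bert-base-uncased',
                                  output_attentions=True,
                                  keep_multihead_output=True)
model.eval()

# Permanently remove heads 2 and 4 of layer 0 and head 7 of layer 11
model.prune_heads({0: [2, 4], 11: [7]})

input_ids = torch.tensor([[31, 51, 99]])  # hypothetical token ids
all_attentions, last_layer, pooled_output = model(input_ids,
                                                  output_all_encoded_layers=False)

# One tensor per layer; with keep_multihead_output=True each retains its
# gradient, so .grad is available after a backward pass (head importance)
multihead_outputs = model.get_multihead_outputs()
```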
+ `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -758,6 +919,8 @@ class BertForPreTraining(BertPreTrainedModel): `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `masked_lm_labels` and `next_sentence_label` are not `None`: @@ -782,15 +945,21 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForPreTraining, self).__init__(config) - self.bert = BertModel(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None): - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, pooled_output = outputs + else: + sequence_output, pooled_output = outputs prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) if masked_lm_labels is not None and next_sentence_label is not None: @@ -799,8 +968,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss return total_loss - else: - return prediction_scores, seq_relationship_score + elif self.output_attentions: + return all_attentions, prediction_scores, seq_relationship_score + return prediction_scores, seq_relationship_score class BertForMaskedLM(BertPreTrainedModel): @@ -808,7 +978,10 @@ class BertForMaskedLM(BertPreTrainedModel): This module comprises the BERT model followed by the masked language modeling head. Params: - config: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. 
Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -824,6 +997,12 @@ class BertForMaskedLM(BertPreTrainedModel): `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] + `head_mask`: an optional torch.LongTensor of shape [num_heads] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `masked_lm_labels` is not `None`: @@ -845,23 +1024,31 @@ class BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForMaskedLM, self).__init__(config) - self.bert = BertModel(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs prediction_scores = self.cls(sequence_output) if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) return masked_lm_loss - else: - return prediction_scores + elif self.output_attentions: + return all_attentions, prediction_scores + return prediction_scores class BertForNextSentencePrediction(BertPreTrainedModel): @@ -869,7 +1056,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel): This module comprises the BERT model followed by the next sentence classification head. Params: - config: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -885,6 +1075,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel): `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `next_sentence_label` is not `None`: @@ -907,23 +1099,31 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForNextSentencePrediction, self).__init__(config) - self.bert = BertModel(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.cls = BertOnlyNSPHead(config) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) - seq_relationship_score = self.cls( pooled_output) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + seq_relationship_score = self.cls(pooled_output) if next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) return next_sentence_loss - else: - return seq_relationship_score + elif self.output_attentions: + return all_attentions, seq_relationship_score + return seq_relationship_score class BertForSequenceClassification(BertPreTrainedModel): @@ -932,7 +1132,10 @@ class BertForSequenceClassification(BertPreTrainedModel): the pooled output. Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False `num_labels`: the number of classes for the classifier. Default = 2. Inputs: @@ -948,6 +1151,8 @@ class BertForSequenceClassification(BertPreTrainedModel): a batch has varying length sentences. `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
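As a rough illustration of how `output_attentions` propagates to the downstream heads (the inputs below are made up): when no labels are passed, `BertForSequenceClassification` now returns the per-layer attention maps ahead of the logits.

```python
import torch
from pytorch_pretrained_bert import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                       num_labels=2,
                                                       output_attentions=True)
model.eval()

input_ids = torch.tensor([[31, 51, 99]])  # hypothetical token ids

with torch.no_grad():
    # Without labels: (attentions, logits); with labels only the loss is returned
    all_attentions, logits = model(input_ids)

# all_attentions: one [batch_size, num_heads, seq_len, seq_len] tensor per layer
```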
Outputs: if `labels` is not `None`: @@ -971,16 +1176,22 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels): + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): super(BertForSequenceClassification, self).__init__(config) + self.output_attentions = output_attentions self.num_labels = num_labels - self.bert = BertModel(config) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -988,8 +1199,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) return loss - else: - return logits + elif self.output_attentions: + return all_attentions, logits + return logits class BertForMultipleChoice(BertPreTrainedModel): @@ -998,7 +1210,10 @@ class BertForMultipleChoice(BertPreTrainedModel): the pooled output. Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False `num_choices`: the number of classes for the classifier. Default = 2. Inputs: @@ -1014,6 +1229,8 @@ class BertForMultipleChoice(BertPreTrainedModel): a batch has varying length sentences. `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
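`BertForMultipleChoice` keeps its extra choice dimension, but `token_type_ids` and `attention_mask` may now be omitted since the flattening step guards against `None`. A small sketch with made-up ids:

```python
import torch
from pytorch_pretrained_bert import BertForMultipleChoice

model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=2)
model.eval()

# [batch_size, num_choices, sequence_length]; token_type_ids / attention_mask
# default to None and are only flattened when actually provided
input_ids = torch.tensor([[[31, 51, 99], [15, 5, 0]]])

with torch.no_grad():
    reshaped_logits = model(input_ids)  # [batch_size, num_choices]
```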
Outputs: if `labels` is not `None`: @@ -1036,19 +1253,25 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_choices): + def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False): super(BertForMultipleChoice, self).__init__(config) + self.output_attentions = output_attentions self.num_choices = num_choices - self.bert = BertModel(config) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) @@ -1057,8 +1280,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) return loss - else: - return reshaped_logits + elif self.output_attentions: + return all_attentions, reshaped_logits + return reshaped_logits class BertForTokenClassification(BertPreTrainedModel): @@ -1067,7 +1291,10 @@ class BertForTokenClassification(BertPreTrainedModel): the full hidden state of the last layer. Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False `num_labels`: the number of classes for the classifier. Default = 2. Inputs: @@ -1083,6 +1310,8 @@ class BertForTokenClassification(BertPreTrainedModel): a batch has varying length sentences. `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
Outputs: if `labels` is not `None`: @@ -1106,16 +1335,22 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels): + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): super(BertForTokenClassification, self).__init__(config) + self.output_attentions = output_attentions self.num_labels = num_labels - self.bert = BertModel(config) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1130,8 +1365,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) return loss - else: - return logits + elif self.output_attentions: + return all_attentions, logits + return logits class BertForQuestionAnswering(BertPreTrainedModel): @@ -1140,7 +1376,10 @@ class BertForQuestionAnswering(BertPreTrainedModel): the sequence output that computes start_logits and end_logits Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -1159,6 +1398,8 @@ class BertForQuestionAnswering(BertPreTrainedModel): `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. Positions are clamped to the length of the sequence and position outside of the sequence are not taken into account for computing the loss. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
Outputs: if `start_positions` and `end_positions` are not `None`: @@ -1181,16 +1422,23 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForQuestionAnswering, self).__init__(config) - self.bert = BertModel(config) - # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version - # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.qa_outputs = nn.Linear(config.hidden_size, 2) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, + end_positions=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) @@ -1212,5 +1460,6 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_pos end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 return total_loss - else: - return start_logits, end_logits + elif self.output_attentions: + return all_attentions, start_logits, end_logits + return start_logits, end_logits diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 063c525d98cd91..c17a7e7d268963 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -23,9 +23,6 @@ import logging import math import os -import shutil -import tarfile -import tempfile import sys from io import open @@ -39,8 +36,34 @@ logger = logging.getLogger(__name__) -PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"} -PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"} +PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"} +PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"} + +def prune_conv1d_layer(layer, index, dim=1): + """ Prune a Conv1D layer (a model parameters) to keep only entries in index. + A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. 
+ """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if dim == 0: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model @@ -107,18 +130,24 @@ class GPT2Config(object): def __init__( self, vocab_size_or_config_json_file=50257, + n_special=0, n_positions=1024, n_ctx=1024, n_embd=768, n_layer=12, n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, + predict_special_tokens=True ): """Constructs GPT2Config. Args: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -126,8 +155,14 @@ def __init__( n_head: Number of attention heads for each attention layer in the Transformer encoder. layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. 
+ predict_special_tokens: should we predict special tokens (when the model has a LM head) """ if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -137,19 +172,28 @@ def __init__( self.__dict__[key] = value elif isinstance(vocab_size_or_config_json_file, int): self.vocab_size = vocab_size_or_config_json_file + self.n_special = n_special self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens else: raise ValueError( "First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)" ) + @property + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special + @classmethod def from_dict(cls, json_object): """Constructs a `GPT2Config` from a Python dictionary of parameters.""" @@ -200,7 +244,7 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Attention, self).__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] @@ -209,10 +253,33 @@ def __init__(self, nx, n_ctx, config, scale=False): self.n_head = config.n_head self.split_size = n_state self.scale = scale + + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) - - def _attn(self, q, k, v): + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.n_head, self.split_size // self.n_head) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + # Update hyper params + self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) + self.n_head = self.n_head - len(heads) + + def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) @@ -221,6 +288,14 @@ def _attn(self, q, k, v): w = w * b - 1e4 * (1 - b) w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + + if self.output_attentions: + return w, torch.matmul(w, v) return torch.matmul(w, v) def merge_heads(self, x): @@ -236,7 +311,7 @@ def split_heads(self, x, k=False): else: return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - def forward(self, x, layer_past=None): + def forward(self, x, layer_past=None, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) @@ -247,9 +322,19 @@ def forward(self, x, layer_past=None): key = torch.cat((past_key, key), dim=-1) value = 
torch.cat((past_value, value), dim=-2) present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking - a = self._attn(query, key, value) + + a = self._attn(query, key, value, head_mask) + if self.keep_multihead_output: + self.multihead_output = a + self.multihead_output.retain_grad() + + if self.output_attentions: + attentions, a = a a = self.merge_heads(a) a = self.c_proj(a) + a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a, present return a, present @@ -260,27 +345,35 @@ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) self.act = gelu + self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) - return h2 + return self.dropout(h2) class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False): + def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Block, self).__init__() nx = config.n_embd + self.output_attentions = output_attentions self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) - self.attn = Attention(nx, n_ctx, config, scale) + self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) - def forward(self, x, layer_past=None): - a, present = self.attn(self.ln_1(x), layer_past=layer_past) + def forward(self, x, layer_past=None, head_mask=None): + output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask) + if self.output_attentions: + attentions, a, present = output_attn + else: + a, present = output_attn x = x + a m = self.mlp(self.ln_2(x)) x = x + m + if self.output_attentions: + return attentions, x, present return x, present @@ -290,17 +383,20 @@ class GPT2LMHead(nn.Module): def __init__(self, model_embeddings_weights, config): super(GPT2LMHead, self).__init__() self.n_embd = config.n_embd - self.set_embeddings_weights(model_embeddings_weights) - - def set_embeddings_weights(self, model_embeddings_weights): + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens embed_shape = model_embeddings_weights.shape self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.set_embeddings_weights(model_embeddings_weights) + + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens self.decoder.weight = model_embeddings_weights # Tied weights def forward(self, hidden_state): - # Truncated Language modeling logits (we remove the last token) - # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] return lm_logits @@ -310,6 +406,7 @@ class GPT2MultipleChoiceHead(nn.Module): def __init__(self, config): super(GPT2MultipleChoiceHead, self).__init__() self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation self.linear = nn.Linear(config.n_embd, 1) nn.init.normal_(self.linear.weight, std=0.02) @@ -323,6 +420,7 @@ def forward(self, hidden_states, mc_token_ids): # (bsz, num_choices, 1, hidden_size) multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) # (bsz, num_choices, hidden_size) + multiple_choice_h = 
self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) # (bsz, num_choices) return multiple_choice_logits @@ -345,9 +443,6 @@ def __init__(self, config, *inputs, **kwargs): ) self.config = config - def set_tied(self): - pass - def init_weights(self, module): """ Initialize the weights. """ @@ -362,9 +457,7 @@ def init_weights(self, module): module.bias.data.zero_() @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -382,8 +475,17 @@ def from_pretrained( from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific GPT class + *inputs, **kwargs: additional input for the specific GPT2 class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + num_special_tokens = kwargs.get('num_special_tokens', None) + kwargs.pop('num_special_tokens', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] @@ -393,16 +495,37 @@ def from_pretrained( # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file + ) + ) + return None + try: resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - archive_file, config_file + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + config_file + ) ) - ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) @@ -475,16 +598,37 @@ def load(module, prefix=""): "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) ) - # Make sure we are still sharing the output and input embeddings after loading weights - model.set_tied() + # Add additional embeddings for special tokens if needed + # This step also make sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) return model class GPT2Model(GPT2PreTrainedModel): """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners"). + GPT-2 use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + Params: - config: a GPT2Config class instance with the configuration to build a new model + `config`: a GPT2Config class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -499,10 +643,12 @@ class GPT2Model(GPT2PreTrainedModel): `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the presents output of the model, cf. below). + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs a tuple consisting of: - `hidden_states`: the encoded-hidden-states at the top of the model - as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) + as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... 
d_n are the dimension of input_ids) `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as torch.FloatTensors. They can be reused to speed up sequential decoding. @@ -519,17 +665,47 @@ class GPT2Model(GPT2PreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(GPT2Model, self).__init__(config) - self.wte = nn.Embedding(config.vocab_size, config.n_embd) + self.output_attentions = output_attentions + self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) - block = Block(config.n_ctx, config, scale=True) + self.drop = nn.Dropout(config.embd_pdrop) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.apply(self.init_weights) - def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): + def set_num_special_tokens(self, num_special_tokens): + " Update input embeddings with new embedding matrice if needed " + if self.config.n_special == num_special_tokens: + return + # Update config + self.config.n_special = num_special_tokens + # Build new embeddings and initialize all new embeddings (in particular the special tokens) + old_embed = self.wte + self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) + self.wte.to(old_embed.weight.device) + self.init_weights(self.wte) + # Copy word embeddings from the previous weights + self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. 
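On the GPT-2 side, the special-token embeddings and head pruning described above can be driven from `from_pretrained`. A hedged sketch (the token count and head indices are illustrative only):

```python
from pytorch_pretrained_bert import GPT2LMHeadModel

# num_special_tokens is forwarded by from_pretrained() to set_num_special_tokens()
# once the pre-trained weights have been loaded
model = GPT2LMHeadModel.from_pretrained('gpt2', num_special_tokens=2)

# Pre-trained word vectors keep indices [0, vocab_size); the freshly initialized
# special-token rows live in [vocab_size, total_tokens_embeddings)
assert model.transformer.wte.num_embeddings == model.config.total_tokens_embeddings

# Head pruning takes the same {layer: [head indices]} dict as on the BERT side
model.transformer.prune_heads({0: [0], 5: [3, 7]})
```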
+ Return: list (layers) of multihead module outputs with gradients + """ + return [h.attn.multihead_output for h in self.h] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): if past is None: past_length = 0 past = [None] * len(self.h) @@ -539,6 +715,20 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.n_layer + input_shape = input_ids.size() input_ids = input_ids.view(-1, input_ids.size(-1)) position_ids = position_ids.view(-1, position_ids.size(-1)) @@ -551,20 +741,38 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + presents = [] - for block, layer_past in zip(self.h, past): - hidden_states, present = block(hidden_states, layer_past) + all_attentions = [] + all_hidden_states = [] + for i, (block, layer_past) in enumerate(zip(self.h, past)): + all_hidden_states.append(hidden_states.view(*output_shape)) + outputs = block(hidden_states, layer_past, head_mask[i]) + if self.output_attentions: + attentions, hidden_states, present = outputs + all_attentions.append(attentions) + else: + hidden_states, present = outputs presents.append(present) hidden_states = self.ln_f(hidden_states) - output_shape = input_shape + (hidden_states.size(-1),) - return hidden_states.view(*output_shape), presents + all_hidden_states.append(hidden_states.view(*output_shape)) + + if self.output_attentions: + return all_attentions, all_hidden_states, presents + return all_hidden_states, presents class GPT2LMHeadModel(GPT2PreTrainedModel): """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners"). Params: - config: a GPT2Config class instance with the configuration to build a new model + `config`: a GPT2Config class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -582,6 +790,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the presents output of the model, cf. below). + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` is not `None`: @@ -604,30 +814,41 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(GPT2LMHeadModel, self).__init__(config) - self.transformer = GPT2Model(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.apply(self.init_weights) - def set_tied(self): - """ Make sure we are sharing the embeddings + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings """ - self.lm_head.set_embeddings_weights(self.transformer.wte.weight) + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output + hidden_states = hidden_states[-1] - def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None): - hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1].contiguous() - shift_labels = lm_labels[:, 1:].contiguous() - + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits, presents return lm_logits, presents @@ -635,7 +856,10 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners"). Params: - config: a GPT2Config class instance with the configuration to build a new model + `config`: a GPT2Config class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. 
Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token @@ -657,6 +881,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the presents output of the model, cf. below). + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` and `multiple_choice_labels` are not `None`: @@ -675,37 +901,49 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): config = modeling_gpt2.GPT2Config() - model = modeling_gpt2.GPT2LMHeadModel(config) + model = modeling_gpt2.GPT2DoubleHeadsModel(config) lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(GPT2DoubleHeadsModel, self).__init__(config) - self.transformer = GPT2Model(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.multiple_choice_head = GPT2MultipleChoiceHead(config) self.apply(self.init_weights) - def set_tied(self): - """ Make sure we are sharing the embeddings + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings """ - self.lm_head.set_embeddings_weights(self.transformer.wte.weight) + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None, past=None, head_mask=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output + hidden_states = hidden_states[-1] - def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None): - hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1].contiguous() - shift_labels = lm_labels[:, 1:].contiguous() + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(shift_logits.view(-1, - shift_logits.size(-1)), shift_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: 
loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) if losses: return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits, presents return lm_logits, mc_logits, presents diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index f956462ddbfff0..f02f016ef1cb58 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -23,9 +23,6 @@ import logging import math import os -import shutil -import tarfile -import tempfile import sys from io import open @@ -36,6 +33,7 @@ from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME from .modeling import BertLayerNorm as LayerNorm +from .modeling_gpt2 import prune_conv1d_layer logger = logging.getLogger(__name__) @@ -143,6 +141,7 @@ def __init__( attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, + predict_special_tokens=True ): """Constructs OpenAIGPTConfig. @@ -165,6 +164,7 @@ def __init__( layer_norm_epsilon: epsilon to use in the layer norm layers initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) """ if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -186,6 +186,7 @@ def __init__( self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens else: raise ValueError( "First argument must be either a vocabulary size (int)" @@ -253,7 +254,7 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Attention, self).__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] @@ -262,12 +263,33 @@ def __init__(self, nx, n_ctx, config, scale=False): self.n_head = config.n_head self.split_size = n_state self.scale = scale + + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + self.c_attn = Conv1D(n_state * 3, 1, nx) self.c_proj = Conv1D(n_state, 1, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) - def _attn(self, q, k, v): + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.n_head, self.split_size // self.n_head) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + # Update hyper params + self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) + self.n_head = self.n_head - len(heads) + + def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) @@ -278,6 +300,13 @@ def _attn(self, q, k, v): w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) + + # Mask heads if we want to + if head_mask is not None: + w = w * 
head_mask + + if self.output_attentions: + return w, torch.matmul(w, v) return torch.matmul(w, v) def merge_heads(self, x): @@ -293,16 +322,25 @@ def split_heads(self, x, k=False): else: return x.permute(0, 2, 1, 3) - def forward(self, x): + def forward(self, x, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) - a = self._attn(query, key, value) + + a = self._attn(query, key, value, head_mask) + if self.keep_multihead_output: + self.multihead_output = a + self.multihead_output.retain_grad() + + if self.output_attentions: + attentions, a = a a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a return a @@ -322,19 +360,24 @@ def forward(self, x): class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False): + def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Block, self).__init__() nx = config.n_embd - self.attn = Attention(nx, n_ctx, config, scale) + self.output_attentions = output_attentions + self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output) self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) - def forward(self, x): - a = self.attn(x) + def forward(self, x, head_mask=None): + a = self.attn(x, head_mask=head_mask) + if self.output_attentions: + attentions, a = a n = self.ln_1(x + a) m = self.mlp(n) h = self.ln_2(n + m) + if self.output_attentions: + return attentions, h return h @@ -344,17 +387,21 @@ class OpenAIGPTLMHead(nn.Module): def __init__(self, model_embeddings_weights, config): super(OpenAIGPTLMHead, self).__init__() self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens + embed_shape = model_embeddings_weights.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.set_embeddings_weights(model_embeddings_weights) - def set_embeddings_weights(self, model_embeddings_weights): + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens embed_shape = model_embeddings_weights.shape - self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.decoder.weight = model_embeddings_weights # Tied weights def forward(self, hidden_state): - # Truncated Language modeling logits (we remove the last token) - # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] return lm_logits @@ -364,7 +411,6 @@ class OpenAIGPTMultipleChoiceHead(nn.Module): def __init__(self, config): super(OpenAIGPTMultipleChoiceHead, self).__init__() self.n_embd = config.n_embd - # self.multiple_choice_token = multiple_choice_token self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation self.linear = nn.Linear(config.n_embd, 1) @@ -415,13 +461,8 @@ def init_weights(self, module): if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() - def set_num_special_tokens(self, num_special_tokens): - pass - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, 
cache_dir=None, from_tf=False, *inputs, **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs): """ Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -434,14 +475,20 @@ def from_pretrained( . `openai_gpt_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model + . `openai-gpt-config.json` a configuration file for the model . a series of NumPy files containing OpenAI TensorFlow trained weights from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + *inputs, **kwargs: additional input for the specific OpenAI-GPT class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] @@ -451,16 +498,37 @@ def from_pretrained( # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file + ) + ) + return None + try: resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - archive_file, config_file + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + config_file + ) ) - ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) @@ -560,7 +628,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a OpenAIGPTConfig class instance with the configuration to build a new model + `config`: a OpenAIGPTConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -572,10 +643,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: - `hidden_states`: the encoded-hidden-states at the top of the model - as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) + as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) Example usage: @@ -590,17 +663,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(OpenAIGPTModel, self).__init__(config) - num_tokens = config.vocab_size + config.n_special - self.tokens_embed = nn.Embedding(num_tokens, config.n_embd) + self.output_attentions = output_attentions + self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) - block = Block(config.n_ctx, config, scale=True) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.apply(self.init_weights) - # nn.init.normal_(self.embed.weight, std=0.02) def set_num_special_tokens(self, num_special_tokens): " Update input embeddings with new embedding matrice if needed " @@ -616,7 +689,20 @@ def set_num_special_tokens(self, num_special_tokens): # Copy word embeddings from the previous weights self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] - def forward(self, input_ids, position_ids=None, token_type_ids=None): + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. + Return: list (layers) of multihead module outputs with gradients + """ + return [h.attn.multihead_output for h in self.h] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None): if position_ids is None: # This was used when we had a single embedding matrice from position and token embeddings # start = self.config.vocab_size + self.config.n_special @@ -625,6 +711,20 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None): position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.n_layer + input_shape = input_ids.size() input_ids = input_ids.view(-1, input_ids.size(-1)) position_ids = position_ids.view(-1, position_ids.size(-1)) @@ -636,13 +736,25 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None): token_type_embeds = self.tokens_embed(token_type_ids) else: token_type_embeds = 0 - # Add the position information to the input embeddings - # h = e.sum(dim=2) hidden_states = inputs_embeds + position_embeds + token_type_embeds - for block in self.h: - hidden_states = block(hidden_states) + hidden_states = self.drop(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) - return hidden_states.view(*output_shape) + + all_attentions = [] + all_hidden_states = [hidden_states.view(*output_shape)] + for i, block in enumerate(self.h): + outputs = block(hidden_states, head_mask[i]) + if self.output_attentions: + attentions, hidden_states = outputs + all_attentions.append(attentions) + else: + hidden_states = outputs + all_hidden_states.append(hidden_states.view(*output_shape)) + + if self.output_attentions: + return all_attentions, all_hidden_states + return all_hidden_states class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): @@ -666,7 +778,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a OpenAIGPTConfig class instance with the configuration to build a new model + `config`: a OpenAIGPTConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -681,6 +796,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` is not `None`: @@ -701,21 +818,27 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(OpenAIGPTLMHeadModel, self).__init__(config) - self.transformer = OpenAIGPTModel(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens): + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrice Make sure we are sharing the embeddings """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + hidden_states = hidden_states[-1] - def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None): - hidden_states = self.transformer(input_ids, position_ids, token_type_ids) lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n @@ -726,6 +849,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits return lm_logits @@ -750,7 +875,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a OpenAIGPTConfig class instance with the configuration to build a new model + `config`: a OpenAIGPTConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token @@ -769,6 +897,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): is only computed for the labels set in [0, ..., total_tokens_embeddings] `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` and `multiple_choice_labels` are not `None`: @@ -785,27 +915,34 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): config = modeling_openai.OpenAIGPTConfig() - model = modeling_openai.OpenAIGPTLMHeadModel(config) + model = modeling_openai.OpenAIGPTDoubleHeadsModel(config) lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(OpenAIGPTDoubleHeadsModel, self).__init__(config) - self.transformer = OpenAIGPTModel(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens): + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrice Make sure we are sharing the embeddings """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None, head_mask=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + hidden_states = hidden_states[-1] - def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None): - hidden_states = self.transformer(input_ids, position_ids, token_type_ids) lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] @@ -819,4 +956,6 @@ def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) if losses: return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits return lm_logits, mc_logits diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index e8fffc5b60894b..534a111c7781fc 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -25,9 +25,6 @@ import json import math import logging -import tarfile -import tempfile -import shutil import collections 
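# --- Illustrative sketch (not part of this diff): the `keep_multihead_output` /
# `get_multihead_outputs()` hooks introduced above can be used to estimate per-head
# importance from gradients. The tiny config, the random input and the gradient-magnitude
# heuristic below are assumptions for illustration only.
import torch
from pytorch_pretrained_bert.modeling_openai import OpenAIGPTConfig, OpenAIGPTLMHeadModel

config = OpenAIGPTConfig(vocab_size_or_config_json_file=320, n_positions=128, n_ctx=128,
                         n_embd=32, n_layer=2, n_head=4)   # tiny random model for the sketch
model = OpenAIGPTLMHeadModel(config, keep_multihead_output=True)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 10))
lm_logits = model(input_ids)            # shape [batch, seq_len, vocab]
lm_logits.sum().backward()

# One tensor per layer, shape [batch, n_head, seq_len, head_dim], with gradients retained.
multihead_outputs = model.transformer.get_multihead_outputs()
head_importance = torch.stack([o.grad.abs().sum(dim=(0, 2, 3)) for o in multihead_outputs])
print(head_importance.shape)            # torch.Size([2, 4]), i.e. [n_layer, n_head]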
import sys from io import open @@ -888,8 +885,7 @@ def set_num_special_tokens(self, num_special_tokens): pass @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, - from_tf=False, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -897,19 +893,25 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: - . `transfo-xl` + . `transfo-xl-wt103` - a path or url to a pretrained model archive containing: . `transfo_xl_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model + . `transfo_xl_config.json` a configuration file for the model . `model.chkpt` a TensorFlow checkpoint from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + *inputs, **kwargs: additional input for the specific TransformerXL class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] @@ -919,16 +921,37 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file + ) + ) + return None + try: resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - archive_file, config_file)) + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + config_file + ) + ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py index 647ba7774c1522..7fd67adb35879e 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py @@ -114,10 +114,10 @@ def forward(self, hidden, target=None, keep_order=False): logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if target is not None: - output = -F.log_softmax(logit, dim=-1) \ + out = -F.log_softmax(logit, dim=-1) \ .gather(1, target.unsqueeze(1)).squeeze(1) else: - output = F.log_softmax(logit, dim=-1) + out = F.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index aa59c7d7ecf717..03856956ace692 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -20,33 +20,163 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ import logging +import abc +import sys logger = logging.getLogger(__name__) -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - x_ = (x - warmup) / (1 - warmup) # progress after warmup - - return 0.5 * (1. + math.cos(math.pi * x_)) - -def warmup_constant(x, warmup=0.002): - """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. - Learning rate is 1. afterwards. """ - if x < warmup: - return x/warmup - return 1.0 - -def warmup_linear(x, warmup=0.002): - """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. - After `t_total`-th training step, learning rate is zero. """ - if x < warmup: - return x/warmup - return max((x-1.)/(warmup-1.), 0) + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. 
+ progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. + :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. 
+ math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + SCHEDULES = { - 'warmup_cosine': warmup_cosine, - 'warmup_constant': warmup_constant, - 'warmup_linear': warmup_linear, + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule } @@ -56,8 +186,11 @@ class BertAdam(Optimizer): lr: learning rate warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 t_total: total number of training steps for the learning - rate schedule, -1 means constant learning rate. Default: -1 - schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` b1: Adams b1. Default: 0.9 b2: Adams b2. Default: 0.999 e: Adams epsilon. Default: 1e-6 @@ -65,21 +198,26 @@ class BertAdam(Optimizer): max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 """ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, - max_grad_norm=1.0): + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) if not 0.0 <= b2 < 1.0: raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -91,11 +229,8 @@ def get_lr(self): state = self.state[p] if len(state) == 0: return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) lr.append(lr_scheduled) return lr @@ -110,8 +245,6 @@ def step(self, closure=None): if closure is not None: loss = closure() - warned_for_t_total = False - for group in self.param_groups: for p in group['params']: if p.grad is None: @@ -153,19 +286,8 @@ def step(self, closure=None): if group['weight_decay'] > 0.0: update += group['weight_decay'] * p.data - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - progress = state['step']/group['t_total'] - lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) - # warning for exceeding t_total (only active with warmup_linear - if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: - logger.warning( - "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " - "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) - warned_for_t_total = True - # end warning - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) update_with_lr = lr_scheduled * update p.data.add_(-update_with_lr) diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 99ac15e1089a51..bff4ebe61f462f 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -20,35 +20,11 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ import logging +from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \ + WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule logger = logging.getLogger(__name__) -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - x_ = (x - warmup) / (1 - warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * x_)) - -def warmup_constant(x, warmup=0.002): - """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. - Learning rate is 1. afterwards. """ - if x < warmup: - return x/warmup - return 1.0 - -def warmup_linear(x, warmup=0.002): - """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to OpenAIAdam) training step. - After `t_total`-th training step, learning rate is zero. """ - if x < warmup: - return x/warmup - return max((x-1.)/(warmup-1.), 0) - -SCHEDULES = { - 'warmup_cosine':warmup_cosine, - 'warmup_constant':warmup_constant, - 'warmup_linear':warmup_linear, -} - class OpenAIAdam(Optimizer): """Implements Open AI version of Adam algorithm with weight decay fix. 
@@ -58,17 +34,23 @@ def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_t vector_l2=False, max_grad_norm=-1, **kwargs): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) if not 0.0 <= b1 < 1.0: - raise ValueError("Invalid b1 parameter: {}".format(b1)) + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) if not 0.0 <= b2 < 1.0: - raise ValueError("Invalid b2 parameter: {}".format(b2)) + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) if not e >= 0.0: - raise ValueError("Invalid epsilon value: {}".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, max_grad_norm=max_grad_norm) super(OpenAIAdam, self).__init__(params, defaults) @@ -80,11 +62,8 @@ def get_lr(self): state = self.state[p] if len(state) == 0: return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) lr.append(lr_scheduled) return lr @@ -99,8 +78,6 @@ def step(self, closure=None): if closure is not None: loss = closure() - warned_for_t_total = False - for group in self.param_groups: for p in group['params']: if p.grad is None: @@ -136,19 +113,8 @@ def step(self, closure=None): bias_correction1 = 1 - beta1 ** state['step'] bias_correction2 = 1 - beta2 ** state['step'] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - progress = state['step']/group['t_total'] - lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) - # warning for exceeding t_total (only active with warmup_linear - if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: - logger.warning( - "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. 
" - "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) - warned_for_t_total = True - # end warning - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 3937d6e0118921..d37165d888135d 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -34,6 +34,12 @@ 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 'bert-base-uncased': 512, @@ -43,6 +49,12 @@ 'bert-base-multilingual-uncased': 512, 'bert-base-multilingual-cased': 512, 'bert-base-chinese': 512, + 'bert-base-german-cased': 512, + 'bert-large-uncased-whole-word-masking': 512, + 'bert-large-cased-whole-word-masking': 512, + 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, + 'bert-large-cased-whole-word-masking-finetuned-squad': 512, + 'bert-base-cased-finetuned-mrpc': 512, } VOCAB_NAME = 'vocab.txt' @@ -175,13 +187,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) return None if resolved_vocab_file == vocab_file: logger.info("loading vocabulary file {}".format(vocab_file)) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 07777292a3fa50..78f7f59d656e20 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -37,9 +37,11 @@ def lru_cache(): PRETRAINED_VOCAB_ARCHIVE_MAP = { 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", } PRETRAINED_MERGES_ARCHIVE_MAP = { 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 'gpt2': 1024, @@ -91,7 +93,7 @@ class GPT2Tokenizer(object): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ - Instantiate a PreTrainedBertModel from a pre-trained model file. + Instantiate a GPT2Tokenizer from a pre-trained model file. Download and cache the pre-trained model file if needed. """ if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: @@ -111,14 +113,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - vocab_file, merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) @@ -221,7 +228,10 @@ def tokenize(self, text): """ Tokenize a string. 
""" bpe_tokens = [] for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[ord(b)] for b in token) + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens @@ -260,9 +270,14 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): def encode(self, text): return self.convert_tokens_to_ids(self.tokenize(text)) - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) + def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True): + text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens)) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + if clean_up_tokenization_spaces: + text = text.replace('', '') + text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") return text def save_vocabulary(self, vocab_path): diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index 214a476ce9604a..52d735efa812ae 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -101,14 +101,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - vocab_file, merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) @@ -272,7 +277,7 @@ def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=Tr out_string = ''.join(tokens).replace('', ' ').strip() if clean_up_tokenization_spaces: out_string = out_string.replace('', '') - out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ',' + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") return out_string diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py index ddebc57c1068e6..6a882e0a7f9c4b 100644 --- a/pytorch_pretrained_bert/tokenization_transfo_xl.py +++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -71,14 +71,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file)) return None if resolved_vocab_file == vocab_file: logger.info("loading vocabulary file {}".format(vocab_file)) diff --git a/setup.py b/setup.py index 4cf9739a4b19c3..fe7990447df444 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="pytorch_pretrained_bert", - version="0.6.1", + version="0.6.2", author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors", author_email="thomas@huggingface.co", description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index 8f4581b37f8473..589de22f5d93a8 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -41,6 +41,7 @@ def __init__(self, use_token_type_ids=True, use_labels=True, vocab_size=99, + n_special=1, n_positions=33, n_embd=32, n_layer=5, @@ -58,6 +59,7 @@ def __init__(self, self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels self.vocab_size = vocab_size + self.n_special = n_special self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer @@ -69,7 +71,8 @@ def __init__(self, self.scope = scope def prepare_config_and_inputs(self): - input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) + total_num_tokens = self.vocab_size + self.n_special + input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) position_ids = None if self.use_position_ids: @@ -90,6 +93,7 @@ def prepare_config_and_inputs(self): config = GPT2Config( vocab_size_or_config_json_file=self.vocab_size, + n_special=self.n_special, n_positions=self.n_positions, n_embd=self.n_embd, n_layer=self.n_layer, @@ -111,8 +115,9 @@ def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids, return outputs def check_gpt2_model_output(self, result): + self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1) self.parent.assertListEqual( - list(result["hidden_states"].size()), + list(result["hidden_states"][0].size()), [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) @@ -129,11 +134,29 @@ def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids, } return outputs + def create_gpt2_lm_head_with_output_attention(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = GPT2LMHeadModel(config, output_attentions=True) + model.eval() + loss = model(input_ids, position_ids, token_type_ids, lm_labels) + attentions, lm_logits, presents = model(input_ids, position_ids, token_type_ids) + outputs = { + "loss": loss, + "lm_logits": lm_logits, + "presents": presents, + "attentions": attentions, + } + return outputs + def check_gpt2_lm_head_output(self, result): - total_voc = self.vocab_size + total_voc = self.n_special + self.vocab_size self.parent.assertListEqual( list(result["lm_logits"].size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertEqual(self.n_layer, len(result["presents"])) + self.parent.assertListEqual( + list(result["presents"][0].size()), + [2, self.batch_size * self.n_choices, self.n_head, self.seq_length, self.n_embd // self.n_head]) def check_gpt2_lm_head_loss_output(self, result): 
self.parent.assertListEqual( @@ -156,8 +179,25 @@ def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_i } return outputs + def create_gpt2_double_heads_with_output_attention(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = GPT2DoubleHeadsModel(config, output_attentions=True) + model.eval() + loss = model(input_ids, mc_token_ids, + lm_labels=lm_labels, mc_labels=mc_labels, + token_type_ids=token_type_ids, position_ids=position_ids) + attentions, lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) + outputs = { + "loss": loss, + "lm_logits": lm_logits, + "mc_logits": mc_logits, + "presents": presents, + "attentions": attentions, + } + return outputs + def check_gpt2_double_heads_output(self, result): - total_voc = self.vocab_size + total_voc = self.n_special + self.vocab_size self.parent.assertListEqual( list(result["lm_logits"].size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) @@ -170,6 +210,98 @@ def check_gpt2_double_heads_loss_output(self, result): [list(l.size()) for l in result["loss"]], [[], []]) + def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device) + head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 0.0 # Mask all but the first head on the last layer + if isinstance(model, GPT2DoubleHeadsModel): + output = model(input_ids, mc_token_ids, head_mask=head_mask) + else: + output = model(input_ids, head_mask=head_mask) + + if isinstance(model, GPT2Model): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output[:-1]) + output = output.sum() + output.backward() + multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[0][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + self.parent.assertEqual( + len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[1].nonzero()), + multihead_outputs[1].numel()) + + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 1:, :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + 
def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + transformer = model if isinstance(model, GPT2Model) else model.transformer + heads_to_prune = {0: list(range(1, self.n_head)), + -1: [0]} + transformer.prune_heads(heads_to_prune) + if isinstance(model, GPT2DoubleHeadsModel): + output = model(input_ids, mc_token_ids) + else: + output = model(input_ids) + + if isinstance(model, GPT2Model): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output[:-1]) + output = output.sum() + output.backward() + multihead_outputs = transformer.get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, 1, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head-1, + self.seq_length, self.n_embd // self.n_head]) + + def test_default(self): self.run_tester(GPT2ModelTest.GPT2ModelTester(self)) @@ -208,6 +340,9 @@ def run_tester(self, tester): tester.check_gpt2_double_heads_output(output_result) tester.check_gpt2_double_heads_loss_output(output_result) + tester.create_and_check_gpt2_for_headmasking(*config_and_inputs) + tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs) + @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index 4e7d9d542b0ba4..c8fc8f48fc3be9 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -125,8 +125,9 @@ def create_openai_model(self, config, input_ids, token_type_ids, position_ids, return outputs def check_openai_model_output(self, result): + self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1) self.parent.assertListEqual( - list(result["hidden_states"].size()), + list(result["hidden_states"][0].size()), [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) @@ -182,6 +183,99 @@ def check_openai_double_heads_loss_output(self, result): [list(l.size()) for l in result["loss"]], [[], []]) + def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device) + head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 0.0 # Mask all but the first head on the last layer + if isinstance(model, OpenAIGPTDoubleHeadsModel): + output = model(input_ids, mc_token_ids, head_mask=head_mask) + else: + output = model(input_ids, head_mask=head_mask) + + if isinstance(model, OpenAIGPTModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in 
output) + output = output.sum() + output.backward() + multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[0][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + self.parent.assertEqual( + len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[1].nonzero()), + multihead_outputs[1].numel()) + + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 1:, :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + + def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer + heads_to_prune = {0: list(range(1, self.n_head)), + -1: [0]} + transformer.prune_heads(heads_to_prune) + if isinstance(model, OpenAIGPTDoubleHeadsModel): + output = model(input_ids, mc_token_ids) + else: + output = model(input_ids) + + if isinstance(model, OpenAIGPTModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) + output = output.sum() + output.backward() + multihead_outputs = transformer.get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, 1, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head-1, + self.seq_length, self.n_embd // self.n_head]) + + def test_default(self): self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self)) @@ -220,6 +314,9 @@ def run_tester(self, tester): tester.check_openai_double_heads_output(output_result) tester.check_openai_double_heads_loss_output(output_result) + tester.create_and_check_openai_for_headmasking(*config_and_inputs) + tester.create_and_check_openai_for_head_pruning(*config_and_inputs) + @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" diff --git a/tests/modeling_test.py 
b/tests/modeling_test.py index 5cde383fdfe76d..126c6fad13a32e 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -28,7 +28,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, BertForNextSentencePrediction, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification) + BertForTokenClassification, BertForMultipleChoice) from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP @@ -56,6 +56,7 @@ def __init__(self, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, + num_choices=4, scope=None): self.parent = parent self.batch_size = batch_size @@ -77,6 +78,7 @@ def __init__(self, self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_labels = num_labels + self.num_choices = num_choices self.scope = scope def prepare_config_and_inputs(self): @@ -92,9 +94,11 @@ def prepare_config_and_inputs(self): sequence_labels = None token_labels = None + choice_labels = None if self.use_labels: sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = BertModelTest.ids_tensor([self.batch_size], self.num_choices) config = BertConfig( vocab_size_or_config_json_file=self.vocab_size, @@ -109,14 +113,14 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range) - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): self.parent.assertListEqual( list(result["loss"].size()), []) - def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertModel(config=config) model.eval() all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) @@ -137,7 +141,7 @@ def check_bert_model_output(self, result): self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForMaskedLM(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, token_labels) @@ -153,7 +157,7 @@ def check_bert_for_masked_lm_output(self, result): list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]) - def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForNextSentencePrediction(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, sequence_labels) @@ -170,7 +174,7 @@ def check_bert_for_next_sequence_prediction_output(self, result): [self.batch_size, 2]) - def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def 
create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForPreTraining(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) @@ -191,7 +195,7 @@ def check_bert_for_pretraining_output(self, result): [self.batch_size, 2]) - def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForQuestionAnswering(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) @@ -212,7 +216,7 @@ def check_bert_for_question_answering_output(self, result): [self.batch_size, self.seq_length]) - def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForSequenceClassification(config=config, num_labels=self.num_labels) model.eval() loss = model(input_ids, token_type_ids, input_mask, sequence_labels) @@ -229,7 +233,7 @@ def check_bert_for_sequence_classification_output(self, result): [self.batch_size, self.num_labels]) - def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForTokenClassification(config=config, num_labels=self.num_labels) model.eval() loss = model(input_ids, token_type_ids, input_mask, token_labels) @@ -246,6 +250,150 @@ def check_bert_for_token_classification_output(self, result): [self.batch_size, self.seq_length, self.num_labels]) + def create_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertForMultipleChoice(config=config, num_choices=self.num_choices) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + loss = model(multiple_choice_inputs_ids, + multiple_choice_token_type_ids, + multiple_choice_input_mask, + choice_labels) + logits = model(multiple_choice_inputs_ids, + multiple_choice_token_type_ids, + multiple_choice_input_mask) + outputs = { + "loss": loss, + "logits": logits, + } + return outputs + + def check_bert_for_multiple_choice(self, result): + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.num_choices]) + + + def create_and_check_bert_for_attentions(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification): + if model_class in [BertForSequenceClassification, + BertForTokenClassification]: + model = model_class(config=config, num_labels=self.num_labels, output_attentions=True) + else: + model = 
model_class(config=config, output_attentions=True) + model.eval() + output = model(input_ids, token_type_ids, input_mask) + attentions = output[0] + self.parent.assertEqual(len(attentions), self.num_hidden_layers) + self.parent.assertListEqual( + list(attentions[0].size()), + [self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length]) + + + def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification): + if model_class in [BertForSequenceClassification, + BertForTokenClassification]: + model = model_class(config=config, + num_labels=self.num_labels, + keep_multihead_output=True) + else: + model = model_class(config=config, keep_multihead_output=True) + model.eval() + head_mask = torch.ones(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device) + head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 0.0 # Mask all but the first head on the last layer + output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask) + + if isinstance(model, BertModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) + output = output.sum() + output.backward() + multihead_outputs = (model if isinstance(model, BertModel) else model.bert).get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertEqual( + len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[0][:, 0, :, :].nonzero()), + self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + self.parent.assertEqual( + len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()), + self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertEqual( + len(multihead_outputs[1].nonzero()), + multihead_outputs[1].numel()) + + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 1:, :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 0, :, :].nonzero()), + self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + + + def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification): + if model_class in [BertForSequenceClassification, + BertForTokenClassification]: + model = model_class(config=config, + num_labels=self.num_labels, + keep_multihead_output=True) + 
else: + model = model_class(config=config, keep_multihead_output=True) + model.eval() + bert_model = model if isinstance(model, BertModel) else model.bert + heads_to_prune = {0: list(range(1, self.num_attention_heads)), + -1: [0]} + bert_model.prune_heads(heads_to_prune) + output = model(input_ids, token_type_ids, input_mask) + + if isinstance(model, BertModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) + output = output.sum() + output.backward() + multihead_outputs = bert_model.get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size, 1, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size, self.num_attention_heads-1, + self.seq_length, self.hidden_size // self.num_attention_heads]) + + def test_default(self): self.run_tester(BertModelTest.BertModelTester(self)) @@ -300,6 +448,14 @@ def run_tester(self, tester): tester.check_bert_for_token_classification_output(output_result) tester.check_loss_output(output_result) + output_result = tester.create_bert_for_multiple_choice(*config_and_inputs) + tester.check_bert_for_multiple_choice(output_result) + tester.check_loss_output(output_result) + + tester.create_and_check_bert_for_attentions(*config_and_inputs) + tester.create_and_check_bert_for_headmasking(*config_and_inputs) + tester.create_and_check_bert_for_head_pruning(*config_and_inputs) + @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 848b9d1cf5c2f1..c6924bd4bcbe31 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -21,6 +21,11 @@ import torch from pytorch_pretrained_bert import BertAdam +from pytorch_pretrained_bert import OpenAIAdam +from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \ + WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule +import numpy as np + class OptimizationTest(unittest.TestCase): @@ -46,5 +51,41 @@ def test_adam(self): self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) +class ScheduleInitTest(unittest.TestCase): + def test_bert_sched_init(self): + m = torch.nn.Linear(50, 50) + optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none") + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule)) + # shouldn't fail + + def test_openai_sched_init(self): + m = torch.nn.Linear(50, 50) + optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, 
schedule="none") + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule)) + # shouldn't fail + + +class WarmupCosineWithRestartsTest(unittest.TestCase): + def test_it(self): + m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000., cycles=5) + x = np.arange(0, 1000) + y = [m.get_lr(xe) for xe in x] + y = np.asarray(y) + expected_zeros = y[[0, 200, 400, 600, 800]] + print(expected_zeros) + expected_ones = y[[50, 250, 450, 650, 850]] + print(expected_ones) + self.assertTrue(np.allclose(expected_ones, 1)) + self.assertTrue(np.allclose(expected_zeros, 0)) + + if __name__ == "__main__": unittest.main()