From b6c1cae67b139c6c27112147375b04ca1cfec3f8 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:32:04 +0100 Subject: [PATCH 001/144] branches, optim cosine fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 9a873e221b8857..eb24c3bd37a560 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -52,7 +52,7 @@ def get_lr(self, step, nowarn=False): def get_lr_(self, step): return 1. - # raise NotImplemented("use subclass") + # raise NotImplemented("use subclass") - class WarmupCosineSchedule(LRSchedule): From 262a9992d7ab348dfc35bda6c550fbbba8f5bc42 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 18:29:12 +0100 Subject: [PATCH 002/144] class weights --- pytorch_pretrained_bert/optimization.py | 21 ++++++++++++++++++--- tests/optimization_test.py | 15 ++++++++++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index eb24c3bd37a560..a39a18cea3ca3e 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -24,7 +24,8 @@ logger = logging.getLogger(__name__) -__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", "WarmupCosineWithRestartsSchedule"] +__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", + "WarmupMultiCosineSchedule", "WarmupCosineWithRestartsSchedule"] class LRSchedule(object): @@ -72,10 +73,11 @@ def get_lr_(self, progress): return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) -class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): +class WarmupMultiCosineSchedule(WarmupCosineSchedule): warn_t_total = True def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): - super(WarmupCosineWithRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + super(WarmupMultiCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) def get_lr_(self, progress): if self.t_total <= 0: @@ -88,6 +90,19 @@ def get_lr_(self, progress): return ret +class WarmupCosineWithRestartsSchedule(WarmupMultiCosineSchedule): + def get_lr_(self, progress): + if self.t_total <= 0.: + return 1. + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. 
+ math.cos(math.pi * progress)) + return ret + + class WarmupConstantSchedule(LRSchedule): warn_t_total = False def get_lr_(self, progress): diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 848b9d1cf5c2f1..3f9f8abbfe83b6 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -20,7 +20,9 @@ import torch -from pytorch_pretrained_bert import BertAdam +from pytorch_pretrained_bert import BertAdam, WarmupCosineWithRestartsSchedule +from matplotlib import pyplot as plt +import numpy as np class OptimizationTest(unittest.TestCase): @@ -46,5 +48,16 @@ def test_adam(self): self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) +class WarmupCosineWithRestartsTest(unittest.TestCase): + def test_it(self): + m = WarmupCosineWithRestartsSchedule(warmup=0.2, t_total=1, cycles=3) + x = np.arange(0, 1000) / 1000 + y = [m.get_lr_(xe) for xe in x] + plt.plot(y) + plt.show() + + + + if __name__ == "__main__": unittest.main() From 1758c8fc722bc2b8a80bca6786d891fbe46fb7a2 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 16:08:34 +0200 Subject: [PATCH 003/144] - updated docs for optimization --- pytorch_pretrained_bert/optimization.py | 68 +++++++++++++----- .../optimization_openai.py | 71 +++++-------------- 2 files changed, 70 insertions(+), 69 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index a39a18cea3ca3e..565d3bff4533ec 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -25,12 +25,18 @@ __all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", - "WarmupMultiCosineSchedule", "WarmupCosineWithRestartsSchedule"] + "WarmupCosineWithHardRestartsSchedule", "WarmupCosineWithWarmupRestartsSchedule", "SCHEDULES"] class LRSchedule(object): - warn_t_total = False + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ super(LRSchedule, self).__init__(**kw) self.warmup, self.t_total = warmup, t_total if t_total <= 0: @@ -40,6 +46,11 @@ def __init__(self, warmup=0.002, t_total=-1, **kw): self.warned_for_t_total_at_progress = -1 def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ progress = step / self.t_total ret = self.get_lr_(progress) # warning for exceeding t_total (only active with warmup_linear @@ -51,14 +62,27 @@ def get_lr(self, step, nowarn=False): # end warning return ret - def get_lr_(self, step): + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ return 1. # raise NotImplemented("use subclass") - class WarmupCosineSchedule(LRSchedule): + """ + Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts. 
+ """ warn_t_total = True def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. + :param kw: + """ super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) self.cycles = cycles @@ -73,10 +97,12 @@ def get_lr_(self, progress): return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) -class WarmupMultiCosineSchedule(WarmupCosineSchedule): - warn_t_total = True +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Cosine learning rate schedule with linear warmup and hard restarts (if cycles > 1). + """ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): - super(WarmupMultiCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) assert(cycles >= 1.) def get_lr_(self, progress): @@ -90,7 +116,16 @@ def get_lr_(self, progress): return ret -class WarmupCosineWithRestartsSchedule(WarmupMultiCosineSchedule): +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + Cosine learning rate schedule with linear warmups and linear warmup restarts. + The same warmup rate is used for warmup restarts as for initial warmup. + The total effective fraction of warmup steps over all cycles is warmup * cycles! + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup*cycles, t_total=t_total, cycles=cycles, **kw) + def get_lr_(self, progress): if self.t_total <= 0.: return 1. @@ -104,7 +139,9 @@ def get_lr_(self, progress): class WarmupConstantSchedule(LRSchedule): - warn_t_total = False + """ + Applies linear warmup. After warmup always returns 1.. + """ def get_lr_(self, progress): if progress < self.warmup: return progress / self.warmup @@ -112,6 +149,9 @@ def get_lr_(self, progress): class WarmupLinearSchedule(LRSchedule): + """ + Linear warmup. Linear decay after warmup. + """ warn_t_total = True def get_lr_(self, progress): if progress < self.warmup: @@ -145,8 +185,7 @@ class BertAdam(Optimizer): max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 """ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, init_weight_decay=0., - max_grad_norm=1.0): + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES: @@ -163,9 +202,10 @@ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_ schedule = schedule_type(warmup=warmup, t_total=t_total) else: if warmup != -1 or t_total != -1: - logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.") + logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. 
" + "Please specify custom warmup and t_total in LRSchedule object.") defaults = dict(lr=lr, schedule=schedule, - b1=b1, b2=b2, e=e, weight_decay=weight_decay, init_weight_decay=init_weight_decay, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -176,10 +216,8 @@ def get_lr(self): state = self.state[p] if len(state) == 0: return [0] - lr_scheduled = group['lr'] lr_scheduled *= group['schedule'].get_lr(state['step']) - lr.append(lr_scheduled) return lr @@ -235,8 +273,6 @@ def step(self, closure=None): if group['weight_decay'] > 0.0: update += group['weight_decay'] * p.data - # TODO: init weight decay - lr_scheduled = group['lr'] lr_scheduled *= group['schedule'].get_lr(state['step']) diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 99ac15e1089a51..5bfea476a68e82 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -20,35 +20,10 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ import logging +from .optimization import * logger = logging.getLogger(__name__) -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - x_ = (x - warmup) / (1 - warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * x_)) - -def warmup_constant(x, warmup=0.002): - """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. - Learning rate is 1. afterwards. """ - if x < warmup: - return x/warmup - return 1.0 - -def warmup_linear(x, warmup=0.002): - """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to OpenAIAdam) training step. - After `t_total`-th training step, learning rate is zero. """ - if x < warmup: - return x/warmup - return max((x-1.)/(warmup-1.), 0) - -SCHEDULES = { - 'warmup_cosine':warmup_cosine, - 'warmup_constant':warmup_constant, - 'warmup_linear':warmup_linear, -} - class OpenAIAdam(Optimizer): """Implements Open AI version of Adam algorithm with weight decay fix. 
@@ -58,17 +33,23 @@ def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_t vector_l2=False, max_grad_norm=-1, **kwargs): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: + if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) if not 0.0 <= b1 < 1.0: - raise ValueError("Invalid b1 parameter: {}".format(b1)) + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) if not 0.0 <= b2 < 1.0: - raise ValueError("Invalid b2 parameter: {}".format(b2)) + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) if not e >= 0.0: - raise ValueError("Invalid epsilon value: {}".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. " + "Please specify custom warmup and t_total in LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, max_grad_norm=max_grad_norm) super(OpenAIAdam, self).__init__(params, defaults) @@ -80,11 +61,8 @@ def get_lr(self): state = self.state[p] if len(state) == 0: return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) lr.append(lr_scheduled) return lr @@ -99,8 +77,6 @@ def step(self, closure=None): if closure is not None: loss = closure() - warned_for_t_total = False - for group in self.param_groups: for p in group['params']: if p.grad is None: @@ -136,19 +112,8 @@ def step(self, closure=None): bias_correction1 = 1 - beta1 ** state['step'] bias_correction2 = 1 - beta2 ** state['step'] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - progress = state['step']/group['t_total'] - lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) - # warning for exceeding t_total (only active with warmup_linear - if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: - logger.warning( - "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. 
" - "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) - warned_for_t_total = True - # end warning - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 From d164867d90c7b352445aa7d4028a6ba156a70a77 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 16:13:51 +0200 Subject: [PATCH 004/144] - updated docs for optimization --- tests/optimization_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 3f9f8abbfe83b6..8c28ad38adf6da 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -20,7 +20,8 @@ import torch -from pytorch_pretrained_bert import BertAdam, WarmupCosineWithRestartsSchedule +from pytorch_pretrained_bert import BertAdam +from pytorch_pretrained_bert.optimization import WarmupCosineWithWarmupRestartsSchedule from matplotlib import pyplot as plt import numpy as np @@ -50,7 +51,7 @@ def test_adam(self): class WarmupCosineWithRestartsTest(unittest.TestCase): def test_it(self): - m = WarmupCosineWithRestartsSchedule(warmup=0.2, t_total=1, cycles=3) + m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1, cycles=3) x = np.arange(0, 1000) / 1000 y = [m.get_lr_(xe) for xe in x] plt.plot(y) From b64cc63a772dc981040ad8747efbf319ebb4945a Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 16:42:40 +0200 Subject: [PATCH 005/144] optimization schedule test update --- tests/optimization_test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 8c28ad38adf6da..218da7581f816b 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -51,11 +51,18 @@ def test_adam(self): class WarmupCosineWithRestartsTest(unittest.TestCase): def test_it(self): - m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1, cycles=3) + m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1, cycles=5) x = np.arange(0, 1000) / 1000 y = [m.get_lr_(xe) for xe in x] plt.plot(y) - plt.show() + plt.show(block=False) + y = np.asarray(y) + expected_zeros = y[[0, 200, 400, 600, 800]] + print(expected_zeros) + expected_ones = y[[50, 250, 450, 650, 850]] + print(expected_ones) + self.assertTrue(np.allclose(expected_ones, 1)) + self.assertTrue(np.allclose(expected_zeros, 0)) From 91a073f80458ef7ca65f8b1e4af35b8061155794 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 17:10:08 +0200 Subject: [PATCH 006/144] schedule fix --- pytorch_pretrained_bert/optimization.py | 17 +++++++---------- tests/optimization_test.py | 6 +++--- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 565d3bff4533ec..8c8dc3b8624df9 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -38,11 +38,12 @@ def __init__(self, warmup=0.002, t_total=-1, **kw): :param kw: """ super(LRSchedule, self).__init__(**kw) - self.warmup, self.t_total = warmup, t_total if t_total <= 0: logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) if not 0.0 <= warmup < 1.0 and not warmup == -1: raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0) + self.warmup, 
self.t_total = warmup, t_total self.warned_for_t_total_at_progress = -1 def get_lr(self, step, nowarn=False): @@ -51,6 +52,8 @@ def get_lr(self, step, nowarn=False): :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps :return: learning rate multiplier for current update """ + if self.t_total < 0: + return 1. progress = step / self.t_total ret = self.get_lr_(progress) # warning for exceeding t_total (only active with warmup_linear @@ -87,9 +90,6 @@ def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): self.cycles = cycles def get_lr_(self, progress): - """ get learning rate multiplier """ - if self.t_total <= 0: - return 1. if progress < self.warmup: return progress / self.warmup else: @@ -106,8 +106,6 @@ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): assert(cycles >= 1.) def get_lr_(self, progress): - if self.t_total <= 0: - return 1. if progress < self.warmup: return progress / self.warmup else: @@ -124,11 +122,10 @@ class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedul """ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): assert(warmup * cycles < 1.) - super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup*cycles, t_total=t_total, cycles=cycles, **kw) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) def get_lr_(self, progress): - if self.t_total <= 0.: - return 1. progress = progress * self.cycles % 1. if progress < self.warmup: return progress / self.warmup @@ -174,7 +171,7 @@ class BertAdam(Optimizer): lr: learning rate warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 t_total: total number of training steps for the learning - rate schedule, -1 means constant learning rate. Default: -1 + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 schedule: schedule to use for the warmup (see above). Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object. 
Default: 'warmup_linear' diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 218da7581f816b..0eaae16d310911 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -51,9 +51,9 @@ def test_adam(self): class WarmupCosineWithRestartsTest(unittest.TestCase): def test_it(self): - m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1, cycles=5) - x = np.arange(0, 1000) / 1000 - y = [m.get_lr_(xe) for xe in x] + m = WarmupCosineWithWarmupRestartsSchedule(warmup=-1, t_total=500, cycles=5) + x = np.arange(0, 1000) + y = [m.get_lr(xe) for xe in x] plt.plot(y) plt.show(block=False) y = np.asarray(y) From 23bd2eebf53ddd92eac4a1d4589b773028556246 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 17:10:34 +0200 Subject: [PATCH 007/144] schedule fix --- tests/optimization_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 0eaae16d310911..f3147c8998ca81 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -51,7 +51,7 @@ def test_adam(self): class WarmupCosineWithRestartsTest(unittest.TestCase): def test_it(self): - m = WarmupCosineWithWarmupRestartsSchedule(warmup=-1, t_total=500, cycles=5) + m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000, cycles=5) x = np.arange(0, 1000) y = [m.get_lr(xe) for xe in x] plt.plot(y) From 5fed5bb3d687c9eafe04ec5e22f937c5355e53ce Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 17:20:29 +0200 Subject: [PATCH 008/144] schedule fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 8c8dc3b8624df9..92cf2b05eb84a5 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -38,7 +38,7 @@ def __init__(self, warmup=0.002, t_total=-1, **kw): :param kw: """ super(LRSchedule, self).__init__(**kw) - if t_total <= 0: + if t_total < 0: logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) if not 0.0 <= warmup < 1.0 and not warmup == -1: raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) From 1b4ce76c3885357bbaa975b30ad32d4e1f47f032 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 17:40:12 +0200 Subject: [PATCH 009/144] schedule fix --- tests/optimization_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index f3147c8998ca81..80216cc8d4255a 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -22,7 +22,7 @@ from pytorch_pretrained_bert import BertAdam from pytorch_pretrained_bert.optimization import WarmupCosineWithWarmupRestartsSchedule -from matplotlib import pyplot as plt +#from matplotlib import pyplot as plt import numpy as np class OptimizationTest(unittest.TestCase): @@ -54,8 +54,8 @@ def test_it(self): m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000, cycles=5) x = np.arange(0, 1000) y = [m.get_lr(xe) for xe in x] - plt.plot(y) - plt.show(block=False) + # plt.plot(y) + # plt.show(block=False) y = np.asarray(y) expected_zeros = y[[0, 200, 400, 600, 800]] print(expected_zeros) From 20686b78fc786bf662b4ed1bd743823aeef57fd8 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 18:13:52 +0200 Subject: [PATCH 010/144] schedule fix --- 
pytorch_pretrained_bert/optimization.py | 6 +++--- tests/optimization_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 92cf2b05eb84a5..df5b50b51df42b 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -42,8 +42,8 @@ def __init__(self, warmup=0.002, t_total=-1, **kw): logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) if not 0.0 <= warmup < 1.0 and not warmup == -1: raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) - warmup = max(warmup, 0) - self.warmup, self.t_total = warmup, t_total + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) self.warned_for_t_total_at_progress = -1 def get_lr(self, step, nowarn=False): @@ -153,7 +153,7 @@ class WarmupLinearSchedule(LRSchedule): def get_lr_(self, progress): if progress < self.warmup: return progress / self.warmup - return max((progress - 1.) / (self.warmup - 1.), 0) + return max((progress - 1.) / (self.warmup - 1.), 0.) SCHEDULES = { diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 80216cc8d4255a..e74f4bba6ca6ad 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -51,7 +51,7 @@ def test_adam(self): class WarmupCosineWithRestartsTest(unittest.TestCase): def test_it(self): - m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000, cycles=5) + m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000., cycles=5) x = np.arange(0, 1000) y = [m.get_lr(xe) for xe in x] # plt.plot(y) From fc7693adc33484942a92ebba63117bf166883c0e Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 3 Apr 2019 18:16:47 +0200 Subject: [PATCH 011/144] schedule fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index df5b50b51df42b..ca973015a676e0 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -54,7 +54,7 @@ def get_lr(self, step, nowarn=False): """ if self.t_total < 0: return 1. - progress = step / self.t_total + progress = float(step) / self.t_total ret = self.get_lr_(progress) # warning for exceeding t_total (only active with warmup_linear if not nowarn and self.warn_t_total and progress > 1. 
and progress > self.warned_for_t_total_at_progress: From 38ba7b439bcdadc73db03bfd7504fae44f74ab93 Mon Sep 17 00:00:00 2001 From: David Pollack Date: Mon, 15 Apr 2019 10:38:01 +0200 Subject: [PATCH 012/144] fixed BertForMultipleChoice model init and forward pass --- pytorch_pretrained_bert/modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 2736e34d7f6a90..374a57c34f9339 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -1034,13 +1034,13 @@ def __init__(self, config, num_choices): self.num_choices = num_choices self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) + self.classifier = nn.Linear(config.hidden_size, num_choices) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) From ae4c9fee734e587a1873c88a44909a93bb478c36 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Sat, 13 Apr 2019 15:00:48 -0700 Subject: [PATCH 013/144] add hubconf --- hubconf.py | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 hubconf.py diff --git a/hubconf.py b/hubconf.py new file mode 100644 index 00000000000000..b2e44af5d914ca --- /dev/null +++ b/hubconf.py @@ -0,0 +1,187 @@ +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import (BertForNextSentencePrediction, + BertForMaskedLM, + BertForMultipleChoice, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + ) + +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] + + +def bertTokenizer(*args, **kwargs): + """ + Instantiate a BertTokenizer from a pre-trained/customized vocab file + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * bert-base-uncased + * bert-large-uncased + * bert-base-cased + * bert-large-cased + * bert-base-multilingual-uncased + * bert-base-multilingual-cased + * bert-base-chinese + Keyword args: + cache_dir: an optional path to a specific directory to download and cache + the pre-trained model weights. + Default: None + do_lower_case: Whether to lower case the input. + Only has an effect when do_wordpiece_only=False + Default: True + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + Default: True + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + never_split: List of tokens which will never be split during tokenization. 
+ Only has an effect when do_wordpiece_only=False + Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] + + Example: + >>> sentence = 'Hello, World!' + >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'BertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> toks = tokenizer.tokenize(sentence) + ['Hello', '##,', 'World', '##!'] + >>> ids = tokenizer.convert_tokens_to_ids(toks) + [8667, 28136, 1291, 28125] + """ + tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +def bertForNextSentencePrediction(*args, **kwargs): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) + return model + + +def bertForPreTraining(*args, **kwargs): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + + """ + model = BertForPreTraining.from_pretrained(*args, **kwargs) + return model + + +def bertForMaskedLM(*args, **kwargs): + """ + BertForMaskedLM includes the BertModel Transformer followed by the (possibly) + pre-trained masked language modeling head. 
+ Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + model = BertForMaskedLM.from_pretrained(*args, **kwargs) + return model + + +#def bertForSequenceClassification(*args, **kwargs): +# model = BertForSequenceClassification.from_pretrained(*args, **kwargs) +# return model + + +#def bertForMultipleChoice(*args, **kwargs): +# model = BertForMultipleChoice.from_pretrained(*args, **kwargs) +# return model + + +def bertForQuestionAnswering(*args, **kwargs): + """ + BertForQuestionAnswering is a fine-tuning model that includes BertModel with + a token-level classifiers on top of the full sequence of last hidden states. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
+ state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) + return model + + + From bfd6f6b257f2d4857f65bbcd6cb3487123fe848f Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 17 Apr 2019 13:39:46 -0700 Subject: [PATCH 014/144] fix from_pretrained positional args --- hubconf.py | 214 ++++++++++++++-------------- pytorch_pretrained_bert/modeling.py | 10 +- 2 files changed, 115 insertions(+), 109 deletions(-) diff --git a/hubconf.py b/hubconf.py index b2e44af5d914ca..755e181d201f3f 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,14 +1,55 @@ from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.modeling import (BertForNextSentencePrediction, - BertForMaskedLM, - BertForMultipleChoice, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - ) +from pytorch_pretrained_bert.modeling import ( + BertModel, + BertForNextSentencePrediction, + BertForMaskedLM, + BertForMultipleChoice, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] +# A lot of models share the same param doc. Use a decorator +# to save typing +bert_docstring = """ + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining + instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow + checkpoint + cache_dir: an optional path to a folder in which the pre-trained models + will be cached. + state_dict: an optional state dictionnary + (collections.OrderedDict object) to use instead of Google + pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + def bertTokenizer(*args, **kwargs): """ @@ -43,7 +84,7 @@ def bertTokenizer(*args, **kwargs): Example: >>> sentence = 'Hello, World!' 
- >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'BertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) >>> toks = tokenizer.tokenize(sentence) ['Hello', '##,', 'World', '##!'] >>> ids = tokenizer.convert_tokens_to_ids(toks) @@ -53,135 +94,94 @@ def bertTokenizer(*args, **kwargs): return tokenizer +@_append_from_pretrained_docstring(bert_docstring) +def bertModel(*args, **kwargs): + """ + BertModel is the basic BERT Transformer model with a layer of summed token, + position and sequence embeddings followed by a series of identical + self-attention blocks (12 for BERT-base, 24 for BERT-large). + """ + model = BertModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) def bertForNextSentencePrediction(*args, **kwargs): - """BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence classification head. - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + """ + BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence + classification head. """ model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) return model +@_append_from_pretrained_docstring(bert_docstring) def bertForPreTraining(*args, **kwargs): - """BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads: + """ + BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads - the masked language modeling head, and - the next sentence classification head. - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . 
`model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) - """ model = BertForPreTraining.from_pretrained(*args, **kwargs) return model +@_append_from_pretrained_docstring(bert_docstring) def bertForMaskedLM(*args, **kwargs): """ - BertForMaskedLM includes the BertModel Transformer followed by the (possibly) - pre-trained masked language modeling head. - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + BertForMaskedLM includes the BertModel Transformer followed by the + (possibly) pre-trained masked language modeling head. """ model = BertForMaskedLM.from_pretrained(*args, **kwargs) return model -#def bertForSequenceClassification(*args, **kwargs): -# model = BertForSequenceClassification.from_pretrained(*args, **kwargs) -# return model +@_append_from_pretrained_docstring(bert_docstring) +def bertForSequenceClassification(*args, **kwargs): + """ + BertForSequenceClassification is a fine-tuning model that includes + BertModel and a sequence-level (sequence or pair of sequences) classifier + on top of the BertModel. + + The sequence-level classifier is a linear layer that takes as input the + last hidden state of the first character in the input sequence + (see Figures 3a and 3b in the BERT paper). + """ + model = BertForSequenceClassification.from_pretrained(*args, **kwargs) + return model -#def bertForMultipleChoice(*args, **kwargs): -# model = BertForMultipleChoice.from_pretrained(*args, **kwargs) -# return model +@_append_from_pretrained_docstring(bert_docstring) +def bertForMultipleChoice(*args, **kwargs): + """ + BertForMultipleChoice is a fine-tuning model that includes BertModel and a + linear layer on top of the BertModel. + """ + model = BertForMultipleChoice.from_pretrained(*args, **kwargs) + return model +@_append_from_pretrained_docstring(bert_docstring) def bertForQuestionAnswering(*args, **kwargs): """ - BertForQuestionAnswering is a fine-tuning model that includes BertModel with - a token-level classifiers on top of the full sequence of last hidden states. 
- Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + BertForQuestionAnswering is a fine-tuning model that includes BertModel + with a token-level classifiers on top of the full sequence of last hidden + states. """ model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) return model +@_append_from_pretrained_docstring(bert_docstring) +def bertForTokenClassification(*args, **kwargs): + """ + BertForTokenClassification is a fine-tuning model that includes BertModel + and a token-level classifier on top of the BertModel. + The token-level classifier is a linear layer that takes as input the last + hidden state of the sequence. + """ + model = BertForTokenClassification.from_pretrained(*args, **kwargs) + return model diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 2736e34d7f6a90..9c9b031970b29f 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -519,8 +519,7 @@ def init_bert_weights(self, module): module.bias.data.zero_() @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, - from_tf=False, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. 
@@ -547,6 +546,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] else: From bb7557d3ab96f139997bfaa70ff2b4a6c18994e0 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sun, 21 Apr 2019 13:48:33 +0200 Subject: [PATCH 015/144] - removed __all__ in optimization - removed unused plotting code - using ABC for LRSchedule - added some schedule object init tests --- pytorch_pretrained_bert/optimization.py | 30 ++++++++++--------- .../optimization_openai.py | 7 +++-- tests/optimization_test.py | 29 +++++++++++++++--- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index ca973015a676e0..d2d4f7f5e5a58b 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -20,15 +20,12 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ import logging +from abc import ABC, abstractmethod logger = logging.getLogger(__name__) -__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", - "WarmupCosineWithHardRestartsSchedule", "WarmupCosineWithWarmupRestartsSchedule", "SCHEDULES"] - - -class LRSchedule(object): +class _LRSchedule(ABC): """ Parent of all LRSchedules here. """ warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense def __init__(self, warmup=0.002, t_total=-1, **kw): @@ -37,7 +34,7 @@ def __init__(self, warmup=0.002, t_total=-1, **kw): :param t_total: how many training steps (updates) are planned :param kw: """ - super(LRSchedule, self).__init__(**kw) + super(_LRSchedule, self).__init__(**kw) if t_total < 0: logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) if not 0.0 <= warmup < 1.0 and not warmup == -1: @@ -65,16 +62,21 @@ def get_lr(self, step, nowarn=False): # end warning return ret + @abstractmethod def get_lr_(self, progress): """ :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress :return: learning rate multiplier for current update """ return 1. - # raise NotImplemented("use subclass") - -class WarmupCosineSchedule(LRSchedule): +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): """ Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts. """ @@ -135,7 +137,7 @@ def get_lr_(self, progress): return ret -class WarmupConstantSchedule(LRSchedule): +class WarmupConstantSchedule(_LRSchedule): """ Applies linear warmup. After warmup always returns 1.. """ @@ -145,7 +147,7 @@ def get_lr_(self, progress): return 1. -class WarmupLinearSchedule(LRSchedule): +class WarmupLinearSchedule(_LRSchedule): """ Linear warmup. Linear decay after warmup. 
""" @@ -157,8 +159,8 @@ def get_lr_(self, progress): SCHEDULES = { - None: LRSchedule, - "none": LRSchedule, + None: ConstantLR, + "none": ConstantLR, "warmup_cosine": WarmupCosineSchedule, "warmup_constant": WarmupConstantSchedule, "warmup_linear": WarmupLinearSchedule @@ -185,7 +187,7 @@ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_ b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES: + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) @@ -194,7 +196,7 @@ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_ if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) # initialize schedule object - if not isinstance(schedule, LRSchedule): + if not isinstance(schedule, _LRSchedule): schedule_type = SCHEDULES[schedule] schedule = schedule_type(warmup=warmup, t_total=t_total) else: diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 5bfea476a68e82..0cf0494e20634d 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -20,7 +20,8 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ import logging -from .optimization import * +from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \ + WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule logger = logging.getLogger(__name__) @@ -33,7 +34,7 @@ def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_t vector_l2=False, max_grad_norm=-1, **kwargs): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES: + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) @@ -42,7 +43,7 @@ def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_t if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) # initialize schedule object - if not isinstance(schedule, LRSchedule): + if not isinstance(schedule, _LRSchedule): schedule_type = SCHEDULES[schedule] schedule = schedule_type(warmup=warmup, t_total=t_total) else: diff --git a/tests/optimization_test.py b/tests/optimization_test.py index e74f4bba6ca6ad..f52aeb506b3a73 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -21,10 +21,11 @@ import torch from pytorch_pretrained_bert import BertAdam -from pytorch_pretrained_bert.optimization import WarmupCosineWithWarmupRestartsSchedule -#from matplotlib import pyplot as plt +from pytorch_pretrained_bert import OpenAIAdam +from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupCosineWithWarmupRestartsSchedule import numpy as np + class OptimizationTest(unittest.TestCase): def 
assertListAlmostEqual(self, list1, list2, tol): @@ -49,13 +50,33 @@ def test_adam(self): self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) +class ScheduleInitTest(unittest.TestCase): + def test_bert_sched_init(self): + m = torch.nn.Linear(50, 50) + optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none") + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule)) + # shouldn't fail + + def test_openai_sched_init(self): + m = torch.nn.Linear(50, 50) + optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none") + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR)) + optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000) + self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule)) + # shouldn't fail + + class WarmupCosineWithRestartsTest(unittest.TestCase): def test_it(self): m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000., cycles=5) x = np.arange(0, 1000) y = [m.get_lr(xe) for xe in x] - # plt.plot(y) - # plt.show(block=False) y = np.asarray(y) expected_zeros = y[[0, 200, 400, 600, 800]] print(expected_zeros) From 69850b40114095aaa093adaf4ef2181cfe4176ed Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sun, 21 Apr 2019 14:02:38 +0200 Subject: [PATCH 016/144] python 2 compat --- pytorch_pretrained_bert/optimization.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index d2d4f7f5e5a58b..7e88b1b61c6ef7 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -20,11 +20,18 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ import logging -from abc import ABC, abstractmethod +import abc +import sys logger = logging.getLogger(__name__) +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + class _LRSchedule(ABC): """ Parent of all LRSchedules here. """ warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense @@ -62,7 +69,7 @@ def get_lr(self, step, nowarn=False): # end warning return ret - @abstractmethod + @abc.abstractmethod def get_lr_(self, progress): """ :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress From 14b1f719f4c9860a3df8663e7e4ecac25182034a Mon Sep 17 00:00:00 2001 From: Sangwhan Moon Date: Mon, 22 Apr 2019 02:20:22 +0900 Subject: [PATCH 017/144] Fix indentation weirdness in GPT-2 example. 
--- examples/run_gpt2.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 9e9e2b255bafef..8f8208bbcd54cd 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -107,25 +107,23 @@ def run_model(): print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) - if args.unconditional: - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=None, - start_token=enc.encoder['<|endoftext|>'], - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:,1:].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) - if args.unconditional: - break + else: + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=None, + start_token=enc.encoder['<|endoftext|>'], + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:,1:].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) if __name__ == '__main__': run_model() From d94c6b01445531a649f61b2faebfa767d8b1915f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 23 Apr 2019 11:17:06 +0200 Subject: [PATCH 018/144] fix training schedules in examples to match new API --- examples/lm_finetuning/finetune_on_pregenerated.py | 9 +++++---- examples/lm_finetuning/simple_lm_finetuning.py | 7 +++++-- examples/run_classifier.py | 7 +++++-- examples/run_squad.py | 7 +++++-- examples/run_swag.py | 7 +++++-- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 5c3051f5009718..1638b02a6fa402 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -14,7 +14,7 @@ from pytorch_pretrained_bert.modeling import BertForPreTraining from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") @@ -268,7 +268,8 @@ def main(): optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, @@ -314,8 +315,8 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git 
a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 0f8547333082fa..6511ead5902738 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -31,7 +31,7 @@ from pytorch_pretrained_bert.modeling import BertForPreTraining from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', @@ -556,6 +556,8 @@ def main(): optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, @@ -601,7 +603,8 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index b90ac494e4fe37..bdcad6f0eb325c 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -38,7 +38,7 @@ from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule logger = logging.getLogger(__name__) @@ -784,6 +784,8 @@ def main(): optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, @@ -852,7 +854,8 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_squad.py b/examples/run_squad.py index 410fd8529880ed..c3fdf03774fcd9 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -36,7 +36,7 @@ from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule from 
pytorch_pretrained_bert.tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize) @@ -949,6 +949,8 @@ def main(): optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, @@ -1013,7 +1015,8 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_swag.py b/examples/run_swag.py index a6cfdbe311d578..bd724c48adc367 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -34,7 +34,7 @@ from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig -from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule from pytorch_pretrained_bert.tokenization import BertTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -411,6 +411,8 @@ def main(): optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, @@ -464,7 +466,8 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() From ed8fad73903c670d41a9dff173bc44995cda2d2f Mon Sep 17 00:00:00 2001 From: Mathieu Prouveur Date: Wed, 24 Apr 2019 14:07:00 +0200 Subject: [PATCH 019/144] Update example files so that tr_loss is not affected by args.gradient_accumulation_step --- examples/run_classifier.py | 2 +- examples/run_swag.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index b90ac494e4fe37..e14788cacb05de 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -845,7 +845,7 @@ def main(): else: loss.backward() - tr_loss += loss.item() + tr_loss += loss.item() * args.gradient_accumulation_steps nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: diff --git a/examples/run_swag.py b/examples/run_swag.py index a6cfdbe311d578..5a65d7a7487516 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -452,7 +452,7 @@ def main(): loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: 
loss = loss / args.gradient_accumulation_steps - tr_loss += loss.item() + tr_loss += loss.item() * args.gradient_accumulation_steps nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 From 80f995a141e496a5ff9d7996057e835f24371cd4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 24 Apr 2019 16:51:54 +0200 Subject: [PATCH 020/144] revert BertForMultipleChoice linear classifier --- pytorch_pretrained_bert/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 374a57c34f9339..6b71f007c321bd 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -1034,7 +1034,7 @@ def __init__(self, config, num_choices): self.num_choices = num_choices self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_choices) + self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): From 704037ad511ea85afd3de3609434fc49cbf2f1d2 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Thu, 25 Apr 2019 15:59:39 +0200 Subject: [PATCH 021/144] - updated docs for new LR API - added some images for illustration - updated comments in optimization --- README.md | 31 +++++++++++++++++- docs/imgs/warmup_constant_schedule.png | Bin 0 -> 9978 bytes .../warmup_cosine_hard_restarts_schedule.png | Bin 0 -> 22350 bytes docs/imgs/warmup_cosine_schedule.png | Bin 0 -> 17335 bytes .../warmup_cosine_warm_restarts_schedule.png | Bin 0 -> 22315 bytes docs/imgs/warmup_linear_schedule.png | Bin 0 -> 16775 bytes pytorch_pretrained_bert/optimization.py | 29 +++++++++------- .../optimization_openai.py | 4 +-- tests/optimization_test.py | 15 ++++++++- 9 files changed, 64 insertions(+), 15 deletions(-) create mode 100644 docs/imgs/warmup_constant_schedule.png create mode 100644 docs/imgs/warmup_cosine_hard_restarts_schedule.png create mode 100644 docs/imgs/warmup_cosine_schedule.png create mode 100644 docs/imgs/warmup_cosine_warm_restarts_schedule.png create mode 100644 docs/imgs/warmup_linear_schedule.png diff --git a/README.md b/README.md index fde35d23ea9d72..b348fde28c3606 100644 --- a/README.md +++ b/README.md @@ -984,7 +984,10 @@ The optimizer accepts the following arguments: - `warmup` : portion of `t_total` for the warmup, `-1` means no warmup. Default : `-1` - `t_total` : total number of training steps for the learning rate schedule, `-1` means constant learning rate. Default : `-1` -- `schedule` : schedule to use for the warmup (see above). Default : `'warmup_linear'` +- `schedule` : schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` - `b1` : Adams b1. Default : `0.9` - `b2` : Adams b2. Default : `0.999` - `e` : Adams epsilon. Default : `1e-6` @@ -998,6 +1001,32 @@ The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as i `OpenAIGPTAdam` accepts the same arguments as `BertAdam`. +#### Learning Rate Schedules +The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`. +All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction. 
+When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`, +the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used. +An overview of the implemented schedules: +- `ConstantLR`: always returns learning rate 1. +- `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + ![](docs/imgs/warmup_constant_schedule.png) +- `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + ![](docs/imgs/warmup_linear_schedule.png) +- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + ![](docs/imgs/warmup_cosine_schedule.png) +- `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying learning rate (with hard restarts). + ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png) +- `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + Note that the total number of all warmup steps over all cycles together is equal to `warmup` * `cycles` + ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png) + ## Examples | Sub-section | Description | diff --git a/docs/imgs/warmup_constant_schedule.png b/docs/imgs/warmup_constant_schedule.png new file mode 100644 index 0000000000000000000000000000000000000000..e2448e9f2c7999497d3e2d252a5dcb22b0ac7da5 GIT binary patch literal 9978 zcmd6NcRZE<`@gpo4Xcujj50z<_SR4#R91Gf_dfP&Da6}Kg^X})*_)!w?7fR@$H+bh z=Xc$Be?Fhb_w)V!|LakX?)$vPeZ8*hc|EV^b-%u^bcgf=?Fj+`0#doVw;mD@9C;4E zeo*@ur_eAv$A!vGJnE^GInq@x3yv7=ef#r?Fy5blarkoFYmvxR$}8e4o<%^w zL?CzTx~gjoX2ioub+Qt_(q?pn&9{-{78~^q5_*aFG}Y6kw1gJV&YGv8g&wlHSA;r} z$v6B-JzE~I*6liLUz+*!ReZe)S3hI_pu7i(8leEC02}RJRDxG2Pa6_8KG)7}TMB9D z=F>TLH;e8Z^OxXJ2xJaRnsY^^oe_vHK!b&K*XA;@^bj$fqb#=GD*Vl7h2@=g*yv20N4*AZLCy71A z$QN|8^YR|ve0eIjEGYgq+FeO&q7quT|8vJpf`OHd&Aoa)5l2C?TVULL zPI7N+f|G;eftp&Ro(D#-<)b970C)D>IZj?)WpS59RZq`~#quSo$=|=X)-;Q(-ZS{@ z*sH0j&2}pA_Jk`-rLCy+q4z67m5wYf zijgAEwVX#|7}O(6L>t|Ls-vmFEmiC1RPU9cmPRqw&5ey6a3N=W{E1xW*2;t+YT*&L ze%0`h>Do-k2T6~jKdY;|>E9<-8fKl+6B7JrZoq^@N2?|_=y_-`Y~R0sZ~F0L%{*Gl zt4uvhEBWo)b2Nsk4<9BHGq`?1E1Q_41Ox;qxeceJun7tZy6JmJe9Y#i_gxeS8_2M3h~ z(Pd@Wi29bcw!WL>lHJ^bwLZwJUI|KbdEy@8XfHx)k*{ zC8^a6C8&s+8doe8Sc&kAGFa#gyOQel>wd~Zb#Le*-@Z+e^1@!ec=1MBT3Rgj?Afzs zok?<%nw+gSR4k*_@s&5i1Zow1`}++|;v;tEiBW#P6(XRk|0#IcL2%N!^4& zxoBljRMs;pkk2S6hwF5Z?Joau&1n7$7Hd5j6@;A~aT)6=Fz*5b4cEIga~IEir`A^m z&u+u|H8wU5el~b^jz=fe^rPGDm6a9uxis_|Fa8hjsFbFrCLCB-XR-pgc5+0-S?m7K z*X88oBwd~?YkQP;1aQa6&AE=hy)_Rn3qS29< zah7VJel85Y@$~eVy$4xT5TUnw(Pu-{a>{YJ=hxvn(NRL?pZ+eJ462P)Hv>NV&lWY- zX7Uijote`_H@=Xwn-J$PM#q2c2tB891g<)f5;EHz^Qf}wP8ku*8%PpIpDHe+ha2(D 
[... remainder of the base85 payload for docs/imgs/warmup_constant_schedule.png omitted; the binary patch only embeds the warmup-constant schedule plot referenced from the README hunk above ...]

literal 0
HcmV?d00001

diff --git a/docs/imgs/warmup_cosine_hard_restarts_schedule.png b/docs/imgs/warmup_cosine_hard_restarts_schedule.png
new file mode 100644
index 0000000000000000000000000000000000000000..be73605b9c080cdc7cea8b4ff7e29de90db2d9eb
GIT binary patch
literal 22350

[... base85 payload for docs/imgs/warmup_cosine_hard_restarts_schedule.png omitted ...]
z3D`gFd-IMj$NF#QK!eZZRz3d2u5M%#zk#L=h$LechBvy}jcvS8Hf8*yh1^W&(vAke z7HZSNUQgzI5(z4TYoW^7DpySxh#^cva|pCs4AE35a?LqO(o-6&@LlZxRxm<}F*WAx zX-fhsQ~WTGz+S{vP(CGkD1dq_O(5gvdPw*L%Ju{5Bw3ri@hO1*=IrS?4N$vAT+Yq@ znQzhoZ8`*T=e>1?q^E&c4+v0l|IBHSXkA!UroOK_zy04{k;KT)_TREiy+QrYB8g3wPxI|C4(O)>_#5i?7*XUOJ^hp% zH-9hSN@OVp=R>^?uzJ}LD_Dn!qcI?KOOQnhe10sIgmy!kcO-^jigb?A1*p2T$01!H zpb7J6jCvy*#Tj%@mO7TfXaGo6qE0ed&a;QM4oP9V}V)Cnx;niK0JhFo>r6n^4{5@7STXGux@?G`mR9cu-ihTocwk%gszxQbY{G%@WsZ zpk}~zwuWuz>k%Yzb+!iFZC9dHSxT047yGvX2+I=khc4lt}fe!`=kNs+hyrul{_(9uGA_xSHy>~gbu8ydzFlfxm zg;k!V+g`XDY6idk%FTT5{va-{lGDl!ym#SC|Aj}S!VBvV+GMlN7M7-P+tF4CwCt}@ zrO-Fd7LaE0kJ-R}e`s~=x{**1HLC5CUa1HkK!$0@$uS|9wg_db6U&Q3TQY#wM% z0|Au7tswj@Xm{an5!+Y2f{5JMd7%9U&~B7zV!nZsECuEPe1R9I^+OZoaMLE7$dan6#U;z!)O#y|Q(fskYQdIobd94xs!I9irch6F2f| zxuXdU|F=Gn!^mr_ZuQ=}E2FC)e2|clpfa3(YP}95c+f7uCIoih6o7>V!1X-do>#Yn zR;O66if>>KYm8NlYb&iSWq3z`T^TfENgZL$l~vLS0Vhz2+rzUdB^9+Fcp*)_NlK7HB8+pva^p>c zzPw7pLs5vw+#(#J;2T#g3aTkt)f#d5Kyw4pj<4e0*h7mLHx}HQ1K+ta*G5P(g0yNm zdcRyd9y6;)0@&Zv=RH=IcYiAC+~CAYw3325bAbsQIa=%0rdXCeBh!^!ctSm_?XG$K8@yyTzr+5;N~ z2_&H)T9NmOgJ^{e_?iKjy7lSeHuy_hQqbcP9rn)cY6&_{g?%OBXrsA&UNHA1{Dqxa z&HURO&etBC7k8I@JZwd*CCzksXBY2ouWYBU1h(T9Yn)v$b@{60;4~o32G2J_J~`^m z3aZ*YGf^?FkFhIhRC3o0EPCmXCEoy?(kB#sZB?xwLVR&T=gZ;U_0piW7V4Y&^ zjV08h!+p97ke&M5;Bq4;7ym%m*)fX5%)o(U!_?#Z-t`hLBIVoLb|~ZGfPy;ZY@A=` z%TDN|+U(D4v7 zpoM0bHHb8NT8k;^b#22op}l5joZ)u8I#IOV^Zj`Poi+;L+u#ch_kuUfe=juhFy{W$ z1(eCSei}Skw5go<4T5@+{4l{8(S%ed#R%ayk$r#3c^mz^LSa{Hl^4_%muu-_F1yM( z(^5%H!)TjN%lB3t;fparemv3f_#r+Kt#v$;iAGMr-n$yDsZNCDPmZ`|rPXMeJ1WqP ztlWFQJ4fDEXN!`*-JBipvm;%fv&_zJL$hEWX1FIb3jMUj;=wFCDAZ;) zPW{m?f$wfcJvZ=5kT^CgXWX}OB_Mc7%eQ;@P+12yZjp@;^|}jq0p4O`!%ZH6d+W|(uQ_bmiEpe@B()f^i?1RW|JaGh zeLU&d?Ne@VS7L$gz=k;7x&7x&WVGie)7Ky5qYN%oc25r8gce9M^9`53f})K=`X<&} z>u0$a8!UXAxc`}>$sOjIjSe+ZA9hRIw6hjqtZyjoJ(J8TKPQdqo!OI-H(uz`AFp)~ z0Ww9k!J0rqLIR>o^E}^f78MuoSM%}n69HdMa9CImX!L)Jg%xCy>4o|F^=pAKKwCtV zYcy;;-K-6u@IOM;sYeVDdLYr!(Lp<@-W+Y7T5Dx8hEz`II-B|j;h*xe!|RYq546JL z2IKqMjL#Yj`CxrGKSo{Sj5~`qVG&30y)>uh(3H+<`QPstl7id^Z`4OB*L2 z`VJig|BQ7AZ#LQbh>?9rYcCV+-X#kb*RReLacA?49$` z0`ZU6u&@T86G-gKmt$ojJUm3dF5k-bdkNE2(CojwvQllc#)tYAG~2|)#8erN;X?UN zn)*xRcLS(g<#~6meKn%X@Tpi)pmM%vcJ_)es>Vh=A|3M1glp+Xl8@-MM6xJ*rzWFw zifqfB8K=~CMR}Nk*E;--k(Y5K_m!(jJX6rIFPGh0&!$#SORJw{IXzd>qAw&^NcErF zBNypwlWiY0dHIIhg1-&xB)Uh33Mb+3MHVskFix%!pqOeBCrUK)I$zhvHp;GrvO4r{ z^qKN`1S~cLDwBYN)o=28*3^_0{&*Du$M|s;Z~Zf2(%xxgbcTKEJ=0W5rrj|z3YWbpX(R1N;Mhcfj#2~!Wi0aaTqu_>ykY;N?0 zVFO2gXjmAd-G)%M@&75~%EO`T+W2HG@>;?$wycviOZI(<35_*sDiQ`sV~G^9Z`q4P zi0s+3mNioL8oNX(B3rU&>$~UeeZTAau5bRBYp(M=GiRQ2?sMPwIrs1P1NM)rhesRw ziEn>V(}gwlZoL=CpTo(&!!xHOI;>dQ^QR>@jbOezitFzE7JeDMXN>}j{3X30bn7+7 z2ub1HH6@YUYO)xh`srNOfFULY8HCJR?H7-puo!bCr#86eR#(M36yw4=uJczwb2ulz z%Jhn-M_B8bASFNL*o?w9$rOrwD?f@-Y=eY^EdqY~P3PK3yp=CIy6&Vr5={!l zAuIQ7pN=V6L#ogR6f|f(71Ux(vp4;oDxbH8f~$bPRVxaGdOtKonbfuV^k$MiWqRps zsD`J~1lI!L_n6sPPtOgW&34CAt&V%1G3>!vzn5=}B7Oa+Z2fyWO`_3g$CYUcIxv8a zdRMyJ>hhU&FWF~rQk|;x%&aP5&rMDh=SI%W1dVncSj@}X)(F4vrtFN1dG$#j*YeO5ttu9@QmS<7>$KovH-MS0G4zMi8NHu!QH7{?RE4Y6r! 
zBgUo!A*-TPRcy*8R?Zr?OJ*DltmPewjvgLsnL1>{vv(Jj9$1%q~rWhvX5PFSRxZQ;C*+md@OA4)E)7lnC&aclH$$PLQY0+TS%!WJpCE&TKP z=wZ>VM^fW6((TfVFW9Qr*GO^0_cq~JwahB$7oBi}41bqyU6u|%tIPCG4?O3D?$ z&Cw>p-oGT3+i!`_Q5Krt100&2z5RtM2YY*)u0&x~H6cE}rx*S3_&j4C2m{qP7ZY3Q zl=ue^*1~rw5GjUKmX?-LJa5=+qov-k)pMw5j~alS6Sz%nI6&j$<16==#SRV*9#>Q2 zMwHuiGHvhd(A?rYYiOtg=&hRTV?hAR`w9#ikXg~Svf>W7nBN2dIUCR%VFQMmZX9>a z*~fCERrz8zqw<5B^q-y-_(FV+~?RAsCFUSdeY%i9^1KN-T} zMa9v-hrxEs%)o$_{_qc(3|`LZ9=uHHr59wOT5fM=M3%$HKYd(T%-Nq>TtYe!niYI@ z+sDRFI^&IW9G#p3b+aq##G*FTN#E1qW8B-@EB-*;G+wy><$rIteng%m0w8-#dPF}-*v)2J@rlAb=%yGs^r<(&XfM{*S8b@ zJlh+#N;8$?cK@_1x07rA8H@jG_8O@ne2Q&p(NIuSsy8AUCM1Co=Gmo8menSNmOWp$>TRTf^4h-Ymj~)&#z4PIdic;30A;^+ zcg>ugi;R!)@EA2UH6Okq~5~lICRHrT#j8+*v*;(@MqPH2_=0aBQ)aX%4f}}rTv6T zt1@(w{%KmIM9%Z)(xgkFTACz8W&jtu-UsaH)=XA7_fx1+Mb%W;zU}`|?OZa*Km2Ka zVc`g3W_p@|(FB%}qLPv+4wq_dcJZPCUqVFF!HzTE%Z)sKrpSVt8jFIlS^`g}opk@g z2DuNS_#F<1b6lSzl7A>1Mbt9d$edCSYLpJ_1 z4Su^`rXop$lamsIIeB^ab+dUZ$9$XVqbB&(>=QRqI_%kVSYERA zU3MucDerZnp7;mF-K>R<1pdRcv~5ssASVzAhG`0n^`p%DWME;j zPiQnP)baMNGhSP1qt`ApU-b#11D!rpERT9D6fwyW*e7Z+4Gk(JRXv1)j}OsW5&i(b zhzJb`EEc4w`pAJ-3WC~Tpdy>sh`{hNg2VUUps#OFEHy369Qa;5C_4Nk;xKr0lJHd( zP8^|;KcFd+)H*RS@pQBH#lp>(H{Q{L)*mL<^zMz(I$%UHv#?CPjpOdVPWwcU1(lg-uOclTICU8g%3(-hpufNQN) zk1GiS^D#4faC)EMVOcaZk%2sW_AFS-=h-kiQ4tX>KR;D1ZEfBJ%<%*a1lwS!N7FM! zQUJK9lIN+fuP*}=(^VoXCuekODhg5SPU-q~#0T~vMV4h=<6G6vBMLqon_Z%1SC&3h zpRaS}1tG6+@P&}Q(twBl#~ilSxs9!@=`(j1`72dnwfYGq=`*5G_lk?fqBh~pFh#l@ zEPCr+M0|XF0FX<4*LvkAus=Z53?Sjzrjn@klgL`W3o}J_gnAdGlk}+P=^DbOniDe>P~~;KFM>d zt3LtOyAue-HLM53kVwV9AQ|e$0wg*fA|pU1a0OF~;b9LSvWf@`XWhTgM5b}E_C_vz zuqiA*=ztiT0=O1&TvY^&=?Q-R;7gY;-yVz945AKMEcJt^$jrc?VOp37a0US}qZTSO zLiGR)TCd>6?KL4m0fBf>yHr$G9sr+Tn3wnFsyak0=3~dS`}+E(r=~Ok_RLzFg%57~ zK^+1?cLNIH5G({!40T~G=B6v(!d;ht+`ImUgAk7^5rXD~gamU73%tp|=qM%%c~S_x zKTT*5>q#p^FlO@GY{hX$smKL|gzf{e0QBNajExy!-y9pQn{C`SMo)*vrp^7__k)tP z&Y-{I6Ge|OP!+{Wr$mDb&&|nMxqmD;gq)~ew*7BDg10>2VQIBNDt&$W4uowHXHH{v JO0}*8{Rd6(%o6|r literal 0 HcmV?d00001 diff --git a/docs/imgs/warmup_linear_schedule.png b/docs/imgs/warmup_linear_schedule.png new file mode 100644 index 0000000000000000000000000000000000000000..4e1af31025fafbd9c6b7c74ad6c2948ca2d3ff77 GIT binary patch literal 16775 zcmdUXWmuMLx9tlG(jf>)r$~o@gfvnDQYwNpQqs~XlF}^*f`AgDgwiPuDxK1$bSjd< znGddSe|w*^&$;&B^IozR!W;KA|u&Np?Owd~EE-He?c zqHK+w9c=8KZLCb_T^~9Fn$v%ER;TCve+4S?~xy5qpn9(W8{) zq_y4C)+ar@wMJe_oG2vOdOT;L`clJOUYy*;(YB_mf5;woyKV; zr*GY{wYQJ6NX5vPKTBypse_1Qo`jw~j@6#zib=_xJ<=PW#+?1I<8}!>Gb8fY(c;8< z_Vd(kil~0VTiV(QAzre` zW;y>{a~0Fn)LgL8rxfPIIx$^ITn1^XbZpwfFvPh)$_x-}QR)kwSf)DkqjG z8nNJ(7P;d!3(4l4?_Vaz-sx2k@YMaf__m;c&-d89cIPukGM}k@#*vHT3rlV7f3|yv z?ZHLCEn$X=Xk9%$KC6D_+TUZ3e5~#HQyki9+>F{HE(<#^5vBVcTHV-gA#p$bv-e#{ zufnc1>=MV;&x}**5CWQ}h|AZCjT+GtofC+wYkIZ;S7e zN*pYZqzOC9NR)dYI8lo@oj>}$eh#&;us|;^PO~~v^n5&mMvN?+j7!GX*Ei$Pq+&et zZb9GRU<>hCT zk}bCc+?;2l4HMB&InfcV->S(5N5vfSYKcQWQUY)^gcQD9&Uaj zFK8wy_&BNUPwA;}FdQ0(qyAjs2vfaFn)04kuyoE$^_tqSz&zp6>$yH6l3=)$`Bd=W3up@>n``V<-2szTowAJZc}d3&>0pl z=$g!B%d0oGm3xjHg{r=K=|d!6hmd8AZA4>UCDwuWS;7eL$1tWVD=nQ_`}T2X@k%^) z2q9hY-GZCe*6eqS3?e^&R<8Z?dmQyzu5ZM^3!md&VN0~{FQ&Ee3c^$&+a|BgI@Szi z({-wy+4!$ri%m@pkEFVuRa;v-Vh|+Y?CqV0Q|5ZXS%aN3EzZ5m)iIW#^05i}Mp`lo zWof-Bqm**#<`t?i6|#i8LYc!GpK=x6ZY{^Mqn?NgVBN=zyb+e1^jbb9uIe_jRsjbc zqThY~=PvMCbWoN*y4v^dF>!I>kC)p97!=Xf;$}-c7R3-+lemD|mrf^sEA=qL zuz&!+X@XDmD-DzC1-s3)#!MWlFjXPa6fR+%;n1Al+-t%bAvCNjC{(7j^6S;?SldwP za*c4d{5~w~m7@kFWU;?&D+x27&(m@R!;uZ$Phmh=>{7}Skz+kN9(1;Z?ZzV#0myyO$2 z{)TB)5Kf&<)=7CKO^z}Cee?VW2B(5*$t4uRv#A>V1hl5Kr|w}hsGD2G=O#lWU9C}$ 
zTizOVs1p8a9>)e54Id>jPe7}7nWZ{da)U<}UX|@aoGdQdJurFM0VfDOc+`Qyq}j27 zqqB5)ih(@j<`oVkXL-wVbYCwuxmA*2_Tm*mEN%xQo~Tm4UK?JsV0D*kqU9vs+Ku13u}zF?K3z~RN3V$FRer8*}i>Xy;jiuCD_ z6-5zU)Nc(5VejjmlX@z8-c42cm$O@N#8R=*gpgXuc$?CI#kgpVV_vNW!CMh+VZNRc1^E~b%Ym@e(juf30mI+?YQsuDf(_eP!URb&t8 zoODae1)a}w*nJoFH8f8ud}n}@JoKhH$4XJ0nJ71(vOz$yRWGB^)3dNSiRh^e z2)SJ?_ncV9=nuD*v%~#D|JE5*P0Xvj+&1Dp?iB9GCUGmNsygde3NGXb*8{qDH_&fETux4NA%qP?h{;rwjjeY7M88lNH z;fRviO37Qw{O_$+kn?t-q{FN=f6P3sz`nKW@NVrrW|ovI1}gl5%Gsez!F*>qAWyQG zp1JyKu>K+nwP()!_}p*li$w&DdskTIyb8By^&YVjBvi4c+!b`0$!@;0zvOTUB}GLW z!5Ppd*d>5zO2U|8ri`50bCtopUmSg1PDSpUwyzhh`OBbrZ=aes6H#p}a}z(g8^X~! zXmc~R^Qr9jNC#)J$+5r-9I?i|jN`AQ%lR-#;)l*M+m6X!u)*x~ZAn+GlAbHQb%y9- zAx_ILjgn*uWo3RB=e-%rha_Ptka#(T)y-ueD_nQYC~V&t*hRf&9_VE2>)=<$FbOtj z-hG8+lY5QCklx&GA1XXF6*I5f$EbdZPIoFy7_XTnKFP9@O`H3vlk(`Hw4jhc=`S^Y znhbYHZ=3;`d}qs{{IDM(=DwdKLi|M4OJ&IwXMrHvm~ZJ}0vb-lZEaKUEXwE~ml*<| zi>KN>elR@;R_TB4#&cK?hg^V%r`?k_mv?mLow<8rnf4ONZXgm zznc~zN!he#h|Zh0)pz&0*RcCx%0;6&P4K+!h+N8xw4Q;9Gh zq8hFvi#Uhmj@*bi!_}pg#vd}*P(}yuW;rqc1YWs8@w2VnbZXxnNnMRjP~!Rp2AVHL z7MHrt*|%G7RhSB*o?zXfhXh$Am6rT2m_6ofWebx!Lb&>rmrOjz>$b3qAMv}Q?AAo}qxH$- zRnEO(9Bm)}H}+f4#ozH2mD?b7k(OLH(j6#|#3C4H&$^YiP*{I(h| zEUm32NLJ`@CJR_y+^E@A_7$7*T;)3WIWsfkUxVc|Fi-0KxT18ZMY4yNF_bDb zKAc_snMwdz3MSWyZTYV{r>rsV+k9L?dVRh9Y(=+P$jO^+v$${Z56MtI|8^fO1%jO59@q`JO-t|Dxz8C-j$VEINW#LxqElv_IKJ)i^G9ZJaq;cN(NZfCTK5^W zDvDam#qO7(;o-8*&I0CLFM<}+y$Bo~9sdlwRf{s57pfG>GE918#HToNGgIBDr5G!M zg3SNIhc}ne??kRYFR$jk@0e3~^TZv%FN*#?3HxonE`Dpi8%cj76=7;h3)v9NJIF;#iqb_?;AGI732rl!`^)Tx9*6>xO^ znj;;^PE@Tlden}g$08xDjg1P8jEPg>QM>l=ZwVA43mlpK?6wm`j1fST2yA=$Zmy^Vh!T zaVibG>9O_U|%$TUf&-um}j`<>fJPaBzy< zw@nV*rKQoxxF76C7X2B>I5{8E>>onsyRUaj6W;nXEDVR4`GMEbxkP1`uI1N**U3i+ zrbPIN*~WMY_kxcT+11%@lmFSKepS={EMBn7$se*~Q~%hrKZSL~Xl$JV>tW1V_-99T z=?d?|(zB!b=GODM$m>&5c6QuCb`zo3r@cwqqi7%?+CMz}5|fZH$l>S=eX*jla#IqQ z-b7SIX{k}y3&BbV))?5LkKZDoVWRy0{kyQ^+<8VOrlUcgE4=y{IFG2z8)`|!)%+Wd z%|!d-2Gch#_T9l|a44+5V-TWa_95#-iCf&VrGDT8dPtLs>erCM_!k>100Gb$$5O1r zsZ-<11s#1XgoW*RxpdKUbfwQ%n(2NUjWQVt zH7@xU_R)n)Sf9h^xG&M{MHYVI2)+5`Qr^Fyy&mmv0a+G%^oTX5Zi~>_S!BDL@G-BC z_Pg8~IcF_SE&|e2+a1yY(>tmT2HRBPLNY-%*_da*oo^et-C3E@2rgj?T)2LsZYi(u zoWx2=ql4=5Jn$%!rO8;A>~4p*uUl!xSv^)DCrG-U)nO)G()k0buaTY=etBYboQf=Z zZ2k!SI7I-wGf7+)F*U`O>M0%|lmlL}zy-$4Z+d3~EK~oVLd>)pB@we$DO?j7Ry|Og4 zL8i|Py^;qhp8?IB|I|g|Sjn?i2aDmLwnw?Z5&HV%Eu)F&DRZ!E_?Ka`nC07U&K-$B z-NevK1=O~xHiLh89CAs z@NLIpoq_1sd;g$S`|=}a@uWZXSkQ@*7QVn$6&{fxo{abr#hs9#KejTbc#}lbE6vU} zO9nmGpPS!CLb@AmvgU)**DhzBtuEP0!ZI3_!MrT z3=P>r=NEdQEcMd(SH26U`PxeQ?$JveRa1uNGS4<7sr+_!3q6h-U}H%!co2Y`=R-&? 
zK{Cqxb;7Lz<9R{cVMrbNi^VaZ{U-b}lz3m4Z}Iy}2b`Ir+uff=&!70o3Xozk09oV9 z1~|3#;BeSr{TH9HlVh@NuS^fJI}K;hi!FO0ern<|ts2_b`iB(Z(pe00ndl7DESmbN0%kBN&RBGBH_x@YXEy@-~2SjTe%>c%6^mM9EvkTB<)dDsJE12%tft`cB8k@2o|(I zo4@1gfo}rosjZxwH2veNtqA~K*qzo@QeOwplID`N@Wa6`j>40ZIdxZ#S9scuBkFL z6%LnXlq4p4^~qA7S$>tV#QCj(8h;|L+{a5f!FSW%S+#Y(Fh6b-4F5z#a>d>y)-*yo zOeolszw!KJl({Ln47RQ*;4!iS|2%XARDLdda!9q%rsF^u*DXy}E zXhMV!%}q@Og%T)pn}wo3wKBJg8<|%ohEP`Cp09D9K(qX?w<9~$>g=uPTHvjky0vzN z?~F%m8L`lI_jy%|$qBUN%LedTs8Iiih>cSj7b{9PtDcvmaZRg)mlsCMmos4Am&ea} z&HJ}D;mYh_+pmcNJLRj!9)JGXQgsj}N30m5{8R;UqfGC(LVI-8NmbC+db8=d1=s%0 zy96^z;F4tGbYGZT57NsP1&Jm!_`y!q@WL(Q^T!s}wrieq3BuB3FFvw(9P^4`7ZPJf zyYNeNixu71=zTV_(7fr^G{29OL5p}69T%qpbjxMI&~P(|>AuJrga*Q5@Td!abb(w= z&4f=;?i?e%l)v5_ljALK=A<8#^np@AWi&^sWFmQOHdf8gti{x{$4jbu@ld>N!A#@y zwr~Rn8SghEhmBIVQuIU=VbbNZi5VCeIyyQ&rHgwNeslf#qf7lNX`O_q=)_)Ib^2pX z`^8ngt-X2ZO8inpCYtLQ82vK;vQJ0+T3d5<@x?!<7hf$c>un$I4QwkG(wdTFy;4U( z0NIGE`=4(1DDcj=^2DT`iln=55}oW7@Ta}C3LfZG5|xu%xKGWnmi=C@!UTACoeDde zot>Rx<7P}wonlmbG@XLA^~xPXl1rC@v~ty|XmGKs1bmj{yOlPO3?YN6hPzC2eOtNdACj(KT> z=(mQN8bSZSK#A`k&x6s?64MgPKE{Q`#Xai=kDuo6bV|^nN%-U1U9i1)Fa#`b@<|i< z(wgI3&zHxs4|_eUZ$T;BJWVLT#AfPjzb^cz%;iVjv6eq34*SDq) z`y;w(S^KrJ0SLH51j|id70`KR#bm-~E60gq>-=T;F#Z8OS<1U&H)qq$Z;1KKERk*6Bj@{_$HXrE>sKFwAp*jPYcF!#Ck9sri9gSU97V(($BU;(1o z5o}FNBjvZ4TAG6HHp5sD&AF>DG|WDHrgP~cNJ#jmu4&%vnHB&c9T^>mBR6t&u2?ZK zmnR4SAO&vIs>8!RhQLa527Z?nZIzdzcTR5b@DMI9FYixo`i9(to;A1|8+CAS;0`^4 z{p%x%?{!&DpfQ-X6gx}^-Q8Q~p=YF=Lq{j2AfXH5+t$zl48>JZ-U-y+*SiV##~f6GY=b4{x?XdbU0n6Y0?lB_$>5?1^>q8!Ia-o{?Mnx1^*{A3l6=`0+I*Rdn%x z>jCNw!fr3Exta_ZkemT7BBDWAS)dOQHFZQp1Rg0VsnPaa>(2Jmr|6syDxqN{ zl9}0>ZBFJjZc=@z^+r?r&CcesG#y=`-1|v1{RAA`TxL%s5(4DOFFmaR2B}b~f3yBC z@275))UOf^>~?kjIe0}=d&nDREmfVE8=lFwgC-ATNYT$w12-bM{ddv<^_S;77f_sf z<#A4!u1;w>{&B2RzJ{jpx=>iW3+#rU^ez` zy`_&Gy7mIaQV1Pjjn_n&SW+-K&hhfy=Tf~UXdRnr0XY`%r$nGZ_w;YeDmNbnf`o5d zqwi*Q>Wl#O%gQa>Ipd8BnYBhlTMo@b^iok6yh04hQ!9*L@^0D$`tjVM&uUz>@2O00 z0o~wWfba|2$4h(lC4`NiQEML71kM@^&=(QM-XoED^|k-UUHT-wanCX;2~KJeJEa_B z+9fXH0}?;awp{_Bjf=M1xUJS_=Gc@tF`tDV`paZKVGyDQ)_5X=g(YQJz~xoP8sJQm zI4d8H?vaYQI!0*uwoL&1D)ZCuok8-cFebPd!zpnQy^)A<-x`q0icPVXWEMytAKkZ}0`CFRvVJU5Eopuw$W)W~I*Y^i;}1E7Zwhh5Y8Ma1DyMa#}>xX*}!{jheM}^J>8idjh#D7}^>Z>0!tFozTZjYGm zEqfwF*+K%s@uTy}7D)$&Mbpd4$)Y(s8H4NKuNNRU!c)YLLak@o6(B^0Lb-IfBv;Qa zz}C9 zOM4$LF~z}t0R>EEwLBrn8N|&^$cYs|3Ck>f_db}6k1C}7>b{LTcwKC3$0y#wULHht zU|`RE8XkLQ^79S)wq;@++!Ymf-Vb=d8U3XrA?Fx={{q*CuYhIMak!+;vXV-ps2(!o z6@O7202?DE{fjN5uT*r=9`o~Pib^?fV6O@(>FHA!S_Zx;Bk>cv8_+;gfQFLGLyygb zj`V6Iy_(8h*-IVBZCwRNLojo$R)?PIu|qX+qyI!tSZIgge-Mg-c&M4B4e8~eEB6#5 zFR_?Yfpk0SQcYU};smC$QqH$Rwwz@Iwylt;UEX!77fm~<$QIN&QNxDTNu%=c?uiB~ zBjf#Otr@wpN!u4zvbS>es2)B+pC`%wO90ZYWzv%L?d}X&Ok{>a@A0#hj*Q_vSrJFe zY?%D9dpFkc(J>!#?uhY?Ww4Sdbv2mnU5r~X#)slT@zI6mGZuOm*(4rNNA7T9)ur*A z6t79zH^xO8G6?gc3?&4~n;EOFUm_;f&v@JB>hY@>82^;&zr=Cuylz19!$A{wP9R!> zaSWklcq>d89h`<=SQY?W|3?aD#w@Bh3YxH>`1oQ6r;Dghp1=zUhPl7{9QC=))!QNO zA|3VH^Qabi5TNv#LXn;D`vipg;-k%UKuK-K_grN&Q4*(q9|7p^0bUI4xKJis?O6OZ z816VCu>j^!1hN>9((Ll<_uN+?o-KR#$F6Otw-L)wYMu$KIx%EDCg4xORQcaoD_~XG z;^S1@(%-gsYeDP00vY7d3oC@R!V_m=^pN4Vjuakj1P=%2i7EMsRz z14&@HewqBkJc<3Ab2<#-5uYb;@zx6uj!^Yd{THpTC?`kz22UOF7nCv!`tnXVP@+$e&&Ge>@i)6t4%?VIv{6aYxJ>Zs!_S+VkO%3u&K(l=ck6k@cjp2Efcn1E zA!NOv(ejtu=`oR3q#fxS3g`+lM}6A4&$QnSEvLtu|6p~tQBQyfpysrP%%FOgy1q9w znCwD%;xIz(py>ITiBdfAKSTge^V;(K1x)uH{?LvdXr`i%8scy*Etk)T`Ltvp=D~2X z8;)=Tl;SGs2mO7J`3)vk=8RJCyJ#l&=d{J^D>*xzm8}KNbXikgfSeG^0ls3%=~#OCNns=W!&g}s&n@TYRU zd|X{ydtvxTp8{PF;txp{by<-zY>V01GZno0v6KNs;aeXxktUHbOPVAP?^B?eAW#;e z__wZU?1Ya)aZTudTWQ>j?&J| 
z$!{KbE-hye#T4jM>auzl!Ro8S?_~kI3fYW4L8$Ki?RiC2)%77?Al}_Uh~Tpw69^0p zoM|ODYymHg!`dhxGYd=Sx?RH`kOGJGb`~2OPkt6=5TVtB^5l&f%ASmzB2c~heK^hXddt$0AgxX%v=1L1T*WwCL)ry-F>-1iwW1P z|GOqJJuq-CQa%82cYI0WI!7N#BqSs>mmMupJ?rc7nAv`|3e)?v>kmS}Kfm2|Qzh{8 z*@>g!V1j8Bwu79Q3fTpk&BP|P9xYaV)f zr{HcCO~!S5S{z*5Bj>2{av^6I7wLkbYMSY)$E@_)L<`LVgijq=<#`N_A zFiOC&d2UN|z<2V?Foa`q9+%-zK;aaL4KUY)ycBa|WM-C=s5k4pc+fD=-yf5bf(;5m zkHit4+qBQ#ot`q!UF&xbsx(g~>?)X8T92A^gw~RV&lviAK4S#izyv?c zzh17YsQ3g}`;==Ih2P}gXzD-y)vKqiGrsk_=#3R;bl1WXJuz{g6t*@dSX%_HTv3?M zu)qr$U;{$FwQMNpm&W9vKPZmE5BfqQYuTgiE;6wx7Xv}NX~b}`ATsqyp*}@4Erox zM|!k3;VoD7nQ1@w$zu8Amj4EvXnuxX|8Z>jHL#yUK{v&Xkkr4R7VcdBpEWCUOW{2A zp%&>Ln49%S+<2G)H>xR!iTy)Eu}c421p}HtyPLU1CFQPQwIGfEGy3%mjRG76y#Q=t z3_v#mpM{3n?YhUDa(7(A+)6g(4gTppCo~ihd4AlPT2Ohj9T1Rx>;IsAxmW|~Go-s) zZ32SJZt>tdkKZ%FN5fM2+F7Ngr1lMegea+~Vq;>Sh&=vb1ZgZZB7%DG5tYhmO%WcA zzFNfX2$`u43WRzL4K3v5)C|HPxmaxURKT=z1KM3zRlYxmG9)W6K=xg=_Wr#KxFDP< z^u9c_1d52w!v2#F0GsRUef_{y)J}U7fUe=YH$yrh$kun3RkBtGp`JlRX!={0XTAO! zT;5GvAq3pEic%y<+iFTf^a47|nY8iOcXBKq*aIq{6!7EDe^ORP0;QHyF?%q!BI<{R9G+!bxGJ~`!#C<3SQH<{4iVn6MFv(c2(b(CN;>?ig zzb?h>yTjZDTn_gZKAPp7@o{Se?te7BS`9h<#{N0No-QIG2 zsM`U!UyX2Wu6UI{DHS*757E^wzd>US{6r=8CUS}Hr8%guKujrc-{!All?AKEqm55! zA62<8-X0t^E-!5aW-UWsDF-q3;Q@YhbfyN7fr$>9QQGxduMdkV;z@SwGL|tHg9?7g zkkGFEy$Yg!Fc1Yi6!IttM+p0Z_CLgTfOX~!kXPmxw2Wio^wRW;pu*4&dwH zx8`L*ELGQ^Lc^8UkE=TBRgU&=aRAa1ITrq1V8Zd76dow^i{E{y0JI}xJU)27mw1p3h2;3W~Tt8#9qe%4#-Ja z;7YFCyrDV3dLT5_&FYiQtfk*i`UiZE6xNvvz3vM7sOenx8S{HJlCcf($So;9kX`y7 zymP$5e~aTsZ7qo!+VO7ZWr|eZ(nAHfbM?|#{y2|n3ZzP3joJh(1GZ}$b$|w(!i=gE z@pE=Mhf;oqZ3~8NGdi(|#s=AdQas{p-|#7dWcq4>yG0N@ZeKz{SqNdbaRMjE2YG>V7vuJ4x?pnzo&@(Uqxde!Qd*kX#1~tb zwRX5TvGS+OM60($d>*2neE%r#U?&05fiIHTsH{Ro91g@F(!MEx;|d*YNoO9p9w8$j z29f@`Wx>)T%qmzqAdl!uN}Ss~xUQ~~h79;Mj0BeXi6A|x>$@}zmc==c&0?QBiejjZ z2-aw-yos&`s)H&Boq~#v%r~8Us{a;*qebiZQr8q)wcUEjbHiC3XkEv z%nv0rCB%)DV&a_xlMEnEu8{vHg3TNU^$M(Vw|Te#OS{MKVW`Cna&fCp|5%ucKsuHc zkvzMVPpsg9?L<7VS2(d!FcFJ~GCw{r?;b?BGAq}`Zja1Em1+t|bQf0GJcH)M`0Oq^ zM0PX<&F@Vi z_TLPeDO7q^`6v$fywTxj(6&HKAvQhTZ-H-f2)vyuap-80;#A6J8$0+dp?hI$r})cd zy?Hd}*5-nU3lZ{6GBPqTv9bQ*UcbFfkM^eh=;qrZ1C-x1RXWaZ&4$ob`{-l&LusZ5 z7jtH2X0o`K5R4xP#XRrbTzMh3b<&87Sir$VWRGmFx1O^OZ~Zm3klam*@MyoZkbc3R zTkU?`jU*LOG88RlZ0ZwHbh7Md$ghG_pKyO|4F-c_~d&Bqw5vZ1kmE_O?eT@U@>30f=E2>EBU!76hX*2 z(U$}b2VM4Cqo&cj`G)CfhLdwk@^&YZz3|b{5c2wYAHO;|f^#ePtIIPDJ=Yr16rct) z`^I0w=c8kfR+5(Hs;QHwC!Ov~*gP~9=S4dz$g4cfP#=$CkBF;ziE*WEE;|yGx+hdyAQ9q*W z@aV{*HYM%Jz`QVlXN%%>>s*Y-cdzM~q9uuGH^K=PlpR zG@SOIM(jMZU*j7>kA>0EYsEUlAnH2ku9D(Vm^LHQ zZycx9H!T;fnj5X9z)K-C;$J5!6q|?AOY?|hkYnB}h-jK(dHnhB5wGrlS}33OohOH# z@0qr@XW2Pl#4RirLLDdW!&g=hF_~Rp=1k)hANZy#_I)=5)M@uhH-X+{esH_~BbuCB z{VSS~($SqpuFucE*>xRxtcKF5^ArSCX(2i+ENrzt{e*~`;z@|pul)hjDs>EY@%{_2 zk&G*T|Im#Y(;@`bc|?F}F+m9WPu8f3D00w6#!9VVgY4|=ihhq**gqeqij;3@Z?7p^ zUR!G^)UPS@P?OTNptyAD`)_F^O5~iGgu&%>xA4I;a5*tvyGBw~RW(}a$OImu5*W)t zefn#0VrGqkah>>Nk;hkjf2K|w%{B$?@6Sh;8xHWZ@W(Mhb6&8?(Q^zu0zM2<&VqDGv`3 zbaZa~O0`rWe7$m;Mjqd7+#5f?UZdbMIS)o9_DyLll;qL3p5CdcD8Hi$zkn`tSx_R& z?WZ>cy@(L6H*u`nJZKSx`T5v=eSLX(c{9JjL;w~IY}XnZn6|RCu?f_D555dhiN^&+NW@$UAieP>yl_JiUm>w`?DyvHa7yx;?|%y_4|0 zDZlNhBc9unUQu`i1WDs(doSk;>^`z;;@Aw$LLd?%bH${mZslv6{9gNpg}1O~>eu}5 z5BvL8_0gI9|9Y{O{Qq!LZaEmF;{U2x-!|HpUw_sHcyn_8exLFe-$=!B)9iSVKfh8#OwrgN%QHI7|KYR%U8KMg}3E zM)0(Dw7YNm+`w;G@B`+p)g-$0g&|ty_GLhB@IA7}?mcA<8g5a?}OVSEE5h5lc%; z&vjj&CIyY0{I517+x{ud5c=yQ=3}CTW7*!@g<|6;2r3N zJ8YA947uat;^d^I{e|7PUV(Kk3Y@udOB)+xGFW|+lUxwTEomZ;TN(o{+=M47s;H1g zOP*Bba)%dcX=udqQs9})wLF{aWX(v3{yBAYrC^BtmuH>(uRev=lRrm6on{xsT-S&! 
zdeRfeYqy$k8Tt7&Yfn#(pwZRly#GOtnVA`@W|F(23dWmm>gv7#6`T}OG$R+6s`p;a z^ll4_2Hl^-kp?n8KE62~zk_hz6qUIizmt!U_=t*Xcfuxu_olAHz>(_l@uDR8X2WSc zieBP)OSYnDuieMa+WMmCOSKzYpUxp4hOy9)wAm9at*t31a9^zsPfT#iM^dtYwX@~j{f`SzWA>)p#CHd^Ob&l7j~rJI zER$|;Z_fph8^lDrI6GsN>2giE&*L)$5+Sz`)dEsCOU1Ve`wSGlTKWwUhnKmjPKz(y z!sw1?F4(ni)F0VxZ#5I*GC)ucC1M!+O)i2H92?$6R8(|92;!fSk55DRWw(DlXSEU_ z>XG+<4av#Z?bH8dNbZ%b%Q_hz96XQewT2;_xVVcz16ED$K^(t+^QLppXS&`8@#vbE zWKEc#93KD!Xs6X{X?eIE?H3`oSxZte>6$w3IOR3H3%8#ha_ruu8y%cD-iMz_YY%>; z?u@9O71fE2I)GHL4e*WbdwDgrs0s6x!F=_Cq2pBEK-CNhn$!JuNr%HvBAosR{t}Z1C5sDid8-ba&v*l^ngnC<7&m%yI|1==rppLJ_ z_S**x@M{Y@%t#d#K}IY5x!7a+=WrJpI7MbcNx?b&j5YlUTn7}61uwU<={-1LsMlU} zFx-A}*lXC^;L@|}$T!clr!`$1}OaLJ|YO^k5sp#l|me#p|#@ z_}eBho`{8wz5H|49o`bMxw-k3biWgrwn|{rLRjo~h7Gj9A;*vVjBk2Pd2N0KKD_p4 zx0pNJ9b`ThasUO#-FH$ZCXC4BUKPz|o!?UQk$2hS2diZQr4FC3>Q_J-yqfsgVDkiK z`gT1-p4IjsiPL+hlXZVCoF^tNjgu}qI$FWPg4ISSRqyD_rkA(3chM$pYFBV!A-AtR zo-u4@UO|B@xSNsjO$v%o{TkQVp?iimjg07>Md34}l@%2~0nDC=LLeigVASg)oEa*= z>Gt;a?DF#Rk*d|zRgcY(V!zWpzjnZHaEJ+wngX$rAxGcCfec1TNjlio2G~{avIaC4 z7Z;Rgv_yX|80!l492|K0`T5n=)n{C5VQv%L=0DYomdvpkwvg=&Ae8vDnBh0|$Ws9p z`1I-1i48BP*-^CO?9+P_6AvI&lERfcrLK=x$iC5JU}7?L;0`Pte_t^b*59v*WYH5C z5p9tr6lXrYzFB7vsA9FO?ko6`Ge=g@P~f40(`~m`4~MSCb=|3E51`#ucJ{dOwdn>r z_#TP-va&J;`cHrOAg(`;t8Gmkg`8CPASk8It781FET9|(6BCo7{MQ2KJs3NKYo}Xn zS%qxJn4tsU27!a2sku4mo6T?MktNJOs|_awaGXp)+W^Oe?uq52NAcsFhV>7aSy{tr z#9TuIE)a%+z2CrP)nCuP?hf@0clEW=5?T25<5e^sXOGW$L{NL)=H)4#$HWW+gSQ$a zefeWJ>7;;y=U%!^#h&GFnfG_Y2_^QW5|Wa_F3@>zy^Z}v!m36JF-d}akz7%k89paF z!4!Ri5z5MfEvv--%sFBzDh}V@A2T{_AzjHne*6(8tz(cc6us}!_;i?T2tlV_&@SX_ z8yiw2U+>QZAo`6>r~uUcpMgd4A?CoqgSI z(nL;Oo&JrJ{dt*!p|IZG-L+yN_{RmlBL6Oc`Zjon?!nA<&CA?(?^MpppMZe2Wtr|a z#6nH0dc0fZ2l|D&J^c*d!+c11Bjpj4p*pU#Z z5F!h|dE-~FEK*P8h^C`_TV%^~3#r^ky;vko1| z4m6V+dOHp^Y)0SM2_?!p-S|0T=*I=lD+Z_^q0!OgkA~mPCXpD(Fa$noWj%vY#0QC| ZbSCxh%Zc<2GU2*VN;lQy3T2F*{x5SSf?NOq literal 0 HcmV?d00001 diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 7e88b1b61c6ef7..03856956ace692 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -85,7 +85,9 @@ def get_lr_(self, progress): class WarmupCosineSchedule(_LRSchedule): """ - Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts. + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. """ warn_t_total = True def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): @@ -108,7 +110,9 @@ def get_lr_(self, progress): class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): """ - Cosine learning rate schedule with linear warmup and hard restarts (if cycles > 1). + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). """ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) @@ -125,9 +129,9 @@ def get_lr_(self, progress): class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): """ - Cosine learning rate schedule with linear warmups and linear warmup restarts. 
- The same warmup rate is used for warmup restarts as for initial warmup. - The total effective fraction of warmup steps over all cycles is warmup * cycles! + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. """ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): assert(warmup * cycles < 1.) @@ -146,7 +150,8 @@ def get_lr_(self, progress): class WarmupConstantSchedule(_LRSchedule): """ - Applies linear warmup. After warmup always returns 1.. + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. """ def get_lr_(self, progress): if progress < self.warmup: @@ -156,7 +161,8 @@ def get_lr_(self, progress): class WarmupLinearSchedule(_LRSchedule): """ - Linear warmup. Linear decay after warmup. + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. """ warn_t_total = True def get_lr_(self, progress): @@ -182,8 +188,9 @@ class BertAdam(Optimizer): t_total: total number of training steps for the learning rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 schedule: schedule to use for the warmup (see above). - Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object. - Default: 'warmup_linear' + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` b1: Adams b1. Default: 0.9 b2: Adams b2. Default: 0.999 e: Adams epsilon. Default: 1e-6 @@ -208,8 +215,8 @@ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_ schedule = schedule_type(warmup=warmup, t_total=t_total) else: if warmup != -1 or t_total != -1: - logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. " - "Please specify custom warmup and t_total in LRSchedule object.") + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 0cf0494e20634d..bff4ebe61f462f 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -48,8 +48,8 @@ def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_t schedule = schedule_type(warmup=warmup, t_total=t_total) else: if warmup != -1 or t_total != -1: - logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. " - "Please specify custom warmup and t_total in LRSchedule object.") + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, max_grad_norm=max_grad_norm) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index f52aeb506b3a73..bc12ff8a908ea2 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -22,7 +22,8 @@ from pytorch_pretrained_bert import BertAdam from pytorch_pretrained_bert import OpenAIAdam -from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupCosineWithWarmupRestartsSchedule +from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \ + WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule import numpy as np @@ -86,6 +87,18 @@ def test_it(self): self.assertTrue(np.allclose(expected_zeros, 0)) +class TestSchedulePlot(unittest.TestCase): + def test_plot_schedule(self): + import matplotlib as mpl + from matplotlib import pyplot as plt + m = WarmupCosineWithWarmupRestartsSchedule(warmup=.1, t_total=1000., cycles=3.) + x = np.arange(0, 1000) + y = [m.get_lr(xe) for xe in x] + y = np.asarray(y) + plt.figure(figsize=(9, 2)) + plt.plot(y) + #plt.grid(True) + plt.show() if __name__ == "__main__": From 331a46ff04ba6ce352ab78fc8da5a12552c9222e Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Thu, 25 Apr 2019 16:04:37 +0200 Subject: [PATCH 022/144] - replaced OpenAIGPTAdam with OpenAIAdam in docs --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b348fde28c3606..e8b4f3c6b688d8 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ This package comprises the following classes that can be imported in Python and - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file): - - `OpenAIGPTAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. + - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files): - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files. @@ -994,12 +994,12 @@ The optimizer accepts the following arguments: - `weight_decay:` Weight decay. Default : `0.01` - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0` -#### `OpenAIGPTAdam` +#### `OpenAIAdam` -`OpenAIGPTAdam` is similar to `BertAdam`. -The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as in the regular Adam optimizer. +`OpenAIAdam` is similar to `BertAdam`. +The differences with `BertAdam` is that `OpenAIAdam` compensate for bias as in the regular Adam optimizer. -`OpenAIGPTAdam` accepts the same arguments as `BertAdam`. +`OpenAIAdam` accepts the same arguments as `BertAdam`. 
#### Learning Rate Schedules The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`. From 56a47ce2b78cb4b5cb5bb8b1f18b0148f2a4fbef Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Thu, 25 Apr 2019 16:05:28 +0200 Subject: [PATCH 023/144] - replaced OpenAIGPTAdam with OpenAIAdam in docs --- tests/optimization_test.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index bc12ff8a908ea2..c6924bd4bcbe31 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -87,19 +87,5 @@ def test_it(self): self.assertTrue(np.allclose(expected_zeros, 0)) -class TestSchedulePlot(unittest.TestCase): - def test_plot_schedule(self): - import matplotlib as mpl - from matplotlib import pyplot as plt - m = WarmupCosineWithWarmupRestartsSchedule(warmup=.1, t_total=1000., cycles=3.) - x = np.arange(0, 1000) - y = [m.get_lr(xe) for xe in x] - y = np.asarray(y) - plt.figure(figsize=(9, 2)) - plt.plot(y) - #plt.grid(True) - plt.show() - - if __name__ == "__main__": unittest.main() From b832d5bb8a6dfc5965015b828e577677eace601e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 25 Apr 2019 21:37:47 +0200 Subject: [PATCH 024/144] Release: 0.6.2 --- pytorch_pretrained_bert/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index 28d215d8bdfc5b..99706fde4366d4 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.1" +__version__ = "0.6.2" from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) diff --git a/setup.py b/setup.py index 4cf9739a4b19c3..fe7990447df444 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="pytorch_pretrained_bert", - version="0.6.1", + version="0.6.2", author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors", author_email="thomas@huggingface.co", description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", From 3963d57c89a0a49c5a2e9dd17958ba29934873e0 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Sat, 27 Apr 2019 10:52:53 -0700 Subject: [PATCH 025/144] move pytroch_pretrained_bert cache folder under same path as torch --- hubconf.py | 2 +- pytorch_pretrained_bert/file_utils.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/hubconf.py b/hubconf.py index 755e181d201f3f..193c018ee0443f 100644 --- a/hubconf.py +++ b/hubconf.py @@ -84,7 +84,7 @@ def bertTokenizer(*args, **kwargs): Example: >>> sentence = 'Hello, World!' 
- >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) >>> toks = tokenizer.tokenize(sentence) ['Hello', '##,', 'World', '##!'] >>> ids = tokenizer.convert_tokens_to_ids(toks) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 17bdd258eaecd8..605c8412353e34 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -22,6 +22,15 @@ from botocore.exceptions import ClientError from tqdm import tqdm +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') + try: from urllib.parse import urlparse except ImportError: @@ -29,11 +38,11 @@ try: from pathlib import Path - PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - Path.home() / '.pytorch_pretrained_bert')) + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) except (AttributeError, ImportError): PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) + default_cache_path) CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" From 87b9ec3843f7f9a81253075f92c9e6537ecefe1c Mon Sep 17 00:00:00 2001 From: Mathieu Prouveur Date: Mon, 29 Apr 2019 12:58:29 +0200 Subject: [PATCH 026/144] Fix tr_loss rescaling factor using global_step --- examples/run_classifier.py | 6 +++--- examples/run_swag.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index e14788cacb05de..f678525b1551e8 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -845,7 +845,7 @@ def main(): else: loss.backward() - tr_loss += loss.item() * args.gradient_accumulation_steps + tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: @@ -936,7 +936,7 @@ def main(): elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) - loss = tr_loss/nb_tr_steps if args.do_train else None + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step @@ -1004,7 +1004,7 @@ def main(): preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) - loss = tr_loss/nb_tr_steps if args.do_train else None + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step diff --git a/examples/run_swag.py b/examples/run_swag.py index 5a65d7a7487516..4fb32549cbdf9f 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -452,7 +452,7 @@ def main(): loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps - tr_loss += loss.item() * args.gradient_accumulation_steps + tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 @@ -537,7 
+537,7 @@ def main(): result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, - 'loss': tr_loss/nb_tr_steps} + 'loss': tr_loss/global_step} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: From c30139a013f8d65dc691efaac107691bb798419e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 30 Apr 2019 10:45:26 +0200 Subject: [PATCH 027/144] add special tokens to gpt-2 --- pytorch_pretrained_bert/modeling_gpt2.py | 60 +++++++++++++++++++--- pytorch_pretrained_bert/modeling_openai.py | 6 +-- tests/modeling_gpt2_test.py | 10 ++-- 3 files changed, 62 insertions(+), 14 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 063c525d98cd91..05a748d43c99c2 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -107,6 +107,7 @@ class GPT2Config(object): def __init__( self, vocab_size_or_config_json_file=50257, + n_special=0, n_positions=1024, n_ctx=1024, n_embd=768, @@ -119,6 +120,7 @@ def __init__( Args: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -137,6 +139,7 @@ def __init__( self.__dict__[key] = value elif isinstance(vocab_size_or_config_json_file, int): self.vocab_size = vocab_size_or_config_json_file + self.n_special = n_special self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd @@ -150,6 +153,10 @@ def __init__( "or the path to a pretrained model config file (str)" ) + @property + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special + @classmethod def from_dict(cls, json_object): """Constructs a `GPT2Config` from a Python dictionary of parameters.""" @@ -290,11 +297,12 @@ class GPT2LMHead(nn.Module): def __init__(self, model_embeddings_weights, config): super(GPT2LMHead, self).__init__() self.n_embd = config.n_embd + embed_shape = model_embeddings_weights.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.set_embeddings_weights(model_embeddings_weights) def set_embeddings_weights(self, model_embeddings_weights): embed_shape = model_embeddings_weights.shape - self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.decoder.weight = model_embeddings_weights # Tied weights def forward(self, hidden_state): @@ -345,7 +353,7 @@ def __init__(self, config, *inputs, **kwargs): ) self.config = config - def set_tied(self): + def set_num_special_tokens(self, num_special_tokens): pass def init_weights(self, module): @@ -475,14 +483,32 @@ def load(module, prefix=""): "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) ) - # Make sure we are still sharing the output and input embeddings after loading weights - model.set_tied() + # Add additional embeddings for special tokens if needed + # This step also make sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) return model class GPT2Model(GPT2PreTrainedModel): """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners"). 
+    GPT-2 uses a single embedding matrix to store the word and special embeddings.
+    Special tokens are additional tokens that are not pre-trained: [SEP], [CLS]...
+    Special tokens need to be trained during fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follows in the token embeddings matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1]                  ______________________
+
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+        total_tokens_embeddings = config.vocab_size + config.n_special
+    You should use the associated indices to index the embeddings.
+
     Params:
         config: a GPT2Config class instance with the configuration to build a new model

@@ -529,6 +555,20 @@ def __init__(self, config):

         self.apply(self.init_weights)

+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input embeddings with a new embedding matrix if needed "
+        if self.config.n_special == num_special_tokens:
+            return
+        # Update config
+        self.config.n_special = num_special_tokens
+        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
+        old_embed = self.wte
+        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
+        self.wte.to(old_embed.weight.device)
+        self.init_weights(self.wte)
+        # Copy word embeddings from the previous weights
+        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+
     def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
         if past is None:
             past_length = 0
@@ -610,9 +650,11 @@ def __init__(self, config):
         self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
         self.apply(self.init_weights)

-    def set_tied(self):
-        """ Make sure we are sharing the embeddings
+    def set_num_special_tokens(self, num_special_tokens):
+        """ Update input and output embeddings with a new embedding matrix
+            Make sure we are sharing the embeddings
         """
+        self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)

     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
@@ -687,9 +729,11 @@ def __init__(self, config):
         self.multiple_choice_head = GPT2MultipleChoiceHead(config)
         self.apply(self.init_weights)

-    def set_tied(self):
-        """ Make sure we are sharing the embeddings
+    def set_num_special_tokens(self, num_special_tokens):
+        """ Update input and output embeddings with a new embedding matrix
+            Make sure we are sharing the embeddings
         """
+        self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)

     def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index f956462ddbfff0..7ac3782b42ee37 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -344,11 +344,12 @@ class OpenAIGPTLMHead(nn.Module):
     def __init__(self, model_embeddings_weights, config):
         super(OpenAIGPTLMHead, self).__init__()
         self.n_embd = config.n_embd
+        embed_shape = model_embeddings_weights.shape
+        self.decoder = 
nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.set_embeddings_weights(model_embeddings_weights) def set_embeddings_weights(self, model_embeddings_weights): embed_shape = model_embeddings_weights.shape - self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.decoder.weight = model_embeddings_weights # Tied weights def forward(self, hidden_state): @@ -592,8 +593,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super(OpenAIGPTModel, self).__init__(config) - num_tokens = config.vocab_size + config.n_special - self.tokens_embed = nn.Embedding(num_tokens, config.n_embd) + self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) block = Block(config.n_ctx, config, scale=True) diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index 8f4581b37f8473..6804b794c50da0 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -41,6 +41,7 @@ def __init__(self, use_token_type_ids=True, use_labels=True, vocab_size=99, + n_special=1, n_positions=33, n_embd=32, n_layer=5, @@ -58,6 +59,7 @@ def __init__(self, self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels self.vocab_size = vocab_size + self.n_special = n_special self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer @@ -69,7 +71,8 @@ def __init__(self, self.scope = scope def prepare_config_and_inputs(self): - input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) + total_num_tokens = self.vocab_size + self.n_special + input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) position_ids = None if self.use_position_ids: @@ -90,6 +93,7 @@ def prepare_config_and_inputs(self): config = GPT2Config( vocab_size_or_config_json_file=self.vocab_size, + n_special=self.n_special, n_positions=self.n_positions, n_embd=self.n_embd, n_layer=self.n_layer, @@ -130,7 +134,7 @@ def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids, return outputs def check_gpt2_lm_head_output(self, result): - total_voc = self.vocab_size + total_voc = self.n_special + self.vocab_size self.parent.assertListEqual( list(result["lm_logits"].size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) @@ -157,7 +161,7 @@ def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_i return outputs def check_gpt2_double_heads_output(self, result): - total_voc = self.vocab_size + total_voc = self.n_special + self.vocab_size self.parent.assertListEqual( list(result["lm_logits"].size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) From 1f5fc95b6831a2b41c45f51b2eaf9aa92d100ab7 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 30 Apr 2019 11:05:26 +0200 Subject: [PATCH 028/144] add code coverage --- .circleci/config.yml | 9 ++++++--- .coveragerc | 8 ++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 .coveragerc diff --git a/.circleci/config.yml b/.circleci/config.yml index 7296e07ca396e7..6f4434228a2f08 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,9 +7,11 @@ jobs: steps: - checkout - run: sudo pip install --progress-bar off . 
- - run: sudo pip install pytest ftfy spacy + - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - run: python -m pytest -sv tests/ --runslow + - run: codecov build_py2: working_directory: ~/pytorch-pretrained-BERT docker: @@ -17,10 +19,11 @@ jobs: steps: - checkout - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest spacy - - run: sudo pip install ftfy==4.4.3 + - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - run: python -m pytest -sv tests/ --runslow + - run: codecov workflows: version: 2 build_and_test: diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000000000..fe05dda9a8f2c2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,8 @@ +[run] +source=pytorch_pretrained_bert +[report] +exclude_lines = + pragma: no cover + raise + except + register_parameter \ No newline at end of file From e79ceb15331ddadbe0f0ccb857218b1ba2cca368 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 30 Apr 2019 11:05:54 +0200 Subject: [PATCH 029/144] gpt-2 special tokens --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 05a748d43c99c2..5537f93f662200 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -547,7 +547,7 @@ class GPT2Model(GPT2PreTrainedModel): def __init__(self, config): super(GPT2Model, self).__init__(config) - self.wte = nn.Embedding(config.vocab_size, config.n_embd) + self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) block = Block(config.n_ctx, config, scale=True) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) From 80f53f7380c7072263ea3af460a0464017a81e03 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 30 Apr 2019 11:10:22 +0200 Subject: [PATCH 030/144] gpt-2 from_pretrained can use special tokens --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 5537f93f662200..37c5a2d9fb5377 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -371,7 +371,7 @@ def init_weights(self, module): @classmethod def from_pretrained( - cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs + cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs ): """ Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict. 
From cd110835a051fe859214d067211f525f705e4354 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 30 Apr 2019 11:35:40 +0200 Subject: [PATCH 031/144] coverage in circle-ci --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6f4434228a2f08..04adf715e0efb3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,7 @@ jobs: - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ --runslow + - run: python -m pytest -sv tests/ --runslow --cov - run: codecov build_py2: working_directory: ~/pytorch-pretrained-BERT @@ -22,7 +22,7 @@ jobs: - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ --runslow + - run: python -m pytest -sv tests/ --runslow --cov - run: codecov workflows: version: 2 From 365fb34c6c82c6bd0e63e10f079b635227c675e8 Mon Sep 17 00:00:00 2001 From: Aneesh Pappu Date: Tue, 30 Apr 2019 13:53:04 -0700 Subject: [PATCH 032/144] small fix to remove shifting of lm labels during pre process of roc stories, as this shifting happens interanlly in the model --- examples/run_openai_gpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index cb5aa8d9cbdd3b..e9183a79ae109e 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -83,8 +83,8 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d input_ids[i, 1, :len(with_cont2)] = with_cont2 mc_token_ids[i, 0] = len(with_cont1) - 1 mc_token_ids[i, 1] = len(with_cont2) - 1 - lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:] - lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:] + lm_labels[i, 0, :len(with_cont1)] = with_cont1 + lm_labels[i, 1, :len(with_cont2)] = with_cont2 mc_labels[i] = mc_label all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels) tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs)) From 74f7906db460d81e4807249aaf73290db2b7d43c Mon Sep 17 00:00:00 2001 From: Ben Mann <8enmann@gmail.com> Date: Tue, 30 Apr 2019 19:48:22 -0700 Subject: [PATCH 033/144] Fix #537 --- pytorch_pretrained_bert/tokenization_gpt2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 07777292a3fa50..8ffd7a68e21cfa 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -221,7 +221,10 @@ def tokenize(self, text): """ Tokenize a string. 
""" bpe_tokens = [] for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[ord(b)] for b in token) + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens From db98a4a48b79bd11e24acd4b9ea2fdf17075f9e5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 1 May 2019 11:40:48 +0200 Subject: [PATCH 034/144] gpt-2 tokenizer --- pytorch_pretrained_bert/tokenization_gpt2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 07777292a3fa50..8ffd7a68e21cfa 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -221,7 +221,10 @@ def tokenize(self, text): """ Tokenize a string. """ bpe_tokens = [] for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[ord(b)] for b in token) + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens From 74dbba64bc9a44cd6757413b720d0ff4ca33137a Mon Sep 17 00:00:00 2001 From: MottoX Date: Thu, 2 May 2019 19:09:29 +0800 Subject: [PATCH 035/144] Prepare optimizer only when args.do_train is True --- .../lm_finetuning/simple_lm_finetuning.py | 57 ++++++++-------- examples/run_classifier.py | 55 +++++++-------- examples/run_openai_gpt.py | 27 ++++---- examples/run_squad.py | 67 ++++++++++--------- examples/run_swag.py | 65 +++++++++--------- 5 files changed, 138 insertions(+), 133 deletions(-) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 6511ead5902738..969321e91f6de9 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -534,36 +534,37 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, 
+ max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 89ab96f50a49ec..bfd02ba1428337 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -763,35 +763,36 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + 
optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index e9183a79ae109e..f0a14f7e87ca0d 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -183,19 +183,20 @@ def tokenize_and_encode(obj): eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size - optimizer = OpenAIAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - max_grad_norm=args.max_grad_norm, - weight_decay=args.weight_decay, - t_total=num_train_optimization_steps) + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size + optimizer = OpenAIAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + max_grad_norm=args.max_grad_norm, + weight_decay=args.weight_decay, + t_total=num_train_optimization_steps) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None diff --git a/examples/run_squad.py b/examples/run_squad.py index c3fdf03774fcd9..69e419714fd2b5 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -922,40 +922,41 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + if args.do_train: + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + 
optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: diff --git a/examples/run_swag.py b/examples/run_swag.py index 89f4bdf868af64..23a173a4c994ec 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -385,39 +385,40 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + if args.do_train: + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + 
lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: From 18c8aef9d3e5c8247b36b786a20707a17284fa28 Mon Sep 17 00:00:00 2001 From: MottoX Date: Thu, 2 May 2019 19:23:36 +0800 Subject: [PATCH 036/144] Fix documentation typo --- examples/run_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 89ab96f50a49ec..2a53b151aa5ad7 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -271,7 +271,7 @@ def _create_examples(self, lines, set_type): class QqpProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" + """Processor for the QQP data set (GLUE version).""" def get_train_examples(self, data_dir): """See base class.""" @@ -306,7 +306,7 @@ def _create_examples(self, lines, set_type): class QnliProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" + """Processor for the QNLI data set (GLUE version).""" def get_train_examples(self, data_dir): """See base class.""" From e211785ada54a761bd1b0f10e3878539d9d1d72d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 2 May 2019 18:31:26 +0200 Subject: [PATCH 037/144] extract attention weights from GPT --- pytorch_pretrained_bert/modeling_openai.py | 48 +++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 7ac3782b42ee37..77e9cda34947ed 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -253,7 +253,7 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): super(Attention, self).__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] @@ -262,6 +262,7 @@ def __init__(self, nx, n_ctx, config, scale=False): self.n_head = config.n_head self.split_size = n_state self.scale = scale + self.output_attentions = output_attentions self.c_attn = Conv1D(n_state * 3, 1, nx) self.c_proj = Conv1D(n_state, 1, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) @@ -278,6 +279,8 @@ def _attn(self, q, k, v): w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) + if self.output_attentions: + return w, torch.matmul(w, v) return torch.matmul(w, v) def merge_heads(self, x): @@ -300,9 +303,13 @@ def forward(self, x): key = self.split_heads(key, k=True) value = self.split_heads(value) a = self._attn(query, key, value) + if self.output_attentions: + attentions, a = a a = self.merge_heads(a) a = 
self.c_proj(a) a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a return a @@ -322,19 +329,24 @@ def forward(self, x): class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False): + def __init__(self, n_ctx, config, scale=False, output_attentions=False): super(Block, self).__init__() nx = config.n_embd - self.attn = Attention(nx, n_ctx, config, scale) + self.output_attentions = output_attentions + self.attn = Attention(nx, n_ctx, config, scale, output_attentions) self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) def forward(self, x): a = self.attn(x) + if self.output_attentions: + attentions, a = a n = self.ln_1(x + a) m = self.mlp(n) h = self.ln_2(n + m) + if self.output_attentions: + return attentions, h return h @@ -591,12 +603,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(OpenAIGPTModel, self).__init__(config) + self.output_attentions = output_attentions self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) - block = Block(config.n_ctx, config, scale=True) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.apply(self.init_weights) @@ -639,9 +652,16 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None): # Add the position information to the input embeddings # h = e.sum(dim=2) hidden_states = inputs_embeds + position_embeds + token_type_embeds + all_attentions = [] for block in self.h: - hidden_states = block(hidden_states) + if self.output_attentions: + attentions, hidden_states = block(hidden_states) + all_attentions.append(attentions) + else: + hidden_states = block(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) + if self.output_attentions: + return all_attentions, hidden_states.view(*output_shape) return hidden_states.view(*output_shape) @@ -701,9 +721,9 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(OpenAIGPTLMHeadModel, self).__init__(config) - self.transformer = OpenAIGPTModel(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.apply(self.init_weights) @@ -716,6 +736,8 @@ def set_num_special_tokens(self, num_special_tokens): def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None): hidden_states = self.transformer(input_ids, position_ids, token_type_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n @@ -726,6 +748,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits return lm_logits @@ -790,9 +814,9 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): 
super(OpenAIGPTDoubleHeadsModel, self).__init__(config) - self.transformer = OpenAIGPTModel(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config) self.apply(self.init_weights) @@ -806,6 +830,8 @@ def set_num_special_tokens(self, num_special_tokens): def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None): hidden_states = self.transformer(input_ids, position_ids, token_type_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] @@ -819,4 +845,6 @@ def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) if losses: return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits return lm_logits, mc_logits From 101ab4dd8e5a9958c2e0c3642dae35ea3d327ca6 Mon Sep 17 00:00:00 2001 From: huntzhan Date: Mon, 6 May 2019 00:26:21 +0800 Subject: [PATCH 038/144] Make the epsilon of LayerNorm configurable. --- pytorch_pretrained_bert/modeling.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index b9b6837193dce1..d1c4c07c983392 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -145,7 +145,8 @@ def __init__(self, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, - initializer_range=0.02): + initializer_range=0.02, + layer_norm_eps=1e-12): """Constructs BertConfig. Args: @@ -169,6 +170,7 @@ def __init__(self, `BertModel`. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. 
""" if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -188,6 +190,7 @@ def __init__(self, self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps else: raise ValueError("First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)") @@ -254,7 +257,7 @@ def __init__(self, config): # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): @@ -329,7 +332,7 @@ class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -370,7 +373,7 @@ class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -434,7 +437,7 @@ def __init__(self, config): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) From d1b6979aa57b3a214a329fd693a67a1369d65fc9 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 7 May 2019 16:25:53 +0200 Subject: [PATCH 039/144] GPT-2 option to avoid predicting special tokens --- pytorch_pretrained_bert/modeling_gpt2.py | 26 +++++++++++--------- pytorch_pretrained_bert/tokenization_gpt2.py | 4 +-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 37c5a2d9fb5377..1c579de83cd600 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -115,6 +115,7 @@ def __init__( n_head=12, layer_norm_epsilon=1e-5, initializer_range=0.02, + predict_special_tokens=True ): """Constructs GPT2Config. @@ -130,6 +131,7 @@ def __init__( layer_norm_epsilon: epsilon to use in the layer norm layers initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. 
+ predict_special_tokens: should we predict special tokens (when the model has a LM head) """ if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -147,6 +149,7 @@ def __init__( self.n_head = n_head self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens else: raise ValueError( "First argument must be either a vocabulary size (int)" @@ -297,18 +300,20 @@ class GPT2LMHead(nn.Module): def __init__(self, model_embeddings_weights, config): super(GPT2LMHead, self).__init__() self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens embed_shape = model_embeddings_weights.shape self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.set_embeddings_weights(model_embeddings_weights) - def set_embeddings_weights(self, model_embeddings_weights): - embed_shape = model_embeddings_weights.shape + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens self.decoder.weight = model_embeddings_weights # Tied weights def forward(self, hidden_state): - # Truncated Language modeling logits (we remove the last token) - # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] return lm_logits @@ -353,9 +358,6 @@ def __init__(self, config, *inputs, **kwargs): ) self.config = config - def set_num_special_tokens(self, num_special_tokens): - pass - def init_weights(self, module): """ Initialize the weights. """ @@ -650,12 +652,13 @@ def __init__(self, config): self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens): + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrice Make sure we are sharing the embeddings """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.wte.weight) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None): hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) @@ -729,12 +732,13 @@ def __init__(self, config): self.multiple_choice_head = GPT2MultipleChoiceHead(config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens): + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrice Make sure we are sharing the embeddings """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.wte.weight) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, 
position_ids=None, past=None): hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 8ffd7a68e21cfa..c18589b7b0a40a 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -263,8 +263,8 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): def encode(self, text): return self.convert_tokens_to_ids(self.tokenize(text)) - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) + def decode(self, tokens, skip_special_tokens=False): + text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens)) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) return text From ce863365459d5c7b96fd1b5917bc9fb00f509d18 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 7 May 2019 16:47:22 +0200 Subject: [PATCH 040/144] add predict_special_tokens option to GPT also --- pytorch_pretrained_bert/modeling_openai.py | 26 +++++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 77e9cda34947ed..be33eda1c6d4ea 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -143,6 +143,7 @@ def __init__( attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, + predict_special_tokens=True ): """Constructs OpenAIGPTConfig. @@ -165,6 +166,7 @@ def __init__( layer_norm_epsilon: epsilon to use in the layer norm layers initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. 
+ predict_special_tokens: should we predict special tokens (when the model has a LM head) """ if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -186,6 +188,7 @@ def __init__( self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens else: raise ValueError( "First argument must be either a vocabulary size (int)" @@ -356,18 +359,21 @@ class OpenAIGPTLMHead(nn.Module): def __init__(self, model_embeddings_weights, config): super(OpenAIGPTLMHead, self).__init__() self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens embed_shape = model_embeddings_weights.shape self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) self.set_embeddings_weights(model_embeddings_weights) - def set_embeddings_weights(self, model_embeddings_weights): + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens embed_shape = model_embeddings_weights.shape self.decoder.weight = model_embeddings_weights # Tied weights def forward(self, hidden_state): - # Truncated Language modeling logits (we remove the last token) - # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] return lm_logits @@ -428,9 +434,6 @@ def init_weights(self, module): if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() - def set_num_special_tokens(self, num_special_tokens): - pass - @classmethod def from_pretrained( cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs @@ -613,7 +616,6 @@ def __init__(self, config, output_attentions=False): self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.apply(self.init_weights) - # nn.init.normal_(self.embed.weight, std=0.02) def set_num_special_tokens(self, num_special_tokens): " Update input embeddings with new embedding matrice if needed " @@ -727,12 +729,13 @@ def __init__(self, config, output_attentions=False): self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens): + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrice Make sure we are sharing the embeddings """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None): hidden_states = self.transformer(input_ids, position_ids, token_type_ids) @@ -821,12 +824,13 @@ def __init__(self, config, output_attentions=False): self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens): + def set_num_special_tokens(self, num_special_tokens, 
predict_special_tokens=True): """ Update input and output embeddings with new embedding matrice Make sure we are sharing the embeddings """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None): hidden_states = self.transformer(input_ids, position_ids, token_type_ids) From ea9dbea9d5b65ca6333e378ea0a8a288399640c2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 7 May 2019 23:27:18 +0200 Subject: [PATCH 041/144] update GPT2 loss computation for more flexbility --- pytorch_pretrained_bert/modeling_gpt2.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 1c579de83cd600..ca5a38524a17e1 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -336,6 +336,7 @@ def forward(self, hidden_states, mc_token_ids): # (bsz, num_choices, 1, hidden_size) multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) # (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) # (bsz, num_choices) return multiple_choice_logits @@ -665,9 +666,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1].contiguous() - shift_labels = lm_labels[:, 1:].contiguous() - + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), @@ -746,11 +746,10 @@ def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1].contiguous() - shift_labels = lm_labels[:, 1:].contiguous() + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(shift_logits.view(-1, - shift_logits.size(-1)), shift_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) From 0efc4ab632ddf6f726bdf3ce1b9350f8ec183a2f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 8 May 2019 10:41:35 +0200 Subject: [PATCH 042/144] adding dropout to GPT-2 and embedding dropout to GPT --- pytorch_pretrained_bert/modeling_gpt2.py | 22 +++++++++++++++++++++- pytorch_pretrained_bert/modeling_openai.py | 5 ++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index ca5a38524a17e1..7623e4ddadbccf 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ 
b/pytorch_pretrained_bert/modeling_gpt2.py @@ -113,6 +113,9 @@ def __init__( n_embd=768, n_layer=12, n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, predict_special_tokens=True @@ -129,6 +132,11 @@ def __init__( n_head: Number of attention heads for each attention layer in the Transformer encoder. layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. predict_special_tokens: should we predict special tokens (when the model has a LM head) @@ -147,6 +155,9 @@ def __init__( self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.predict_special_tokens = predict_special_tokens @@ -221,6 +232,8 @@ def __init__(self, nx, n_ctx, config, scale=False): self.scale = scale self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) def _attn(self, q, k, v): w = torch.matmul(q, k) @@ -231,6 +244,7 @@ def _attn(self, q, k, v): w = w * b - 1e4 * (1 - b) w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) return torch.matmul(w, v) def merge_heads(self, x): @@ -260,6 +274,7 @@ def forward(self, x, layer_past=None): a = self._attn(query, key, value) a = self.merge_heads(a) a = self.c_proj(a) + a = self.resid_dropout(a) return a, present @@ -270,11 +285,12 @@ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) self.act = gelu + self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) - return h2 + return self.dropout(h2) class Block(nn.Module): @@ -323,6 +339,7 @@ class GPT2MultipleChoiceHead(nn.Module): def __init__(self, config): super(GPT2MultipleChoiceHead, self).__init__() self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation self.linear = nn.Linear(config.n_embd, 1) nn.init.normal_(self.linear.weight, std=0.02) @@ -552,6 +569,7 @@ def __init__(self, config): super(GPT2Model, self).__init__(config) self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) block = Block(config.n_ctx, config, scale=True) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) @@ -594,6 +612,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + presents = [] for block, layer_past in zip(self.h, past): hidden_states, present = block(hidden_states, layer_past) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index be33eda1c6d4ea..769a6b32882bcf 100644 --- 
a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -383,7 +383,6 @@ class OpenAIGPTMultipleChoiceHead(nn.Module): def __init__(self, config): super(OpenAIGPTMultipleChoiceHead, self).__init__() self.n_embd = config.n_embd - # self.multiple_choice_token = multiple_choice_token self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation self.linear = nn.Linear(config.n_embd, 1) @@ -651,9 +650,9 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None): token_type_embeds = self.tokens_embed(token_type_ids) else: token_type_embeds = 0 - # Add the position information to the input embeddings - # h = e.sum(dim=2) hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + all_attentions = [] for block in self.h: if self.output_attentions: From 366a3b02857a1fdae447358cc76bf8abf1bf11eb Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 8 May 2019 21:43:51 +0200 Subject: [PATCH 043/144] clean up in tokenization --- pytorch_pretrained_bert/modeling_gpt2.py | 6 ++++-- pytorch_pretrained_bert/tokenization_gpt2.py | 9 ++++++++- pytorch_pretrained_bert/tokenization_openai.py | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 7623e4ddadbccf..0554442b7f8dba 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -39,8 +39,10 @@ logger = logging.getLogger(__name__) -PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"} -PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"} +PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"} +PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"} def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index c18589b7b0a40a..c66af3ff136cc9 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -37,9 +37,11 @@ def lru_cache(): PRETRAINED_VOCAB_ARCHIVE_MAP = { 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", } PRETRAINED_MERGES_ARCHIVE_MAP = { 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 'gpt2': 1024, @@ -263,9 +265,14 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): def encode(self, text): return self.convert_tokens_to_ids(self.tokenize(text)) - def decode(self, tokens, skip_special_tokens=False): + def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True): text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens)) text = 
bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + if clean_up_tokenization_spaces: + text = text.replace('', '') + text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") return text def save_vocabulary(self, vocab_path): diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index 214a476ce9604a..c68e247e1e1fe2 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -272,7 +272,7 @@ def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=Tr out_string = ''.join(tokens).replace('', ' ').strip() if clean_up_tokenization_spaces: out_string = out_string.replace('', '') - out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ',' + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") return out_string From 275179a0033c9e065f97ea3bd3f0a8d7e0c4a17a Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 8 May 2019 22:24:42 +0200 Subject: [PATCH 044/144] output attentions in GPT-2 --- pytorch_pretrained_bert/modeling_gpt2.py | 60 +++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 0554442b7f8dba..d462fe04ef4d5a 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -223,7 +223,7 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): super(Attention, self).__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] @@ -232,6 +232,7 @@ def __init__(self, nx, n_ctx, config, scale=False): self.n_head = config.n_head self.split_size = n_state self.scale = scale + self.output_attentions = output_attentions self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) @@ -247,6 +248,8 @@ def _attn(self, q, k, v): w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) + if self.output_attentions: + return w, torch.matmul(w, v) return torch.matmul(w, v) def merge_heads(self, x): @@ -274,9 +277,13 @@ def forward(self, x, layer_past=None): value = torch.cat((past_value, value), dim=-2) present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking a = self._attn(query, key, value) + if self.output_attentions: + attentions, a = a a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a, present return a, present @@ -296,19 +303,26 @@ def forward(self, x): class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False): + def __init__(self, n_ctx, config, scale=False, output_attentions=False): super(Block, self).__init__() nx = config.n_embd + self.output_attentions = output_attentions self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) - 
self.attn = Attention(nx, n_ctx, config, scale) + self.attn = Attention(nx, n_ctx, config, scale, output_attentions) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None): - a, present = self.attn(self.ln_1(x), layer_past=layer_past) + output_attn = self.attn(self.ln_1(x), layer_past=layer_past) + if self.output_attentions: + attentions, a, present = output_attn + else: + a, present = output_attn x = x + a m = self.mlp(self.ln_2(x)) x = x + m + if self.output_attentions: + return attentions, x, present return x, present @@ -567,12 +581,13 @@ class GPT2Model(GPT2PreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(GPT2Model, self).__init__(config) + self.output_attentions = output_attentions self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) - block = Block(config.n_ctx, config, scale=True) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) @@ -617,11 +632,18 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): hidden_states = self.drop(hidden_states) presents = [] + all_attentions = [] for block, layer_past in zip(self.h, past): - hidden_states, present = block(hidden_states, layer_past) + if self.output_attentions: + attentions, hidden_states, present = block(hidden_states, layer_past) + all_attentions.append(attentions) + else: + hidden_states, present = block(hidden_states, layer_past) presents.append(present) hidden_states = self.ln_f(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) + if self.output_attentions: + return all_attentions, hidden_states.view(*output_shape), presents return hidden_states.view(*output_shape), presents @@ -669,9 +691,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(GPT2LMHeadModel, self).__init__(config) - self.transformer = GPT2Model(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions) self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.apply(self.init_weights) @@ -684,7 +706,11 @@ def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None): - hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n @@ -695,6 +721,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits, presents return lm_logits, presents @@ -747,9 +775,9 @@ class 
GPT2DoubleHeadsModel(GPT2PreTrainedModel): ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(GPT2DoubleHeadsModel, self).__init__(config) - self.transformer = GPT2Model(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions) self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.multiple_choice_head = GPT2MultipleChoiceHead(config) self.apply(self.init_weights) @@ -763,7 +791,11 @@ def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None): - hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] @@ -777,4 +809,6 @@ def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) if losses: return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits, presents return lm_logits, mc_logits, presents From 5289b4b9e0d96f1544d2bfaf30a59e1ef95acc4d Mon Sep 17 00:00:00 2001 From: burcturkoglu Date: Thu, 9 May 2019 10:51:38 +0300 Subject: [PATCH 045/144] Division to num_train_optimizer of global_step in lr_this_step is removed. 
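
The schedule object in these examples is constructed with t_total=num_train_optimization_steps and its get_lr() normalizes the raw step count by t_total internally, so passing global_step directly is sufficient; dividing by the total a second time would double-normalize the training progress. A minimal sketch of the intended call pattern, with illustrative values, assuming WarmupLinearSchedule is importable from pytorch_pretrained_bert.optimization (the constructor arguments mirror the ones used in these examples; this snippet is not taken verbatim from them):

    from pytorch_pretrained_bert.optimization import WarmupLinearSchedule

    num_train_optimization_steps = 1000
    base_lr = 3e-5
    warmup_linear = WarmupLinearSchedule(warmup=0.1, t_total=num_train_optimization_steps)

    for global_step in range(num_train_optimization_steps):
        # get_lr() expects the raw step and divides by t_total itself,
        # so no extra division by num_train_optimization_steps is needed here.
        lr_this_step = base_lr * warmup_linear.get_lr(global_step)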
--- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- examples/lm_finetuning/simple_lm_finetuning.py | 2 +- examples/run_classifier.py | 2 +- examples/run_squad.py | 2 +- examples/run_swag.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 1638b02a6fa402..400be6cdd2298e 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -315,7 +315,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 6511ead5902738..da01daf4f0169b 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -603,7 +603,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 89ab96f50a49ec..17d8f72c374533 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -854,7 +854,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step diff --git a/examples/run_squad.py b/examples/run_squad.py index c3fdf03774fcd9..c7b93ac1dd0950 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -1015,7 +1015,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step diff --git a/examples/run_swag.py b/examples/run_swag.py index 89f4bdf868af64..eb5c869553668f 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -466,7 +466,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step From 
00c7fd2b79e51865f581ed4b609d14ed0895dc82 Mon Sep 17 00:00:00 2001 From: burcturkoglu Date: Thu, 9 May 2019 10:57:03 +0300 Subject: [PATCH 046/144] Division to num_train_optimizer of global_step in lr_this_step is removed. --- examples/lm_finetuning/finetune_on_pregenerated.py | 3 +-- examples/lm_finetuning/simple_lm_finetuning.py | 3 +-- examples/run_classifier.py | 3 +-- examples/run_squad.py | 3 +-- examples/run_swag.py | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 400be6cdd2298e..cf27ef6cc6e432 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -315,8 +315,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index abb22508793f2b..610912675f441f 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -604,8 +604,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index d70f660526f23f..1ebdf9fd518153 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -855,8 +855,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_squad.py b/examples/run_squad.py index a787f5d0446489..249aff7f8a4e3f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -1016,8 +1016,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_swag.py b/examples/run_swag.py index 962b644cbe5221..5e7ac85c63c3db 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -467,8 +467,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = 
args.learning_rate * warmup_linear.get_lr(global_step, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() From 3bf3f9596fe58b244ceed9e4a8d454e9717b8ea2 Mon Sep 17 00:00:00 2001 From: "samuel.broscheit" Date: Sun, 12 May 2019 00:13:45 +0200 Subject: [PATCH 047/144] Fixing the issues reported in https://github.com/huggingface/pytorch-pretrained-BERT/issues/556 Reason for issue was that optimzation steps where computed from example size, which is different from actual size of dataloader when an example is chunked into multiple instances. Solution in this pull request is to compute num_optimization_steps directly from len(data_loader). --- examples/run_classifier.py | 39 +++++++++++------------ examples/run_openai_gpt.py | 2 +- examples/run_squad.py | 63 +++++++++++++++++++------------------- examples/run_swag.py | 28 ++++++++--------- 4 files changed, 66 insertions(+), 66 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 1ebdf9fd518153..eff48ca97dc9e2 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -25,6 +25,7 @@ import sys import numpy as np +import math import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) @@ -739,8 +740,25 @@ def main(): num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + train_features = convert_examples_to_features( + train_examples, label_list, args.max_seq_length, tokenizer, output_mode) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) + + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + num_train_optimization_steps = len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() @@ -798,27 +816,10 @@ def main(): nb_tr_steps = 0 tr_loss = 0 if args.do_train: - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) - all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f 
in train_features], dtype=torch.long) - - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) - - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - if args.local_rank == -1: - train_sampler = RandomSampler(train_data) - else: - train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index f0a14f7e87ca0d..ac5c4744916a67 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -190,7 +190,7 @@ def tokenize_and_encode(obj): {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] - num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size + num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, diff --git a/examples/run_squad.py b/examples/run_squad.py index 249aff7f8a4e3f..ae163afc91e031 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -899,8 +899,37 @@ def main(): if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( + list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) + train_features = None + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) + except: + train_features = convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=True) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving train features into cached file %s", cached_train_features_file) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_start_positions, all_end_positions) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + 
num_train_optimization_steps = len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() @@ -960,41 +989,11 @@ def main(): global_step = 0 if args.do_train: - cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) - train_features = None - try: - with open(cached_train_features_file, "rb") as reader: - train_features = pickle.load(reader) - except: - train_features = convert_examples_to_features( - examples=train_examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=True) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving train features into cached file %s", cached_train_features_file) - with open(cached_train_features_file, "wb") as writer: - pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) - all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) - all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions) - if args.local_rank == -1: - train_sampler = RandomSampler(train_data) - else: - train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): diff --git a/examples/run_swag.py b/examples/run_swag.py index 5e7ac85c63c3db..bdc256cf149650 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -362,8 +362,20 @@ def main(): num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + train_features = convert_examples_to_features( + train_examples, tokenizer, args.max_seq_length, True) + all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) + all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) + all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, 
batch_size=args.train_batch_size) + + num_train_optimization_steps = len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() @@ -422,22 +434,10 @@ def main(): global_step = 0 if args.do_train: - train_features = convert_examples_to_features( - train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) - all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) - all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) - all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) - all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) - if args.local_rank == -1: - train_sampler = RandomSampler(train_data) - else: - train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): From 49a77ac16ff31b5ea938d8796bf0f4b5428774e6 Mon Sep 17 00:00:00 2001 From: "samuel.broscheit" Date: Sun, 12 May 2019 00:31:10 +0200 Subject: [PATCH 048/144] Clean up a little bit --- examples/run_classifier.py | 51 +++++++++++++++++++------------------- examples/run_squad.py | 46 +++++++++++++++++----------------- examples/run_swag.py | 45 +++++++++++++++++---------------- 3 files changed, 72 insertions(+), 70 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index eff48ca97dc9e2..908559d5774104 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -736,9 +736,28 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None + # Prepare model + cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) + model = BertForSequenceClassification.from_pretrained(args.bert_model, + cache_dir=cache_dir, + num_labels=num_labels) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + if args.do_train: + + # Prepare data loader + train_examples = processor.get_train_examples(args.data_dir) train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) @@ -762,26 +781,8 @@ def main(): if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model - cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) - model = BertForSequenceClassification.from_pretrained(args.bert_model, - cache_dir=cache_dir, - num_labels=num_labels) - if args.fp16: - 
model.half() - model.to(device) - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) + # Prepare optimizer - # Prepare optimizer - if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ @@ -812,10 +813,10 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - nb_tr_steps = 0 - tr_loss = 0 - if args.do_train: + global_step = 0 + nb_tr_steps = 0 + tr_loss = 0 + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) diff --git a/examples/run_squad.py b/examples/run_squad.py index ae163afc91e031..8ce8b60294f3e5 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -894,14 +894,31 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None + # Prepare model + model = BertForQuestionAnswering.from_pretrained(args.bert_model, + cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) + + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + if args.do_train: + + # Prepare data loader + train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) - train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) @@ -933,25 +950,8 @@ def main(): if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model - model = BertForQuestionAnswering.from_pretrained(args.bert_model, - cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) + # Prepare optimizer - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Prepare optimizer - if args.do_train: param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used @@ -987,8 +987,8 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - if args.do_train: + global_step = 0 + logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) diff --git 
a/examples/run_swag.py b/examples/run_swag.py index bdc256cf149650..daae3971f745c0 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -358,9 +358,27 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None + # Prepare model + model = BertForMultipleChoice.from_pretrained(args.bert_model, + cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), + num_choices=4) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + if args.do_train: + + # Prepare data loader + train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) @@ -379,25 +397,8 @@ def main(): if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model - model = BertForMultipleChoice.from_pretrained(args.bert_model, - cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), - num_choices=4) - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + # Prepare optimizer - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Prepare optimizer - if args.do_train: param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used @@ -432,8 +433,8 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - if args.do_train: + global_step = 0 + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) From 94247ad6cb8404307be31e33cc38ca98a274d21e Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 13 May 2019 12:38:22 +0200 Subject: [PATCH 049/144] Make num_train_optimization_steps int --- examples/run_classifier.py | 2 +- examples/run_squad.py | 2 +- examples/run_swag.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 908559d5774104..94099204deef3f 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -777,7 +777,7 @@ def main(): train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - num_train_optimization_steps = len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() diff --git a/examples/run_squad.py b/examples/run_squad.py index 8ce8b60294f3e5..b145303fb087fc 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py 
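Note: PATCH 047 and PATCH 049 both revolve around the same calculation, so a minimal standalone sketch of the pattern the diffs converge on is given here. The dummy tensor shapes, batch size, accumulation steps and epoch count below are illustrative placeholders, not values taken from the patches.

# Sketch: derive the number of optimization steps from the DataLoader rather
# than from the raw example count, since one example can be chunked into
# several feature instances and only the DataLoader reflects the real number
# of batches per epoch.
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

all_input_ids = torch.zeros(1000, 128, dtype=torch.long)   # dummy tokenized features
all_label_ids = torch.zeros(1000, dtype=torch.long)        # dummy labels
train_data = TensorDataset(all_input_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

gradient_accumulation_steps = 2
num_train_epochs = 3
# Integer division (//) keeps the step count an int, which is what PATCH 049 changes.
num_train_optimization_steps = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs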
@@ -946,7 +946,7 @@ def main(): else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - num_train_optimization_steps = len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() diff --git a/examples/run_swag.py b/examples/run_swag.py index daae3971f745c0..73cab428308852 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -393,7 +393,7 @@ def main(): train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - num_train_optimization_steps = len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() From 9e7bc51b95118528bb6c394d02c341cb5401e25d Mon Sep 17 00:00:00 2001 From: tguens <50817608+tguens@users.noreply.github.com> Date: Wed, 22 May 2019 17:27:59 +0800 Subject: [PATCH 050/144] Update run_squad.py Indentation change so that the output "nbest_predictions.json" is not empty. --- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 249aff7f8a4e3f..9f76fb8fbbfcc8 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -617,7 +617,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, all_predictions[example.qas_id] = "" else: all_predictions[example.qas_id] = best_non_null_entry.text - all_nbest_json[example.qas_id] = nbest_json + all_nbest_json[example.qas_id] = nbest_json with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") From c4fe56dcc0ca30d777650ba95167ec72582fdfea Mon Sep 17 00:00:00 2001 From: Ahmad Barqawi Date: Mon, 27 May 2019 11:27:41 +0200 Subject: [PATCH 051/144] support latest multi language bert fine tune fix issue of bert-base-multilingual and add support for uncased multilingual --- examples/lm_finetuning/pregenerate_training_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index e6c3598a9fecac..8a59e0d6a7bc6b 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -235,7 +235,7 @@ def main(): parser.add_argument("--output_dir", type=Path, required=True) parser.add_argument("--bert_model", type=str, required=True, choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", - "bert-base-multilingual", "bert-base-chinese"]) + "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"]) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", From 1eba8b9d96ed90f6bfa1f836a94d19cae753f309 Mon Sep 17 00:00:00 2001 From: Colanim <43774355+Colanim@users.noreply.github.com> Date: Thu, 30 May 2019 14:01:46 +0900 Subject: [PATCH 052/144] Fix link in README --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e8b4f3c6b688d8..b1cb84619de7ef 100644 --- a/README.md +++ b/README.md @@ -1033,7 +1033,7 @@ An overview of the implemented schedules: |-|-| | [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models | | [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py`, `run_squad.py` and `run_lm_finetuning.py` | -| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#Fine-tuning-with-OpenAI-GPT-Transformer-XL-and-GPT-2) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` | +| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#openai-gpt-transformer-xl-and-gpt-2-running-the-examples) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` | | [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`| ### Training large models: introduction, tools and examples From 4cda86b08fd307d6a0d1cffeaf2868e9b46148a9 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 30 May 2019 18:38:00 +0000 Subject: [PATCH 053/144] Update hubconf for torchhub: paths+examples+doc --- hubconf.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/hubconf.py b/hubconf.py index 193c018ee0443f..3f69b285940500 100644 --- a/hubconf.py +++ b/hubconf.py @@ -84,7 +84,7 @@ def bertTokenizer(*args, **kwargs): Example: >>> sentence = 'Hello, World!' - >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) >>> toks = tokenizer.tokenize(sentence) ['Hello', '##,', 'World', '##!'] >>> ids = tokenizer.convert_tokens_to_ids(toks) @@ -100,6 +100,26 @@ def bertModel(*args, **kwargs): BertModel is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large). + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + ['[CLS]', 'Who', 'was', 'Jim', 'He', '##nson', '?', '[SEP]', 'Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer', '[SEP]'] + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + tensor([[101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124, 15703, 1108, 170, 16797, 8284, 102]]) + >>> segments_tensors = torch.tensor([segments_ids]) + tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) + # Load bertModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased', force_reload=False) + >>> model.eval() + # Predict hidden states features for each layer + >>> with torch.no_grad(): + encoded_layers, _ = model(tokens_tensor, segments_tensors) """ model = BertModel.from_pretrained(*args, **kwargs) return model @@ -133,6 +153,29 @@ def bertForMaskedLM(*args, **kwargs): """ BertForMaskedLM includes the BertModel Transformer followed by the (possibly) pre-trained masked language modeling head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> masked_index = 8 + >>> tokenized_text[masked_index] = '[MASK]' + ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]'] + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForMaskedLM + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased', force_reload=False) + >>> model.eval() + # Predict all tokens + >>> with torch.no_grad(): + predictions = model(tokens_tensor, segments_tensors) + >>> predicted_index = torch.argmax(predictions[0, masked_index]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + 'henson' """ model = BertForMaskedLM.from_pretrained(*args, **kwargs) return model From 96592b544bb460085bb5e2522070254849e82350 Mon Sep 17 00:00:00 2001 From: Victor SANH Date: Thu, 30 May 2019 15:53:13 -0400 Subject: [PATCH 054/144] default in __init__s for classification BERT models (#650) --- pytorch_pretrained_bert/modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index d1c4c07c983392..ac6c3374057a60 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -980,7 +980,7 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels): + def __init__(self, config, num_labels=2): super(BertForSequenceClassification, self).__init__(config) self.num_labels = num_labels self.bert = BertModel(config) @@ -1045,7 +1045,7 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_choices): + def __init__(self, config, 
num_choices=2): super(BertForMultipleChoice, self).__init__(config) self.num_choices = num_choices self.bert = BertModel(config) @@ -1115,7 +1115,7 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels): + def __init__(self, config, num_labels=2): super(BertForTokenClassification, self).__init__(config) self.num_labels = num_labels self.bert = BertModel(config) From 372a5c1ceec49b52c503707e9657bfaae7c236a0 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 30 May 2019 16:06:21 -0400 Subject: [PATCH 055/144] Hubconf doc - Specia case loading --- hubconf.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/hubconf.py b/hubconf.py index 3f69b285940500..20ae90410a8f65 100644 --- a/hubconf.py +++ b/hubconf.py @@ -191,6 +191,12 @@ def bertForSequenceClassification(*args, **kwargs): The sequence-level classifier is a linear layer that takes as input the last hidden state of the first character in the input sequence (see Figures 3a and 3b in the BERT paper). + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2, force_reload=True) """ model = BertForSequenceClassification.from_pretrained(*args, **kwargs) return model @@ -201,6 +207,12 @@ def bertForMultipleChoice(*args, **kwargs): """ BertForMultipleChoice is a fine-tuning model that includes BertModel and a linear layer on top of the BertModel. + + Args: + num_choices: the number (>=2) of classes for the classifier. + + Example: + >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2, force_reload=True) """ model = BertForMultipleChoice.from_pretrained(*args, **kwargs) return model @@ -225,6 +237,12 @@ def bertForTokenClassification(*args, **kwargs): The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2, force_reload=True) """ model = BertForTokenClassification.from_pretrained(*args, **kwargs) return model From 0c5a4fe9c9641b8ca47b6a1ff7b994ff18ca98c6 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 00:27:18 -0400 Subject: [PATCH 056/144] modify from_pretrained for OpenAIGPT --- pytorch_pretrained_bert/modeling_openai.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index f956462ddbfff0..8cf4117134aa00 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -419,9 +419,7 @@ def set_num_special_tokens(self, num_special_tokens): pass @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs): """ Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -434,14 +432,20 @@ def from_pretrained( . `openai_gpt_config.json` a configuration file for the model . 
`pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model + . `openai-gpt-config.json` a configuration file for the model . a series of NumPy files containing OpenAI TensorFlow trained weights from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + *inputs, **kwargs: additional input for the specific OpenAI-GPT class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] From 4a210c9fc67661e48a0146a6833381bfd0a4ea07 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 00:28:00 -0400 Subject: [PATCH 057/144] Move bert_hubconf to hubconfs --- hubconfs/bert_hubconf.py | 248 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 hubconfs/bert_hubconf.py diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py new file mode 100644 index 00000000000000..20ae90410a8f65 --- /dev/null +++ b/hubconfs/bert_hubconf.py @@ -0,0 +1,248 @@ +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import ( + BertModel, + BertForNextSentencePrediction, + BertForMaskedLM, + BertForMultipleChoice, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) + +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] + +# A lot of models share the same param doc. Use a decorator +# to save typing +bert_docstring = """ + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining + instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow + checkpoint + cache_dir: an optional path to a folder in which the pre-trained models + will be cached. 
+ state_dict: an optional state dictionnary + (collections.OrderedDict object) to use instead of Google + pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def bertTokenizer(*args, **kwargs): + """ + Instantiate a BertTokenizer from a pre-trained/customized vocab file + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * bert-base-uncased + * bert-large-uncased + * bert-base-cased + * bert-large-cased + * bert-base-multilingual-uncased + * bert-base-multilingual-cased + * bert-base-chinese + Keyword args: + cache_dir: an optional path to a specific directory to download and cache + the pre-trained model weights. + Default: None + do_lower_case: Whether to lower case the input. + Only has an effect when do_wordpiece_only=False + Default: True + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + Default: True + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] + + Example: + >>> sentence = 'Hello, World!' + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> toks = tokenizer.tokenize(sentence) + ['Hello', '##,', 'World', '##!'] + >>> ids = tokenizer.convert_tokens_to_ids(toks) + [8667, 28136, 1291, 28125] + """ + tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(bert_docstring) +def bertModel(*args, **kwargs): + """ + BertModel is the basic BERT Transformer model with a layer of summed token, + position and sequence embeddings followed by a series of identical + self-attention blocks (12 for BERT-base, 24 for BERT-large). + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + ['[CLS]', 'Who', 'was', 'Jim', 'He', '##nson', '?', '[SEP]', 'Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer', '[SEP]'] + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + tensor([[101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124, 15703, 1108, 170, 16797, 8284, 102]]) + >>> segments_tensors = torch.tensor([segments_ids]) + tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) + # Load bertModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased', force_reload=False) + >>> model.eval() + # Predict hidden states features for each layer + >>> with torch.no_grad(): + encoded_layers, _ = model(tokens_tensor, segments_tensors) + """ + model = BertModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForNextSentencePrediction(*args, **kwargs): + """ + BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence + classification head. + """ + model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForPreTraining(*args, **kwargs): + """ + BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads + - the masked language modeling head, and + - the next sentence classification head. + """ + model = BertForPreTraining.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForMaskedLM(*args, **kwargs): + """ + BertForMaskedLM includes the BertModel Transformer followed by the + (possibly) pre-trained masked language modeling head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> masked_index = 8 + >>> tokenized_text[masked_index] = '[MASK]' + ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]'] + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForMaskedLM + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased', force_reload=False) + >>> model.eval() + # Predict all tokens + >>> with torch.no_grad(): + predictions = model(tokens_tensor, segments_tensors) + >>> predicted_index = torch.argmax(predictions[0, masked_index]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + 'henson' + """ + model = BertForMaskedLM.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForSequenceClassification(*args, **kwargs): + """ + BertForSequenceClassification is a fine-tuning model that includes + BertModel and a sequence-level (sequence or pair of sequences) classifier + on top of the BertModel. 
+ + The sequence-level classifier is a linear layer that takes as input the + last hidden state of the first character in the input sequence + (see Figures 3a and 3b in the BERT paper). + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2, force_reload=True) + """ + model = BertForSequenceClassification.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForMultipleChoice(*args, **kwargs): + """ + BertForMultipleChoice is a fine-tuning model that includes BertModel and a + linear layer on top of the BertModel. + + Args: + num_choices: the number (>=2) of classes for the classifier. + + Example: + >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2, force_reload=True) + """ + model = BertForMultipleChoice.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForQuestionAnswering(*args, **kwargs): + """ + BertForQuestionAnswering is a fine-tuning model that includes BertModel + with a token-level classifiers on top of the full sequence of last hidden + states. + """ + model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForTokenClassification(*args, **kwargs): + """ + BertForTokenClassification is a fine-tuning model that includes BertModel + and a token-level classifier on top of the BertModel. + + The token-level classifier is a linear layer that takes as input the last + hidden state of the sequence. + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2, force_reload=True) + """ + model = BertForTokenClassification.from_pretrained(*args, **kwargs) + return model From d0f591051cf504e51c21ec43cfc2e1df907da0d2 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 00:28:10 -0400 Subject: [PATCH 058/144] gpt_hubconf --- hubconf.py | 265 +++------------------------------------- hubconfs/gpt_hubconf.py | 180 +++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 248 deletions(-) create mode 100644 hubconfs/gpt_hubconf.py diff --git a/hubconf.py b/hubconf.py index 20ae90410a8f65..f5e4ae92557499 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,248 +1,17 @@ -from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.modeling import ( - BertModel, - BertForNextSentencePrediction, - BertForMaskedLM, - BertForMultipleChoice, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - ) - -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] - -# A lot of models share the same param doc. Use a decorator -# to save typing -bert_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . 
`pytorch_model.bin` a PyTorch dump of a BertForPreTraining - instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow - checkpoint - cache_dir: an optional path to a folder in which the pre-trained models - will be cached. - state_dict: an optional state dictionnary - (collections.OrderedDict object) to use instead of Google - pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def bertTokenizer(*args, **kwargs): - """ - Instantiate a BertTokenizer from a pre-trained/customized vocab file - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * bert-base-uncased - * bert-large-uncased - * bert-base-cased - * bert-large-cased - * bert-base-multilingual-uncased - * bert-base-multilingual-cased - * bert-base-chinese - Keyword args: - cache_dir: an optional path to a specific directory to download and cache - the pre-trained model weights. - Default: None - do_lower_case: Whether to lower case the input. - Only has an effect when do_wordpiece_only=False - Default: True - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - Default: True - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - Default: None - never_split: List of tokens which will never be split during tokenization. - Only has an effect when do_wordpiece_only=False - Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] - - Example: - >>> sentence = 'Hello, World!' - >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) - >>> toks = tokenizer.tokenize(sentence) - ['Hello', '##,', 'World', '##!'] - >>> ids = tokenizer.convert_tokens_to_ids(toks) - [8667, 28136, 1291, 28125] - """ - tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(bert_docstring) -def bertModel(*args, **kwargs): - """ - BertModel is the basic BERT Transformer model with a layer of summed token, - position and sequence embeddings followed by a series of identical - self-attention blocks (12 for BERT-base, 24 for BERT-large). - - Example: - # Load the tokenizer - >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) - # Prepare tokenized input - >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" - >>> tokenized_text = tokenizer.tokenize(text) - ['[CLS]', 'Who', 'was', 'Jim', 'He', '##nson', '?', '[SEP]', 'Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer', '[SEP]'] - >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - >>> tokens_tensor = torch.tensor([indexed_tokens]) - tensor([[101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124, 15703, 1108, 170, 16797, 8284, 102]]) - >>> segments_tensors = torch.tensor([segments_ids]) - tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) - # Load bertModel - >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased', force_reload=False) - >>> model.eval() - # Predict hidden states features for each layer - >>> with torch.no_grad(): - encoded_layers, _ = model(tokens_tensor, segments_tensors) - """ - model = BertModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForNextSentencePrediction(*args, **kwargs): - """ - BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence - classification head. - """ - model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForPreTraining(*args, **kwargs): - """ - BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads - - the masked language modeling head, and - - the next sentence classification head. - """ - model = BertForPreTraining.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForMaskedLM(*args, **kwargs): - """ - BertForMaskedLM includes the BertModel Transformer followed by the - (possibly) pre-trained masked language modeling head. - - Example: - # Load the tokenizer - >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) - # Prepare tokenized input - >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" - >>> tokenized_text = tokenizer.tokenize(text) - >>> masked_index = 8 - >>> tokenized_text[masked_index] = '[MASK]' - ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]'] - >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> segments_tensors = torch.tensor([segments_ids]) - # Load bertForMaskedLM - >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased', force_reload=False) - >>> model.eval() - # Predict all tokens - >>> with torch.no_grad(): - predictions = model(tokens_tensor, segments_tensors) - >>> predicted_index = torch.argmax(predictions[0, masked_index]).item() - >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] - 'henson' - """ - model = BertForMaskedLM.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForSequenceClassification(*args, **kwargs): - """ - BertForSequenceClassification is a fine-tuning model that includes - BertModel and a sequence-level (sequence or pair of sequences) classifier - on top of the BertModel. 
- - The sequence-level classifier is a linear layer that takes as input the - last hidden state of the first character in the input sequence - (see Figures 3a and 3b in the BERT paper). - - Args: - num_labels: the number (>=2) of classes for the classifier. - - Example: - >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2, force_reload=True) - """ - model = BertForSequenceClassification.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForMultipleChoice(*args, **kwargs): - """ - BertForMultipleChoice is a fine-tuning model that includes BertModel and a - linear layer on top of the BertModel. - - Args: - num_choices: the number (>=2) of classes for the classifier. - - Example: - >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2, force_reload=True) - """ - model = BertForMultipleChoice.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForQuestionAnswering(*args, **kwargs): - """ - BertForQuestionAnswering is a fine-tuning model that includes BertModel - with a token-level classifiers on top of the full sequence of last hidden - states. - """ - model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForTokenClassification(*args, **kwargs): - """ - BertForTokenClassification is a fine-tuning model that includes BertModel - and a token-level classifier on top of the BertModel. - - The token-level classifier is a linear layer that takes as input the last - hidden state of the sequence. - - Args: - num_labels: the number (>=2) of classes for the classifier. - - Example: - >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2, force_reload=True) - """ - model = BertForTokenClassification.from_pretrained(*args, **kwargs) - return model +from hubconfs.bert_hubconf import ( + bertTokenizer, + bertModel, + bertForNextSentencePrediction, + bertForPreTraining, + bertForMaskedLM, + bertForSequenceClassification, + bertForMultipleChoice, + bertForQuestionAnswering, + bertForTokenClassification +) +from hubconfs.gpt_hubconf import ( + OpenAIGPTTokenizer, + OpenAIGPTModel, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel +) \ No newline at end of file diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py new file mode 100644 index 00000000000000..ae8db020615cd4 --- /dev/null +++ b/hubconfs/gpt_hubconf.py @@ -0,0 +1,180 @@ +from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer +from pytorch_pretrained_bert.modeling_openai import ( + OpenAIGPTModel, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel +) + +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'ftfy', 'spacy'] + +# A lot of models share the same param doc. Use a decorator +# to save typing +gpt_docstring = """ + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... 
-> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `openai-gpt` + - a path or url to a pretrained model archive containing: + . `openai_gpt_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance + - a path or url to a pretrained model archive containing: + . `openai-gpt-config.json` a configuration file for the model + . a series of NumPy files containing OpenAI TensorFlow trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) + to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific OpenAI-GPT class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def openAIGPTTokenizer(*args, **kwargs): + """ + Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file. + Peculiarities: + - lower case all inputs + - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + - argument special_tokens and function set_special_tokens: + can be used to add additional symbols (ex: "__classify__") to a vocabulary. + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * openai-gpt + Keyword args: + special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) + Default: None + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + """ + tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTModel(*args, **kwargs): + """ + OpenAIGPTModel is the basic OpenAI GPT Transformer model based on + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? 
Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + + # Load openAIGPTModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + hidden_states = model(tokens_tensor) + """ + model = OpenAIGPTModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTLMHeadModel(*args, **kwargs): + """ + OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + + # Load openAIGPTLMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + predictions = model(tokens_tensor) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions[0, -1, :]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + """ + model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTDoubleHeadsModel(*args, **kwargs): + """ + OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the + tied (pre-trained) language modeling head and a multiple choice + classification head (only initialized, not pre-trained). + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? 
Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) + + # Load openAIGPTDoubleHeadsModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) + """ + model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs) + return model From 19ef2b0a660e97b109e82c51ace5c0cef749c401 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 00:33:33 -0400 Subject: [PATCH 059/144] Fix typo in hubconf --- hubconf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hubconf.py b/hubconf.py index f5e4ae92557499..0561c9a26b280e 100644 --- a/hubconf.py +++ b/hubconf.py @@ -10,8 +10,8 @@ bertForTokenClassification ) from hubconfs.gpt_hubconf import ( - OpenAIGPTTokenizer, - OpenAIGPTModel, - OpenAIGPTLMHeadModel, - OpenAIGPTDoubleHeadsModel + openAIGPTTokenizer, + openAIGPTModel, + openAIGPTLMHeadModel, + openAIGPTDoubleHeadsModel ) \ No newline at end of file From c8bd026ef6a1eb6f431d158e76cbdd8d5938ac39 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 00:36:58 -0400 Subject: [PATCH 060/144] move dependecies list to hubconf --- hubconf.py | 2 ++ hubconfs/bert_hubconf.py | 2 -- hubconfs/gpt_hubconf.py | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/hubconf.py b/hubconf.py index 0561c9a26b280e..2d69da8e796ad0 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,3 +1,5 @@ +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'ftfy', 'spacy'] + from hubconfs.bert_hubconf import ( bertTokenizer, bertModel, diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 20ae90410a8f65..67397aeec8b68f 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -10,8 +10,6 @@ BertForTokenClassification, ) -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] - # A lot of models share the same param doc. Use a decorator # to save typing bert_docstring = """ diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py index ae8db020615cd4..8cf64b0c02e9da 100644 --- a/hubconfs/gpt_hubconf.py +++ b/hubconfs/gpt_hubconf.py @@ -5,8 +5,6 @@ OpenAIGPTDoubleHeadsModel ) -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'ftfy', 'spacy'] - # A lot of models share the same param doc. 
Use a decorator # to save typing gpt_docstring = """ From 98f5c7864f9796dc5baf44cf6973dbb3e6836261 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 01:00:29 -0400 Subject: [PATCH 061/144] decorelate dependencies + fix bug --- hubconf.py | 2 +- hubconfs/gpt_hubconf.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hubconf.py b/hubconf.py index 2d69da8e796ad0..ba09cbab3ca01d 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,4 @@ -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'ftfy', 'spacy'] +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] from hubconfs.bert_hubconf import ( bertTokenizer, diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py index 8cf64b0c02e9da..763cc593e26976 100644 --- a/hubconfs/gpt_hubconf.py +++ b/hubconfs/gpt_hubconf.py @@ -5,6 +5,9 @@ OpenAIGPTDoubleHeadsModel ) +# Dependecies that are not specified in global hubconf.py +specific_dependencies = ['spacy', 'ftfy'] + # A lot of models share the same param doc. Use a decorator # to save typing gpt_docstring = """ @@ -55,7 +58,7 @@ def openAIGPTTokenizer(*args, **kwargs): Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file. Peculiarities: - lower case all inputs - - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. - argument special_tokens and function set_special_tokens: can be used to add additional symbols (ex: "__classify__") to a vocabulary. @@ -79,6 +82,7 @@ def openAIGPTTokenizer(*args, **kwargs): >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" >>> tokenized_text = tokenizer.tokenize(text) >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483] """ tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs) return tokenizer @@ -143,7 +147,7 @@ def openAIGPTLMHeadModel(*args, **kwargs): >>> predicted_index = torch.argmax(predictions[0, -1, :]).item() >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] """ - model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs) + model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) return model From 45d21502f0b67eb8a5ad244d469dcc0dfb7517a7 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 31 May 2019 01:04:16 -0400 Subject: [PATCH 062/144] update doc --- hubconfs/gpt_hubconf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py index 763cc593e26976..77162dc244f986 100644 --- a/hubconfs/gpt_hubconf.py +++ b/hubconfs/gpt_hubconf.py @@ -146,6 +146,7 @@ def openAIGPTLMHeadModel(*args, **kwargs): # Get the predicted last token >>> predicted_index = torch.argmax(predictions[0, -1, :]).item() >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + '.' 
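# A greedy-decoding sketch (illustrative; it assumes the `model`, `tokenizer` and
# `tokens_tensor` from the openAIGPTLMHeadModel example above and simply repeats the
# single-token prediction to extend the input by a few tokens):
>>> generated = tokens_tensor
>>> with torch.no_grad():
        for _ in range(5):
            predictions = model(generated)                       # lm logits, shape (1, seq_len, vocab)
            next_id = torch.argmax(predictions[0, -1, :]).item()  # pick the most likely next token
            generated = torch.cat([generated, torch.tensor([[next_id]])], dim=1)
>>> tokenizer.convert_ids_to_tokens(generated[0].tolist())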
""" model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) return model From a92b6dc3c1bf6e39c37ac3659a184b342c6b18a9 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 15:27:43 -0400 Subject: [PATCH 063/144] add GPT2 torchhub compatibility --- hubconfs/gpt2_hubconf.py | 165 +++++++++++++++++++ pytorch_pretrained_bert/modeling_gpt2.py | 13 +- pytorch_pretrained_bert/tokenization_gpt2.py | 2 +- 3 files changed, 175 insertions(+), 5 deletions(-) create mode 100644 hubconfs/gpt2_hubconf.py diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py new file mode 100644 index 00000000000000..29f85530b5f159 --- /dev/null +++ b/hubconfs/gpt2_hubconf.py @@ -0,0 +1,165 @@ +from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer +from pytorch_pretrained_bert.modeling_openai import ( + GPT2Model, + GPT2LMHeadModel, + GPT2DoubleHeadsModel +) + +# A lot of models share the same param doc. Use a decorator +# to save typing +gpt2_docstring = """ + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `gpt2` + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . a TensorFlow checkpoint with trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific GPT-2 class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def gpt2Tokenizer(*args, **kwargs): + """ + Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file. + Peculiarities: + - Byte-level BPE + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * openai-gpt + Keyword args: + special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) + Default: None + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + >>> text = "Who was Jim Henson ?" + >>> indexed_tokens = tokenizer.encode(tokenized_text) + """ + tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(gpt2_docstring) +def gpt2Model(*args, **kwargs): + """ + gpt2Model is the basic OpenAI GPT-2 Transformer model based on + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" 
+ >>> text_2 = "Jim Henson was a puppeteer" + >>> indexed_tokens_1 = tokenizer.encode(text_1) + >>> indexed_tokens_2 = tokenizer.encode(text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load gpt2Model + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2') + >>> model.eval() + + # Predict hidden states features for each layer + # past can be used to reuse precomputed hidden state in a subsequent predictions + >>> with torch.no_grad(): + hidden_states_1, past = model(tokens_tensor_1) + hidden_states_2, past = model(tokens_tensor_2, past=past) + + """ + model = GPT2Model.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt2_docstring) +def gpt2LMHeadModel(*args, **kwargs): + """ + gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" + >>> text_2 = "Jim Henson was a puppeteer" + >>> indexed_tokens_1 = tokenizer.encode(text_1) + >>> indexed_tokens_2 = tokenizer.encode(text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load gpt2LMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2') + >>> model.eval() + + # Predict hidden states features for each layer + # past can be used to reuse precomputed hidden state in a subsequent predictions + >>> with torch.no_grad(): + predictions_1, past = model(tokens_tensor_1) + predictions_2, past = model(tokens_tensor_2, past=past) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() + >>> predicted_token = tokenizer.decode([predicted_index]) + >>> assert predicted_token == ' who' + """ + model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt2_docstring) +def gpt2DoubleHeadsModel(*args, **kwargs): + """ + gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the + tied (pre-trained) language modeling head and a multiple choice + classification head (only initialized, not pre-trained). + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ?" 
+ >>> indexed_tokens = tokenizer.encode(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) + + # Load openAIGPTDoubleHeadsModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids) + """ + model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs) + return model diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 063c525d98cd91..4939ff7a2c7c07 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -362,9 +362,7 @@ def init_weights(self, module): module.bias.data.zero_() @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -382,8 +380,15 @@ def from_pretrained( from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific GPT class + *inputs, **kwargs: additional input for the specific GPT2 class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 8ffd7a68e21cfa..48e2ae175f9514 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -91,7 +91,7 @@ class GPT2Tokenizer(object): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ - Instantiate a PreTrainedBertModel from a pre-trained model file. + Instantiate a GPT2Tokenizer from a pre-trained model file. Download and cache the pre-trained model file if needed. 
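# A short round-trip sketch (illustrative; mirrors the gpt2Tokenizer hub examples
# above — the byte-level BPE encodes raw text directly and decode() restores it):
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> ids = tokenizer.encode("Who was Jim Henson ?")
>>> tokenizer.decode(ids)   # gives back "Who was Jim Henson ?"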
""" if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: From 2576a5c6db0b2b97b79c1e649a2546d9ca6182bc Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 15:28:01 -0400 Subject: [PATCH 064/144] update hubconf for gpt2 torchhub compatibility --- hubconf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hubconf.py b/hubconf.py index ba09cbab3ca01d..d643e571999373 100644 --- a/hubconf.py +++ b/hubconf.py @@ -16,4 +16,10 @@ openAIGPTModel, openAIGPTLMHeadModel, openAIGPTDoubleHeadsModel -) \ No newline at end of file +) +from hubconfs.gpt2_hubconf import ( + gpt2Tokenizer, + gpt2Model, + gpt2LMHeadModel, + gpt2DoubleHeadsModel +) From 48a58646e8fd24bd9503790386a13ff747a5acdd Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 16:06:50 -0400 Subject: [PATCH 065/144] small fix in doc --- hubconfs/gpt2_hubconf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py index 29f85530b5f159..0e802407f2686a 100644 --- a/hubconfs/gpt2_hubconf.py +++ b/hubconfs/gpt2_hubconf.py @@ -91,7 +91,6 @@ def gpt2Model(*args, **kwargs): >>> with torch.no_grad(): hidden_states_1, past = model(tokens_tensor_1) hidden_states_2, past = model(tokens_tensor_2, past=past) - """ model = GPT2Model.from_pretrained(*args, **kwargs) return model From c0c7ff57519144469a5d3981f931e8a710b21892 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 16:08:24 -0400 Subject: [PATCH 066/144] add transformer xl compatibility for torchhub --- hubconfs/transformer_xl_hubconf.py | 132 ++++++++++++++++++ .../modeling_transfo_xl.py | 17 ++- 2 files changed, 143 insertions(+), 6 deletions(-) create mode 100644 hubconfs/transformer_xl_hubconf.py diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py new file mode 100644 index 00000000000000..0bf77105534d59 --- /dev/null +++ b/hubconfs/transformer_xl_hubconf.py @@ -0,0 +1,132 @@ +from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer +from pytorch_pretrained_bert.modeling_transfo_xl import ( + TransfoXLModel, + TransfoXLLMHeadModel +) + +# A lot of models share the same param doc. Use a decorator +# to save typing +transformer_xl_docstring = """ + Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: + - you don't need to specify positioning embeddings indices + - the tokens in the vocabulary have to be sorted to decreasing frequency. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `transfo-xl-wt103` + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
+ state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific TransformerXL class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def transformerXLTokenizer(*args, **kwargs): + """ + Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * transfo-xl-wt103 + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') + + >>> text = "Who was Jim Henson ?" + >>> tokenized_text = tokenizer.tokenize(tokenized_text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + """ + tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(transformer_xl_docstring) +def transformerXLModel(*args, **kwargs): + """ + gpt2Model is the basic OpenAI GPT-2 Transformer model based on + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" + >>> text_2 = "Jim Henson was a puppeteer" + >>> tokenized_text_1 = tokenizer.tokenize(text_1) + >>> tokenized_text_2 = tokenizer.tokenize(text_2) + >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) + >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load transformerXLModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103') + >>> model.eval() + + # Predict hidden states features for each layer + # We can re-use the memory cells in a subsequent call to attend a longer context + >>> with torch.no_grad(): + hidden_states_1, mems_1 = model(tokens_tensor_1) + hidden_states_2, past = model(tokens_tensor_2, past=past) + """ + model = TransfoXLModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(transformer_xl_docstring) +def transformerXLLMHeadModel(*args, **kwargs): + """ + gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') + + # Prepare tokenized input + >>> text_1 = "Who was Jim Henson ?" 
+ >>> text_2 = "Jim Henson was a puppeteer" + >>> tokenized_text_1 = tokenizer.tokenize(text_1) + >>> tokenized_text_2 = tokenizer.tokenize(text_2) + >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) + >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + + # Load transformerXLLMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103') + >>> model.eval() + + # Predict hidden states features for each layer + # We can re-use the memory cells in a subsequent call to attend a longer context + >>> with torch.no_grad(): + predictions_1, mems_1 = model(tokens_tensor_1) + predictions_2, past = model(tokens_tensor_2, past=past) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + >>> assert predicted_token == 'who' + """ + model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs) + return model diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index e8fffc5b60894b..e70a29af57b6b5 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -888,8 +888,7 @@ def set_num_special_tokens(self, num_special_tokens): pass @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, - from_tf=False, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -897,19 +896,25 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: - . `transfo-xl` + . `transfo-xl-wt103` - a path or url to a pretrained model archive containing: . `transfo_xl_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model + . `transfo_xl_config.json` a configuration file for the model . `model.chkpt` a TensorFlow checkpoint from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + *inputs, **kwargs: additional input for the specific TransformerXL class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] From f836130bffc15cfb4446a94d1312f91f5b486d77 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 16:08:29 -0400 Subject: [PATCH 067/144] update hubconf --- hubconf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hubconf.py b/hubconf.py index d643e571999373..f8336207802f3b 100644 --- a/hubconf.py +++ b/hubconf.py @@ -23,3 +23,8 @@ gpt2LMHeadModel, gpt2DoubleHeadsModel ) +from hubconfs.transformer_xl_hubconf import ( + transformerXLTokenizer, + transformerXLModel, + transformerXLLMHeadModel +) From 592d1e3aae81e6a085a6bf16581fefde9053eaed Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 16:19:32 -0400 Subject: [PATCH 068/144] fix typos --- hubconfs/gpt2_hubconf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py index 0e802407f2686a..45bdd4ebdcefc1 100644 --- a/hubconfs/gpt2_hubconf.py +++ b/hubconfs/gpt2_hubconf.py @@ -1,5 +1,5 @@ from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer -from pytorch_pretrained_bert.modeling_openai import ( +from pytorch_pretrained_bert.modeling_gpt2 import ( GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel @@ -152,7 +152,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs): >>> tokens_tensor = torch.tensor([indexed_tokens]) >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) - # Load openAIGPTDoubleHeadsModel + # Load gpt2DoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') >>> model.eval() From c198ff5f1fc472fb1341f877bc1f5fb1c020c494 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 16:28:42 -0400 Subject: [PATCH 069/144] fix typos/bugs --- hubconfs/gpt2_hubconf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py index 45bdd4ebdcefc1..8af60676b6ed09 100644 --- a/hubconfs/gpt2_hubconf.py +++ b/hubconfs/gpt2_hubconf.py @@ -130,7 +130,7 @@ def gpt2LMHeadModel(*args, **kwargs): >>> predicted_token = tokenizer.decode([predicted_index]) >>> assert predicted_token == ' who' """ - model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) + model = GPT2LMHeadModel.from_pretrained(*args, **kwargs) return model @@ -148,9 +148,9 @@ def gpt2DoubleHeadsModel(*args, **kwargs): # Prepare tokenized input >>> text = "Who was Jim Henson ?" 
- >>> indexed_tokens = tokenizer.encode(tokenized_text) + >>> indexed_tokens = tokenizer.encode(text) >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) + >>> mc_token_ids = torch.LongTensor([ [len(indexed_tokens)] ]) # Load gpt2DoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') From 466a96543a46fb5328667415b7170e57611867c2 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 17:28:56 -0400 Subject: [PATCH 070/144] fix bug/typos --- hubconfs/transformer_xl_hubconf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py index 0bf77105534d59..68ffb19fb5b3f9 100644 --- a/hubconfs/transformer_xl_hubconf.py +++ b/hubconfs/transformer_xl_hubconf.py @@ -86,7 +86,7 @@ def transformerXLModel(*args, **kwargs): # We can re-use the memory cells in a subsequent call to attend a longer context >>> with torch.no_grad(): hidden_states_1, mems_1 = model(tokens_tensor_1) - hidden_states_2, past = model(tokens_tensor_2, past=past) + hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) """ model = TransfoXLModel.from_pretrained(*args, **kwargs) return model @@ -121,7 +121,7 @@ def transformerXLLMHeadModel(*args, **kwargs): # We can re-use the memory cells in a subsequent call to attend a longer context >>> with torch.no_grad(): predictions_1, mems_1 = model(tokens_tensor_1) - predictions_2, past = model(tokens_tensor_2, past=past) + predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1) # Get the predicted last token >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() From 8f97f6c57f73d94311cdf405b076fbb7bdb54182 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 17:29:07 -0400 Subject: [PATCH 071/144] fix typo cc @thomwolf --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- pytorch_pretrained_bert/modeling_openai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 4939ff7a2c7c07..366f1b9ce77732 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -680,7 +680,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): config = modeling_gpt2.GPT2Config() - model = modeling_gpt2.GPT2LMHeadModel(config) + model = modeling_gpt2.GPT2DoubleHeadsModel(config) lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids) ``` """ diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 8cf4117134aa00..30e16c27d494ca 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -789,7 +789,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): config = modeling_openai.OpenAIGPTConfig() - model = modeling_openai.OpenAIGPTLMHeadModel(config) + model = modeling_openai.OpenAIGPTDoubleHeadsModel(config) lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids) ``` """ From cdf0f2fec39d4efa35cdee7df02a08c46f8e134a Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 17:42:00 -0400 Subject: [PATCH 072/144] fix typo/presentation --- hubconfs/gpt2_hubconf.py | 106 ++++++++++++++--------------- hubconfs/transformer_xl_hubconf.py | 88 ++++++++++++------------ 2 files changed, 97 insertions(+), 97 deletions(-) diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py 
index 8af60676b6ed09..26b53e8b0333f4 100644 --- a/hubconfs/gpt2_hubconf.py +++ b/hubconfs/gpt2_hubconf.py @@ -1,27 +1,27 @@ from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer from pytorch_pretrained_bert.modeling_gpt2 import ( - GPT2Model, - GPT2LMHeadModel, - GPT2DoubleHeadsModel + GPT2Model, + GPT2LMHeadModel, + GPT2DoubleHeadsModel ) # A lot of models share the same param doc. Use a decorator # to save typing gpt2_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `gpt2` - - a path or url to a pretrained model archive containing: - . `gpt2_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance - - a path or url to a pretrained model archive containing: - . `gpt2_config.json` a configuration file for the model - . a TensorFlow checkpoint with trained weights - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific GPT-2 class + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `gpt2` + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . a TensorFlow checkpoint with trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific GPT-2 class """ @@ -35,27 +35,27 @@ def docstring_decorator(fn): def gpt2Tokenizer(*args, **kwargs): """ Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file. - Peculiarities: + Peculiarities: - Byte-level BPE Args: pretrained_model_name_or_path: Path to pretrained model archive or one of pre-trained vocab configs below. - * openai-gpt + * gpt2 Keyword args: - special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) - Default: None - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this + special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) + Default: None + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this value (if specified) and the underlying BERT model's sequence length. - Default: None + Default: None Example: - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') - - >>> text = "Who was Jim Henson ?" + + >>> text = "Who was Jim Henson ?" 
>>> indexed_tokens = tokenizer.encode(tokenized_text) """ tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs) @@ -66,31 +66,31 @@ def gpt2Tokenizer(*args, **kwargs): def gpt2Model(*args, **kwargs): """ gpt2Model is the basic OpenAI GPT-2 Transformer model based on - identical stacked masked self-attention blocks and pre-trained - on large scale dataset using language modeling signal. + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') # Prepare tokenized input >>> text_1 = "Who was Jim Henson ?" - >>> text_2 = "Jim Henson was a puppeteer" + >>> text_2 = "Jim Henson was a puppeteer" >>> indexed_tokens_1 = tokenizer.encode(text_1) >>> indexed_tokens_2 = tokenizer.encode(text_2) - >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) # Load gpt2Model >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2') >>> model.eval() # Predict hidden states features for each layer - # past can be used to reuse precomputed hidden state in a subsequent predictions + # past can be used to reuse precomputed hidden state in a subsequent predictions >>> with torch.no_grad(): hidden_states_1, past = model(tokens_tensor_1) - hidden_states_2, past = model(tokens_tensor_2, past=past) + hidden_states_2, past = model(tokens_tensor_2, past=past) """ model = GPT2Model.from_pretrained(*args, **kwargs) return model @@ -100,34 +100,34 @@ def gpt2Model(*args, **kwargs): def gpt2LMHeadModel(*args, **kwargs): """ gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the - tied (pre-trained) language modeling head on top. + tied (pre-trained) language modeling head on top. - Example: + Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') # Prepare tokenized input >>> text_1 = "Who was Jim Henson ?" 
- >>> text_2 = "Jim Henson was a puppeteer" + >>> text_2 = "Jim Henson was a puppeteer" >>> indexed_tokens_1 = tokenizer.encode(text_1) >>> indexed_tokens_2 = tokenizer.encode(text_2) - >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) # Load gpt2LMHeadModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2') >>> model.eval() # Predict hidden states features for each layer - # past can be used to reuse precomputed hidden state in a subsequent predictions + # past can be used to reuse precomputed hidden state in a subsequent predictions >>> with torch.no_grad(): - predictions_1, past = model(tokens_tensor_1) - predictions_2, past = model(tokens_tensor_2, past=past) + predictions_1, past = model(tokens_tensor_1) + predictions_2, past = model(tokens_tensor_2, past=past) - # Get the predicted last token - >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() - >>> predicted_token = tokenizer.decode([predicted_index]) + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() + >>> predicted_token = tokenizer.decode([predicted_index]) >>> assert predicted_token == ' who' """ model = GPT2LMHeadModel.from_pretrained(*args, **kwargs) @@ -138,19 +138,19 @@ def gpt2LMHeadModel(*args, **kwargs): def gpt2DoubleHeadsModel(*args, **kwargs): """ gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the - tied (pre-trained) language modeling head and a multiple choice - classification head (only initialized, not pre-trained). + tied (pre-trained) language modeling head and a multiple choice + classification head (only initialized, not pre-trained). - Example: + Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') # Prepare tokenized input - >>> text = "Who was Jim Henson ?" + >>> text = "Who was Jim Henson ?" >>> indexed_tokens = tokenizer.encode(text) >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> mc_token_ids = torch.LongTensor([ [len(indexed_tokens)] ]) + >>> mc_token_ids = torch.LongTensor([ [len(indexed_tokens)] ]) # Load gpt2DoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py index 68ffb19fb5b3f9..a4fae31493180a 100644 --- a/hubconfs/transformer_xl_hubconf.py +++ b/hubconfs/transformer_xl_hubconf.py @@ -1,7 +1,7 @@ from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer from pytorch_pretrained_bert.modeling_transfo_xl import ( - TransfoXLModel, - TransfoXLLMHeadModel + TransfoXLModel, + TransfoXLLMHeadModel ) # A lot of models share the same param doc. Use a decorator @@ -11,20 +11,20 @@ - you don't need to specify positioning embeddings indices - the tokens in the vocabulary have to be sorted to decreasing frequency. - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `transfo-xl-wt103` - - a path or url to a pretrained model archive containing: - . `transfo_xl_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance - - a path or url to a pretrained model archive containing: - . 
`transfo_xl_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific TransformerXL class + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `transfo-xl-wt103` + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific TransformerXL class """ @@ -45,12 +45,12 @@ def transformerXLTokenizer(*args, **kwargs): * transfo-xl-wt103 Example: - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') - - >>> text = "Who was Jim Henson ?" + + >>> text = "Who was Jim Henson ?" >>> tokenized_text = tokenizer.tokenize(tokenized_text) - >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) """ tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs) return tokenizer @@ -60,33 +60,33 @@ def transformerXLTokenizer(*args, **kwargs): def transformerXLModel(*args, **kwargs): """ gpt2Model is the basic OpenAI GPT-2 Transformer model based on - identical stacked masked self-attention blocks and pre-trained - on large scale dataset using language modeling signal. + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') # Prepare tokenized input >>> text_1 = "Who was Jim Henson ?" 
- >>> text_2 = "Jim Henson was a puppeteer" - >>> tokenized_text_1 = tokenizer.tokenize(text_1) - >>> tokenized_text_2 = tokenizer.tokenize(text_2) + >>> text_2 = "Jim Henson was a puppeteer" + >>> tokenized_text_1 = tokenizer.tokenize(text_1) + >>> tokenized_text_2 = tokenizer.tokenize(text_2) >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) - >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) # Load transformerXLModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103') >>> model.eval() # Predict hidden states features for each layer - # We can re-use the memory cells in a subsequent call to attend a longer context + # We can re-use the memory cells in a subsequent call to attend a longer context >>> with torch.no_grad(): hidden_states_1, mems_1 = model(tokens_tensor_1) - hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) + hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) """ model = TransfoXLModel.from_pretrained(*args, **kwargs) return model @@ -96,37 +96,37 @@ def transformerXLModel(*args, **kwargs): def transformerXLLMHeadModel(*args, **kwargs): """ gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the - tied (pre-trained) language modeling head on top. + tied (pre-trained) language modeling head on top. - Example: + Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103') # Prepare tokenized input >>> text_1 = "Who was Jim Henson ?" 
- >>> text_2 = "Jim Henson was a puppeteer" - >>> tokenized_text_1 = tokenizer.tokenize(text_1) - >>> tokenized_text_2 = tokenizer.tokenize(text_2) + >>> text_2 = "Jim Henson was a puppeteer" + >>> tokenized_text_1 = tokenizer.tokenize(text_1) + >>> tokenized_text_2 = tokenizer.tokenize(text_2) >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) - >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) + >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1]) + >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2]) # Load transformerXLLMHeadModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103') >>> model.eval() # Predict hidden states features for each layer - # We can re-use the memory cells in a subsequent call to attend a longer context + # We can re-use the memory cells in a subsequent call to attend a longer context >>> with torch.no_grad(): predictions_1, mems_1 = model(tokens_tensor_1) - predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1) + predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1) - # Get the predicted last token - >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() - >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] - >>> assert predicted_token == 'who' + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + >>> assert predicted_token == 'who' """ model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs) return model From 312fdd775282fc16ef7e97f2d19ca63cdcae5424 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 1 Jun 2019 17:43:26 -0400 Subject: [PATCH 073/144] fix doc error --- hubconfs/transformer_xl_hubconf.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py index a4fae31493180a..d5c697547e7e6e 100644 --- a/hubconfs/transformer_xl_hubconf.py +++ b/hubconfs/transformer_xl_hubconf.py @@ -59,9 +59,7 @@ def transformerXLTokenizer(*args, **kwargs): @_append_from_pretrained_docstring(transformer_xl_docstring) def transformerXLModel(*args, **kwargs): """ - gpt2Model is the basic OpenAI GPT-2 Transformer model based on - identical stacked masked self-attention blocks and pre-trained - on large scale dataset using language modeling signal. + transformerXLModel is the basic Transformer XL model. Example: # Load the tokenizer @@ -95,7 +93,7 @@ def transformerXLModel(*args, **kwargs): @_append_from_pretrained_docstring(transformer_xl_docstring) def transformerXLLMHeadModel(*args, **kwargs): """ - gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the + transformerXLModel is the basic Transformer XL model with the tied (pre-trained) language modeling head on top. 
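# A segment-streaming sketch (illustrative; `segments` stands in for any list of
# token-id chunks produced with transformerXLTokenizer, and `model` is loaded as in
# the example below — feeding the returned memory back in lets the model attend
# beyond the current chunk):
>>> mems = None
>>> with torch.no_grad():
        for segment in segments:
            segment_tensor = torch.tensor([segment])
            predictions, mems = model(segment_tensor, mems=mems)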
Example: From de5e5682a12463465a9eda4d2b13efad9c50d0dd Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Mon, 3 Jun 2019 17:05:24 -0400 Subject: [PATCH 074/144] add output_attentions for BertModel --- pytorch_pretrained_bert/modeling.py | 34 +++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index b9b6837193dce1..72d8ff519528e2 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -275,7 +275,7 @@ def forward(self, input_ids, token_type_ids=None): class BertSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( @@ -291,6 +291,8 @@ def __init__(self, config): self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.output_attentions = output_attentions + def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) @@ -322,7 +324,10 @@ def forward(self, hidden_states, attention_mask): context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) - return context_layer + if self.output_attentions: + return attention_probs, context_layer + else: + return context_layer class BertSelfOutput(nn.Module): @@ -381,33 +386,43 @@ def forward(self, hidden_states, input_tensor): class BertLayer(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertLayer, self).__init__() self.attention = BertAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) + self.output_attentions = output_attentions def forward(self, hidden_states, attention_mask): attention_output = self.attention(hidden_states, attention_mask) intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) + if self.output_attentions: + return attention_output, layer_output return layer_output class BertEncoder(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertEncoder, self).__init__() - layer = BertLayer(config) + layer = BertLayer(config, output_attentions=output_attentions) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.output_attentions = output_attentions def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): all_encoder_layers = [] + all_attentions = [] for layer_module in self.layer: hidden_states = layer_module(hidden_states, attention_mask) + if self.output_attentions: + attentions, hidden_states = hidden_states + all_attentions.append(attentions) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) if not output_all_encoded_layers: all_encoder_layers.append(hidden_states) + if self.output_attentions: + return all_attentions, all_encoder_layers return all_encoder_layers @@ -699,12 +714,13 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertModel, self).__init__(config) self.embeddings = BertEmbeddings(config) - 
self.encoder = BertEncoder(config) + self.encoder = BertEncoder(config, output_attentions=output_attentions) self.pooler = BertPooler(config) self.apply(self.init_bert_weights) + self.output_attentions = output_attentions def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): if attention_mask is None: @@ -731,10 +747,14 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al encoded_layers = self.encoder(embedding_output, extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers) + if self.output_attentions: + all_attentions, encoded_layers = encoded_layers sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: encoded_layers = encoded_layers[-1] + if self.output_attentions: + return all_attentions, encoded_layers, pooled_output return encoded_layers, pooled_output From 826496580beae08289452da0eda914bdc40a95bb Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Mon, 3 Jun 2019 17:10:25 -0400 Subject: [PATCH 075/144] Revert "add output_attentions for BertModel" This reverts commit de5e5682a12463465a9eda4d2b13efad9c50d0dd. --- pytorch_pretrained_bert/modeling.py | 34 ++++++----------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 72d8ff519528e2..b9b6837193dce1 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -275,7 +275,7 @@ def forward(self, input_ids, token_type_ids=None): class BertSelfAttention(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, config): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( @@ -291,8 +291,6 @@ def __init__(self, config, output_attentions=False): self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.output_attentions = output_attentions - def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) @@ -324,10 +322,7 @@ def forward(self, hidden_states, attention_mask): context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) - if self.output_attentions: - return attention_probs, context_layer - else: - return context_layer + return context_layer class BertSelfOutput(nn.Module): @@ -386,43 +381,33 @@ def forward(self, hidden_states, input_tensor): class BertLayer(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, config): super(BertLayer, self).__init__() self.attention = BertAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) - self.output_attentions = output_attentions def forward(self, hidden_states, attention_mask): attention_output = self.attention(hidden_states, attention_mask) intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - if self.output_attentions: - return attention_output, layer_output return layer_output class BertEncoder(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, config): super(BertEncoder, self).__init__() - layer = BertLayer(config, output_attentions=output_attentions) + layer = BertLayer(config) self.layer 
= nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) - self.output_attentions = output_attentions def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): all_encoder_layers = [] - all_attentions = [] for layer_module in self.layer: hidden_states = layer_module(hidden_states, attention_mask) - if self.output_attentions: - attentions, hidden_states = hidden_states - all_attentions.append(attentions) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) if not output_all_encoded_layers: all_encoder_layers.append(hidden_states) - if self.output_attentions: - return all_attentions, all_encoder_layers return all_encoder_layers @@ -714,13 +699,12 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config): super(BertModel, self).__init__(config) self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config, output_attentions=output_attentions) + self.encoder = BertEncoder(config) self.pooler = BertPooler(config) self.apply(self.init_bert_weights) - self.output_attentions = output_attentions def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): if attention_mask is None: @@ -747,14 +731,10 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al encoded_layers = self.encoder(embedding_output, extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers) - if self.output_attentions: - all_attentions, encoded_layers = encoded_layers sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: encoded_layers = encoded_layers[-1] - if self.output_attentions: - return all_attentions, encoded_layers, pooled_output return encoded_layers, pooled_output From a3274ac40b14025ee857897ecfaff4fb07bcb61d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 3 Jun 2019 16:11:45 -0500 Subject: [PATCH 076/144] adding attention outputs in bert --- pytorch_pretrained_bert/modeling.py | 43 +++++++++++++++++++++++------ tests/modeling_gpt2_test.py | 34 +++++++++++++++++++++++ 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index b9b6837193dce1..27682eb369b845 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -275,12 +275,13 @@ def forward(self, input_ids, token_type_ids=None): class BertSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -322,6 +323,8 @@ def forward(self, hidden_states, attention_mask): context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) + if self.output_attentions: + return attention_probs, 
context_layer return context_layer @@ -340,14 +343,19 @@ def forward(self, hidden_states, input_tensor): class BertAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) + self.output_attentions = output_attentions + self.self = BertSelfAttention(config, output_attentions=output_attentions) self.output = BertSelfOutput(config) def forward(self, input_tensor, attention_mask): self_output = self.self(input_tensor, attention_mask) + if self.output_attentions: + attentions, self_output = self_output attention_output = self.output(self_output, input_tensor) + if self.output_attentions: + return attentions, attention_output return attention_output @@ -381,33 +389,45 @@ def forward(self, hidden_states, input_tensor): class BertLayer(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertLayer, self).__init__() - self.attention = BertAttention(config) + self.output_attentions = output_attentions + self.attention = BertAttention(config, output_attentions=output_attentions) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward(self, hidden_states, attention_mask): attention_output = self.attention(hidden_states, attention_mask) + if self.output_attentions: + attentions, attention_output = attention_output intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) + if self.output_attentions: + return attentions, layer_output return layer_output class BertEncoder(nn.Module): - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertEncoder, self).__init__() - layer = BertLayer(config) + self.output_attentions = output_attentions + layer = BertLayer(config, output_attentions=output_attentions) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): all_encoder_layers = [] + all_attentions = [] for layer_module in self.layer: hidden_states = layer_module(hidden_states, attention_mask) + if self.output_attentions: + attentions, hidden_states = hidden_states + all_attentions.append(attentions) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) if not output_all_encoded_layers: all_encoder_layers.append(hidden_states) + if self.output_attentions: + return all_attentions, all_encoder_layers return all_encoder_layers @@ -699,10 +719,11 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertModel, self).__init__(config) + self.output_attentions = output_attentions self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) + self.encoder = BertEncoder(config, output_attentions=output_attentions) self.pooler = BertPooler(config) self.apply(self.init_bert_weights) @@ -731,10 +752,14 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al encoded_layers = self.encoder(embedding_output, extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers) + if self.output_attentions: + all_attentions, encoded_layers = encoded_layers sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not 
output_all_encoded_layers: encoded_layers = encoded_layers[-1] + if self.output_attentions: + return all_attentions, encoded_layers, pooled_output return encoded_layers, pooled_output diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index 6804b794c50da0..41cc9b8fd3e1cf 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -133,11 +133,28 @@ def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids, } return outputs + def create_gpt2_lm_head_with_output_attention(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = GPT2LMHeadModel(config, output_attentions=True) + model.eval() + loss = model(input_ids, position_ids, token_type_ids, lm_labels) + attentions, lm_logits, presents = model(input_ids, position_ids, token_type_ids) + outputs = { + "loss": loss, + "lm_logits": lm_logits, + "presents": presents, + "attentions": attentions, + } + return outputs + def check_gpt2_lm_head_output(self, result): total_voc = self.n_special + self.vocab_size self.parent.assertListEqual( list(result["lm_logits"].size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertListEqual( + list(result["presents"].size()), + [self.batch_size, self.n_choices, self.seq_length, total_voc]) def check_gpt2_lm_head_loss_output(self, result): self.parent.assertListEqual( @@ -160,6 +177,23 @@ def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_i } return outputs + def create_gpt2_double_heads_with_output_attention(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = GPT2DoubleHeadsModel(config, output_attentions=True) + model.eval() + loss = model(input_ids, mc_token_ids, + lm_labels=lm_labels, mc_labels=mc_labels, + token_type_ids=token_type_ids, position_ids=position_ids) + attentions, lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) + outputs = { + "loss": loss, + "lm_logits": lm_logits, + "mc_logits": mc_logits, + "presents": presents, + "attentions": attentions, + } + return outputs + def check_gpt2_double_heads_output(self, result): total_voc = self.n_special + self.vocab_size self.parent.assertListEqual( From cf44d9839202d4d67cdc66fbb46162904587409f Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 6 Jun 2019 16:36:02 +0200 Subject: [PATCH 077/144] Add more examples to BERT models for torchhub --- hubconfs/bert_hubconf.py | 108 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 8 deletions(-) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 67397aeec8b68f..385c284b65c002 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -82,7 +82,7 @@ def bertTokenizer(*args, **kwargs): Example: >>> sentence = 'Hello, World!' 
- >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) >>> toks = tokenizer.tokenize(sentence) ['Hello', '##,', 'World', '##!'] >>> ids = tokenizer.convert_tokens_to_ids(toks) @@ -101,7 +101,7 @@ def bertModel(*args, **kwargs): Example: # Load the tokenizer - >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" >>> tokenized_text = tokenizer.tokenize(text) @@ -113,7 +113,7 @@ def bertModel(*args, **kwargs): >>> segments_tensors = torch.tensor([segments_ids]) tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) # Load bertModel - >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased', force_reload=False) + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased') >>> model.eval() # Predict hidden states features for each layer >>> with torch.no_grad(): @@ -129,6 +129,23 @@ def bertForNextSentencePrediction(*args, **kwargs): BERT model with next sentence prediction head. This module comprises the BERT model followed by the next sentence classification head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForNextSentencePrediction + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased') + >>> model.eval() + # Predict the next sentence classification logits + >>> with torch.no_grad(): + next_sent_classif_logits = model(tokens_tensor, segments_tensors) """ model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) return model @@ -154,7 +171,7 @@ def bertForMaskedLM(*args, **kwargs): Example: # Load the tokenizer - >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" >>> tokenized_text = tokenizer.tokenize(text) @@ -166,7 +183,7 @@ def bertForMaskedLM(*args, **kwargs): >>> tokens_tensor = torch.tensor([indexed_tokens]) >>> segments_tensors = torch.tensor([segments_ids]) # Load bertForMaskedLM - >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased', force_reload=False) + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased') >>> model.eval() # Predict all tokens >>> with torch.no_grad(): @@ -194,7 +211,25 @@ def bertForSequenceClassification(*args, **kwargs): num_labels: the number (>=2) of classes for the classifier. Example: - >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2, force_reload=True) + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForSequenceClassification + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2) + >>> model.eval() + # Predict the sequence classification logits + >>> with torch.no_grad(): + seq_classif_logits = model(tokens_tensor, segments_tensors) + # Or get the sequence classification loss + >>> labels = torch.tensor([1]) + >>> with torch.no_grad(): + seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) """ model = BertForSequenceClassification.from_pretrained(*args, **kwargs) return model @@ -210,7 +245,25 @@ def bertForMultipleChoice(*args, **kwargs): num_choices: the number (>=2) of classes for the classifier. Example: - >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2, force_reload=True) + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0) + >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0) + # Load bertForMultipleChoice + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2) + >>> model.eval() + # Predict the multiple choice logits + >>> with torch.no_grad(): + multiple_choice_logits = model(tokens_tensor, segments_tensors) + # Or get the multiple choice loss + >>> labels = torch.tensor([1]) + >>> with torch.no_grad(): + multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) """ model = BertForMultipleChoice.from_pretrained(*args, **kwargs) return model @@ -222,6 +275,27 @@ def bertForQuestionAnswering(*args, **kwargs): BertForQuestionAnswering is a fine-tuning model that includes BertModel with a token-level classifiers on top of the full sequence of last hidden states. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForQuestionAnswering + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased') + >>> model.eval() + # Predict the start and end positions logits + >>> with torch.no_grad(): + start_logits, end_logits = model(tokens_tensor, segments_tensors) + # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions + >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) + >>> with torch.no_grad(): + multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) """ model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) return model @@ -240,7 +314,25 @@ def bertForTokenClassification(*args, **kwargs): num_labels: the number (>=2) of classes for the classifier. Example: - >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2, force_reload=True) + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForTokenClassification + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2) + >>> model.eval() + # Predict the token classification logits + >>> with torch.no_grad(): + classif_logits = model(tokens_tensor, segments_tensors) + # Or get the token classification loss + >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]]) + >>> with torch.no_grad(): + classif_loss = model(tokens_tensor, segments_tensors, labels=labels) """ model = BertForTokenClassification.from_pretrained(*args, **kwargs) return model From 2647ac3294d14ffe270db1b235f37f4990430cd6 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 6 Jun 2019 16:57:40 +0200 Subject: [PATCH 078/144] forgot bertForPreTraining --- hubconfs/bert_hubconf.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 385c284b65c002..14e5a172399466 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -158,6 +158,19 @@ def bertForPreTraining(*args, **kwargs): This module comprises the BERT model followed by the two pre-training heads - the masked language modeling head, and - the next sentence classification head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForPreTraining + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased') + >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors) """ model = BertForPreTraining.from_pretrained(*args, **kwargs) return model From 122d5c52acb70c368aa09328e12281760e01ce75 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 6 Jun 2019 17:02:51 +0200 Subject: [PATCH 079/144] distinguish was is not trained --- hubconfs/bert_hubconf.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 14e5a172399466..7cd2a123c0f828 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -214,7 +214,8 @@ def bertForSequenceClassification(*args, **kwargs): """ BertForSequenceClassification is a fine-tuning model that includes BertModel and a sequence-level (sequence or pair of sequences) classifier - on top of the BertModel. + on top of the BertModel. Note that the classification head is only initialized + and has to be trained. 
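    A minimal sketch of actually training that head (the optimizer choice and
    learning rate are illustrative only; `tokens_tensor` and `segments_tensors`
    are prepared as in the Example section of this docstring):
        >>> from pytorch_pretrained_bert import BertAdam
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        >>> model.train()
        >>> optimizer = BertAdam(model.parameters(), lr=2e-5)
        >>> loss = model(tokens_tensor, segments_tensors, labels=torch.tensor([1]))
        >>> loss.backward()
        >>> optimizer.step()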
The sequence-level classifier is a linear layer that takes as input the last hidden state of the first character in the input sequence @@ -252,7 +253,8 @@ def bertForSequenceClassification(*args, **kwargs): def bertForMultipleChoice(*args, **kwargs): """ BertForMultipleChoice is a fine-tuning model that includes BertModel and a - linear layer on top of the BertModel. + linear layer on top of the BertModel. Note that the multiple choice head is + only initialized and has to be trained. Args: num_choices: the number (>=2) of classes for the classifier. @@ -287,7 +289,8 @@ def bertForQuestionAnswering(*args, **kwargs): """ BertForQuestionAnswering is a fine-tuning model that includes BertModel with a token-level classifiers on top of the full sequence of last hidden - states. + states. Note that the classification head is only initialized + and has to be trained. Example: # Load the tokenizer @@ -318,7 +321,8 @@ def bertForQuestionAnswering(*args, **kwargs): def bertForTokenClassification(*args, **kwargs): """ BertForTokenClassification is a fine-tuning model that includes BertModel - and a token-level classifier on top of the BertModel. + and a token-level classifier on top of the BertModel. Note that the classification + head is only initialized and has to be trained. The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. From 6b8d227092302eff4ff6a294034c4c16b81569ba Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 6 Jun 2019 17:07:03 +0200 Subject: [PATCH 080/144] some cleaning --- hubconfs/bert_hubconf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 7cd2a123c0f828..c7bcfbffb6114b 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -105,13 +105,10 @@ def bertModel(*args, **kwargs): # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" >>> tokenized_text = tokenizer.tokenize(text) - ['[CLS]', 'Who', 'was', 'Jim', 'He', '##nson', '?', '[SEP]', 'Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer', '[SEP]'] >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] >>> tokens_tensor = torch.tensor([indexed_tokens]) - tensor([[101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124, 15703, 1108, 170, 16797, 8284, 102]]) >>> segments_tensors = torch.tensor([segments_ids]) - tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) # Load bertModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased') >>> model.eval() @@ -190,7 +187,6 @@ def bertForMaskedLM(*args, **kwargs): >>> tokenized_text = tokenizer.tokenize(text) >>> masked_index = 8 >>> tokenized_text[masked_index] = '[MASK]' - ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]'] >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] >>> tokens_tensor = torch.tensor([indexed_tokens]) From 2d07f945adfd41389b5dd45d85af37d404a09599 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 6 Jun 2019 17:10:24 +0200 Subject: [PATCH 081/144] fix error with torch.no_grad and loss computation --- hubconfs/bert_hubconf.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index c7bcfbffb6114b..a547a33c22c88e 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -238,8 +238,7 @@ def bertForSequenceClassification(*args, **kwargs): seq_classif_logits = model(tokens_tensor, segments_tensors) # Or get the sequence classification loss >>> labels = torch.tensor([1]) - >>> with torch.no_grad(): - seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) + >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) """ model = BertForSequenceClassification.from_pretrained(*args, **kwargs) return model @@ -273,8 +272,7 @@ def bertForMultipleChoice(*args, **kwargs): multiple_choice_logits = model(tokens_tensor, segments_tensors) # Or get the multiple choice loss >>> labels = torch.tensor([1]) - >>> with torch.no_grad(): - multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) """ model = BertForMultipleChoice.from_pretrained(*args, **kwargs) return model @@ -306,8 +304,7 @@ def bertForQuestionAnswering(*args, **kwargs): start_logits, end_logits = model(tokens_tensor, segments_tensors) # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) - >>> with torch.no_grad(): - multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) """ model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) return model @@ -344,8 +341,7 @@ def bertForTokenClassification(*args, **kwargs): classif_logits = model(tokens_tensor, segments_tensors) # Or get the token classification loss >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]]) - >>> 
with torch.no_grad(): - classif_loss = model(tokens_tensor, segments_tensors, labels=labels) + >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) """ model = BertForTokenClassification.from_pretrained(*args, **kwargs) return model From ee0308f79ded65dac82c53dfb03e9ff7f06aeee4 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 6 Jun 2019 17:30:49 +0200 Subject: [PATCH 082/144] fix typo --- hubconfs/bert_hubconf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index a547a33c22c88e..0595bdeccb963e 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -238,7 +238,7 @@ def bertForSequenceClassification(*args, **kwargs): seq_classif_logits = model(tokens_tensor, segments_tensors) # Or get the sequence classification loss >>> labels = torch.tensor([1]) - >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) + >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss """ model = BertForSequenceClassification.from_pretrained(*args, **kwargs) return model @@ -272,7 +272,7 @@ def bertForMultipleChoice(*args, **kwargs): multiple_choice_logits = model(tokens_tensor, segments_tensors) # Or get the multiple choice loss >>> labels = torch.tensor([1]) - >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss """ model = BertForMultipleChoice.from_pretrained(*args, **kwargs) return model @@ -304,6 +304,7 @@ def bertForQuestionAnswering(*args, **kwargs): start_logits, end_logits = model(tokens_tensor, segments_tensors) # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) + # set model.train() before if training this loss >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) """ model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) @@ -341,7 +342,7 @@ def bertForTokenClassification(*args, **kwargs): classif_logits = model(tokens_tensor, segments_tensors) # Or get the token classification loss >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]]) - >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) + >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss """ model = BertForTokenClassification.from_pretrained(*args, **kwargs) return model From a3a604cefbd96d6f23366b6c9c87c3e98889461c Mon Sep 17 00:00:00 2001 From: jeonsworld <37530102+jeonsworld@users.noreply.github.com> Date: Mon, 10 Jun 2019 12:17:23 +0900 Subject: [PATCH 083/144] Update pregenerate_training_data.py apply Whole Word Masking technique. 
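For context, a minimal self-contained sketch of the grouping rule this patch introduces (hypothetical WordPiece tokens; the actual implementation lives in create_masked_lm_predictions in the diff below):

    # "##" continuations are appended to the previous word's index set, so a
    # whole word is either masked in full or left untouched.
    tokens = ["[CLS]", "jim", "henson", "was", "a", "puppet", "##eer", "[SEP]"]
    cand_indices = []
    for i, token in enumerate(tokens):
        if token in ("[CLS]", "[SEP]"):
            continue
        if cand_indices and token.startswith("##"):
            cand_indices[-1].append(i)   # "##eer" joins "puppet"
        else:
            cand_indices.append([i])
    print(cand_indices)  # [[1], [2], [3], [4], [5, 6]]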
referred to [create_pretraining_data.py](https://github.com/google-research/bert/blob/master/create_pretraining_data.py) --- .../pregenerate_training_data.py | 82 +++++++++++++------ 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index e6c3598a9fecac..6cb8954465703a 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -4,11 +4,11 @@ from tempfile import TemporaryDirectory import shelve -from random import random, randrange, randint, shuffle, choice, sample +from random import random, randrange, randint, shuffle, choice from pytorch_pretrained_bert.tokenization import BertTokenizer import numpy as np import json - +import collections class DocumentDatabase: def __init__(self, reduce_memory=False): @@ -98,42 +98,77 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): else: trunc_tokens.pop() +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) -def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): +def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list): """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but with several refactors to clean it up and remove a lot of unnecessary variables.""" cand_indices = [] for (i, token) in enumerate(tokens): if token == "[CLS]" or token == "[SEP]": continue - cand_indices.append(i) + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")): + cand_indices[-1].append(i) + else: + cand_indices.append([i]) num_to_mask = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) shuffle(cand_indices) - mask_indices = sorted(sample(cand_indices, num_to_mask)) - masked_token_labels = [] - for index in mask_indices: - # 80% of the time, replace with [MASK] - if random() < 0.8: - masked_token = "[MASK]" - else: - # 10% of the time, keep original - if random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word + masked_lms = [] + covered_indexes = set() + for index_set in cand_indices: + if len(masked_lms) >= num_to_mask: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_mask: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if random() < 0.8: + masked_token = "[MASK]" else: - masked_token = choice(vocab_list) - masked_token_labels.append(tokens[index]) - # Once we've saved the true label for that token, we can overwrite it with the masked version - tokens[index] = masked_token + # 10% of the time, keep original + if random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = choice(vocab_list) + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + tokens[index] = masked_token + + assert len(masked_lms) <= num_to_mask + masked_lms = sorted(masked_lms, key=lambda x: x.index) + mask_indices = [p.index for p in masked_lms] + masked_token_labels = [p.label for p in masked_lms] return tokens, mask_indices, masked_token_labels def create_instances_from_document( doc_database, doc_idx, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_list): + masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list): """This code is mostly a duplicate of the equivalent function from Google BERT's repo. However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function. Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence @@ -213,7 +248,7 @@ def create_instances_from_document( segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) + tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list) instance = { "tokens": tokens, @@ -237,7 +272,8 @@ def main(): choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-base-multilingual", "bert-base-chinese"]) parser.add_argument("--do_lower_case", action="store_true") - + parser.add_argument("--do_whole_word_mask", action="store_true", + help="Whether to use whole word masking rather than per-WordPiece masking.") parser.add_argument("--reduce_memory", action="store_true", help="Reduce memory usage for large datasets by keeping data on disc rather than in memory") @@ -284,7 +320,7 @@ def main(): doc_instances = create_instances_from_document( docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, - vocab_list=vocab_list) + whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list) doc_instances = [json.dumps(instance) for instance in doc_instances] for instance in doc_instances: epoch_file.write(instance + '\n') From 5c08c8c273f7b63fdf566bb18dd169d8538f8dea Mon Sep 17 00:00:00 2001 From: Oliver Guhr Date: Tue, 11 Jun 2019 13:46:33 +0200 Subject: [PATCH 084/144] adds the tokenizer + model config to the output --- examples/lm_finetuning/finetune_on_pregenerated.py | 11 +++++++++-- examples/lm_finetuning/simple_lm_finetuning.py | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py 
b/examples/lm_finetuning/finetune_on_pregenerated.py index cf27ef6cc6e432..2a5783c26116ac 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -1,5 +1,6 @@ from argparse import ArgumentParser from pathlib import Path +import os import torch import logging import json @@ -12,6 +13,7 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm +from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForPreTraining from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule @@ -325,8 +327,13 @@ def main(): # Save a trained model logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = args.output_dir / "pytorch_model.bin" - torch.save(model_to_save.state_dict(), str(output_model_file)) + + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) if __name__ == '__main__': diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 610912675f441f..368d6825c73c6c 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -29,6 +29,7 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange +from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForPreTraining from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule @@ -614,9 +615,12 @@ def main(): # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) def _truncate_seq_pair(tokens_a, tokens_b, max_length): From e02ce4dc7935582bf614eef0a480d3cb609ca062 Mon Sep 17 00:00:00 2001 From: Meet Pragnesh Shah Date: Tue, 11 Jun 2019 15:13:53 -0700 Subject: [PATCH 085/144] [hotfix] Fix frozen pooler parameters in SWAG example. 
--- examples/run_swag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_swag.py b/examples/run_swag.py index 5e7ac85c63c3db..59bb9866c31ef3 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -390,7 +390,7 @@ def main(): # hack to remove pooler, which is not used # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ From bcc9e93e6f585eec96444218b61b517f3f2f6314 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 14 Jun 2019 15:38:20 +0200 Subject: [PATCH 086/144] fix test --- tests/modeling_gpt2_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index 41cc9b8fd3e1cf..7817b988759bc7 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -152,9 +152,10 @@ def check_gpt2_lm_head_output(self, result): self.parent.assertListEqual( list(result["lm_logits"].size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertEqual(self.n_layer, len(result["presents"])) self.parent.assertListEqual( - list(result["presents"].size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) + list(result["presents"][0].size()), + [2, self.batch_size * self.n_choices, self.n_head, self.seq_length, self.n_embd // self.n_head]) def check_gpt2_lm_head_loss_output(self, result): self.parent.assertListEqual( From 5e1207b8ad00fd649c0f35b9697cd67ce9897505 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 14 Jun 2019 16:28:25 +0200 Subject: [PATCH 087/144] add attention to all bert models and add test --- pytorch_pretrained_bert/modeling.py | 116 +++++++++++++++++++--------- tests/modeling_test.py | 71 ++++++++++++++--- 2 files changed, 140 insertions(+), 47 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 27682eb369b845..bfcbcc9edf8e14 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -813,15 +813,20 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertForPreTraining, self).__init__(config) - self.bert = BertModel(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions) self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None): - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + if self.output_attentions: + all_attentions, sequence_output, pooled_output = outputs + else: + sequence_output, pooled_output = outputs prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) if masked_lm_labels is not None and next_sentence_label is not None: @@ -830,8 +835,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) 
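            # Usage sketch for the output_attentions return values (shapes follow
            # the test added later in this patch; inputs assumed prepared as in
            # the BertModel docstring example):
            #   >>> model = BertForPreTraining(config, output_attentions=True)
            #   >>> attentions, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
            #   >>> len(attentions)       # one tensor per hidden layer
            #   >>> attentions[0].size()  # (batch_size, num_attention_heads, seq_length, seq_length)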
total_loss = masked_lm_loss + next_sentence_loss return total_loss - else: - return prediction_scores, seq_relationship_score + elif self.output_attentions: + return all_attentions, prediction_scores, seq_relationship_score + return prediction_scores, seq_relationship_score class BertForMaskedLM(BertPreTrainedModel): @@ -876,23 +882,29 @@ class BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertForMaskedLM, self).__init__(config) - self.bert = BertModel(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions) self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs prediction_scores = self.cls(sequence_output) if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) return masked_lm_loss - else: - return prediction_scores + elif self.output_attentions: + return all_attentions, prediction_scores + return prediction_scores class BertForNextSentencePrediction(BertPreTrainedModel): @@ -938,23 +950,29 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertForNextSentencePrediction, self).__init__(config) - self.bert = BertModel(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions) self.cls = BertOnlyNSPHead(config) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) - seq_relationship_score = self.cls( pooled_output) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + seq_relationship_score = self.cls(pooled_output) if next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) return next_sentence_loss - else: - return seq_relationship_score + elif self.output_attentions: + return all_attentions, seq_relationship_score + return seq_relationship_score class BertForSequenceClassification(BertPreTrainedModel): @@ -1002,16 +1020,21 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels): + def __init__(self, config, num_labels, output_attentions=False): super(BertForSequenceClassification, self).__init__(config) + self.output_attentions = output_attentions self.num_labels = num_labels - self.bert = BertModel(config) + self.bert = BertModel(config, 
output_attentions=output_attentions) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1019,8 +1042,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) return loss - else: - return logits + elif self.output_attentions: + return all_attentions, logits + return logits class BertForMultipleChoice(BertPreTrainedModel): @@ -1067,10 +1091,11 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_choices): + def __init__(self, config, num_choices, output_attentions=False): super(BertForMultipleChoice, self).__init__(config) + self.output_attentions = output_attentions self.num_choices = num_choices - self.bert = BertModel(config) + self.bert = BertModel(config, output_attentions=output_attentions) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) @@ -1079,7 +1104,11 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) @@ -1088,8 +1117,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) return loss - else: - return reshaped_logits + elif self.output_attentions: + return all_attentions, reshaped_logits + return reshaped_logits class BertForTokenClassification(BertPreTrainedModel): @@ -1137,16 +1167,21 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels): + def __init__(self, config, num_labels, output_attentions=False): super(BertForTokenClassification, self).__init__(config) + self.output_attentions = output_attentions self.num_labels = num_labels - self.bert = BertModel(config) + self.bert = BertModel(config, output_attentions=output_attentions) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) def forward(self, input_ids, 
token_type_ids=None, attention_mask=None, labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1161,8 +1196,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) return loss - else: - return logits + elif self.output_attentions: + return all_attentions, logits + return logits class BertForQuestionAnswering(BertPreTrainedModel): @@ -1212,16 +1248,19 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config): + def __init__(self, config, output_attentions=False): super(BertForQuestionAnswering, self).__init__(config) - self.bert = BertModel(config) - # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version - # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions) self.qa_outputs = nn.Linear(config.hidden_size, 2) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) @@ -1243,5 +1282,6 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_pos end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 return total_loss - else: - return start_logits, end_logits + elif self.output_attentions: + return all_attentions, start_logits, end_logits + return start_logits, end_logits diff --git a/tests/modeling_test.py b/tests/modeling_test.py index 5cde383fdfe76d..79993ed84020f9 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -28,7 +28,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, BertForNextSentencePrediction, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification) + BertForTokenClassification, BertForMultipleChoice) from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP @@ -56,6 +56,7 @@ def __init__(self, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, + num_choices=4, scope=None): self.parent = parent self.batch_size = batch_size @@ -77,6 +78,7 @@ def __init__(self, self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_labels = num_labels + self.num_choices = num_choices self.scope = scope def prepare_config_and_inputs(self): @@ -92,9 +94,11 @@ def prepare_config_and_inputs(self): sequence_labels = None token_labels = None + choice_labels = None if self.use_labels: 
sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = BertModelTest.ids_tensor([self.batch_size], self.num_choices) config = BertConfig( vocab_size_or_config_json_file=self.vocab_size, @@ -109,14 +113,14 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range) - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): self.parent.assertListEqual( list(result["loss"].size()), []) - def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertModel(config=config) model.eval() all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) @@ -137,7 +141,7 @@ def check_bert_model_output(self, result): self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForMaskedLM(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, token_labels) @@ -153,7 +157,7 @@ def check_bert_for_masked_lm_output(self, result): list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]) - def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForNextSentencePrediction(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, sequence_labels) @@ -170,7 +174,7 @@ def check_bert_for_next_sequence_prediction_output(self, result): [self.batch_size, 2]) - def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForPreTraining(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) @@ -191,7 +195,7 @@ def check_bert_for_pretraining_output(self, result): [self.batch_size, 2]) - def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForQuestionAnswering(config=config) model.eval() loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) @@ -212,7 +216,7 @@ def check_bert_for_question_answering_output(self, result): [self.batch_size, self.seq_length]) - def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_sequence_classification(self, config, 
input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForSequenceClassification(config=config, num_labels=self.num_labels) model.eval() loss = model(input_ids, token_type_ids, input_mask, sequence_labels) @@ -229,7 +233,7 @@ def check_bert_for_sequence_classification_output(self, result): [self.batch_size, self.num_labels]) - def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForTokenClassification(config=config, num_labels=self.num_labels) model.eval() loss = model(input_ids, token_type_ids, input_mask, token_labels) @@ -246,6 +250,49 @@ def check_bert_for_token_classification_output(self, result): [self.batch_size, self.seq_length, self.num_labels]) + def create_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertForMultipleChoice(config=config, num_choices=self.num_choices) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + loss = model(multiple_choice_inputs_ids, + multiple_choice_token_type_ids, + multiple_choice_input_mask, + choice_labels) + logits = model(multiple_choice_inputs_ids, + multiple_choice_token_type_ids, + multiple_choice_input_mask) + outputs = { + "loss": loss, + "logits": logits, + } + return outputs + + def check_bert_for_multiple_choice(self, result): + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.num_choices]) + + + def create_and_check_bert_for_attentions(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification): + if model_class in [BertForSequenceClassification, + BertForTokenClassification]: + model = model_class(config=config, num_labels=self.num_labels, output_attentions=True) + else: + model = model_class(config=config, output_attentions=True) + model.eval() + output = model(input_ids, token_type_ids, input_mask) + attentions = output[0] + self.parent.assertEqual(len(attentions), self.num_hidden_layers) + self.parent.assertListEqual( + list(attentions[0].size()), + [self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length]) + + def test_default(self): self.run_tester(BertModelTest.BertModelTester(self)) @@ -300,6 +347,12 @@ def run_tester(self, tester): tester.check_bert_for_token_classification_output(output_result) tester.check_loss_output(output_result) + output_result = tester.create_bert_for_multiple_choice(*config_and_inputs) + tester.check_bert_for_multiple_choice(output_result) + tester.check_loss_output(output_result) + + tester.create_and_check_bert_for_attentions(*config_and_inputs) + @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" From 44e9ddd7fe7a683994de81d1791b453cf7b0a54c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 14 Jun 2019 
17:17:43 +0200 Subject: [PATCH 088/144] fix num_special_tokens in GPT 2 test --- pytorch_pretrained_bert/modeling_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 396364d549b3bc..3d227a391d5cb3 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -432,6 +432,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): kwargs.pop('cache_dir', None) from_tf = kwargs.get('from_tf', False) kwargs.pop('from_tf', None) + num_special_tokens = kwargs.get('num_special_tokens', None) + kwargs.pop('num_special_tokens', None) if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] From 16af9ff7b0a22abfaff24de3ae00e695d7c25dd9 Mon Sep 17 00:00:00 2001 From: timoeller Date: Fri, 14 Jun 2019 17:42:46 +0200 Subject: [PATCH 089/144] Add German Bert model to code, update readme --- README.md | 1 + pytorch_pretrained_bert/modeling.py | 1 + pytorch_pretrained_bert/tokenization.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/README.md b/README.md index b1cb84619de7ef..1d734f9df96b76 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ where - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters + - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert) - `openai-gpt`: OpenAI English model, 12-layer, 768-hidden, 12-heads, 110M parameters - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index bbf8f4800b1adf..3006d8e9710fc8 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -44,6 +44,7 @@ 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased.tar.gz", } BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 3937d6e0118921..26c172dc69160e 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -34,6 +34,7 @@ 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + 'bert-base-german-cased': 
"https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 'bert-base-uncased': 512, @@ -43,6 +44,7 @@ 'bert-base-multilingual-uncased': 512, 'bert-base-multilingual-cased': 512, 'bert-base-chinese': 512, + 'bert-base-german-cased': 512, } VOCAB_NAME = 'vocab.txt' From 5076a5daa718d61105afa1835b781ded8fcf83b8 Mon Sep 17 00:00:00 2001 From: Shashwath H A Date: Fri, 14 Jun 2019 22:03:21 -0400 Subject: [PATCH 090/144] Fix proj adp softmax output return when n_clusters=0 --- pytorch_pretrained_bert/modeling_transfo_xl_utilities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py index 647ba7774c1522..7fd67adb35879e 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py @@ -114,10 +114,10 @@ def forward(self, hidden, target=None, keep_order=False): logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if target is not None: - output = -F.log_softmax(logit, dim=-1) \ + out = -F.log_softmax(logit, dim=-1) \ .gather(1, target.unsqueeze(1)).squeeze(1) else: - output = F.log_softmax(logit, dim=-1) + out = F.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] From 8289646d4eab2f025b1def57bb0395ae1f461619 Mon Sep 17 00:00:00 2001 From: vanche Date: Sat, 15 Jun 2019 22:19:30 +0900 Subject: [PATCH 091/144] import class "GPT2MultipleChoiceHead" --- pytorch_pretrained_bert/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index 99706fde4366d4..508988322ba298 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -15,7 +15,7 @@ from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl) from .modeling_gpt2 import (GPT2Config, GPT2Model, - GPT2LMHeadModel, GPT2DoubleHeadsModel, + GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead, load_tf_weights_in_gpt2) from .optimization import BertAdam From 34858ae1d9e11dc51100b26ac468770c81c8afc1 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 11:02:39 +0200 Subject: [PATCH 092/144] adding bert whole words, bertgerman and gpt-2 medium models, head masking --- README.md | 7 ++- pytorch_pretrained_bert/modeling.py | 71 ++++++++++++++++--------- pytorch_pretrained_bert/tokenization.py | 4 ++ 3 files changed, 56 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 1d734f9df96b76..b8a4a9d5a4ebd2 100644 --- a/README.md +++ b/README.md @@ -492,9 +492,12 @@ where - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert) - - `openai-gpt`: OpenAI English model, 12-layer, 768-hidden, 12-heads, 110M parameters - - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters + - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M 
parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) + - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) + - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters + - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters + - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters - a path or url to a pretrained model archive containing: diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 3006d8e9710fc8..11a7191df5d782 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -45,6 +45,8 @@ 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased.tar.gz", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking.tar.gz", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking.tar.gz", } BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' @@ -279,13 +281,16 @@ def forward(self, input_ids, token_type_ids=None): class BertSelfAttention(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -301,7 +306,7 @@ def transpose_for_scores(self, x): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - def forward(self, hidden_states, attention_mask): + def forward(self, hidden_states, attention_mask, head_mask=None): mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) @@ -323,7 +328,20 @@ def forward(self, hidden_states, attention_mask): # seem a bit unusual, but is taken from the original Transformer paper. 
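As a side note on the `head_mask` argument threaded into `BertSelfAttention.forward` above: the attention probabilities have shape `[batch, num_heads, seq_len, seq_len]`, so a per-head mask has to be broadcast over the batch and position dimensions before it can be multiplied in. A standalone sketch of that shape arithmetic with illustrative sizes (which value means "keep" versus "mask" is a convention; later patches in this series invert the user-facing mask with `1.0 - head_mask` so that a 1 marks a head to drop):

```python
import torch

bsz, n_heads, seq_len = 2, 12, 16
attention_probs = torch.softmax(torch.randn(bsz, n_heads, seq_len, seq_len), dim=-1)

keep = torch.ones(n_heads)
keep[0] = 0.0                               # zero out head 0 (illustrative choice)
keep = keep.view(1, n_heads, 1, 1)          # broadcast over batch and positions
masked_probs = attention_probs * keep

assert masked_probs[:, 0].abs().sum() == 0                      # head 0 contributes nothing
assert torch.equal(masked_probs[:, 1], attention_probs[:, 1])   # other heads are untouched
```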
attention_probs = self.dropout(attention_probs) + # Mask heads if we want to + # attention_probs has shape bsz x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + elif head_mask.dim() == 2: + head_mask.unsqueeze(-1).unsqueeze(-1) # We can define heads to mask for each instance in the batch + attention_probs = attention_probs * head_mask + context_layer = torch.matmul(attention_probs, value_layer) + if self.keep_multihead_output: + self.multihead_output = context_layer + self.multihead_output.retain_grad() + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) @@ -353,8 +371,8 @@ def __init__(self, config, output_attentions=False): self.self = BertSelfAttention(config, output_attentions=output_attentions) self.output = BertSelfOutput(config) - def forward(self, input_tensor, attention_mask): - self_output = self.self(input_tensor, attention_mask) + def forward(self, input_tensor, attention_mask, head_mask=None): + self_output = self.self(input_tensor, attention_mask, head_mask) if self.output_attentions: attentions, self_output = self_output attention_output = self.output(self_output, input_tensor) @@ -400,8 +418,8 @@ def __init__(self, config, output_attentions=False): self.intermediate = BertIntermediate(config) self.output = BertOutput(config) - def forward(self, hidden_states, attention_mask): - attention_output = self.attention(hidden_states, attention_mask) + def forward(self, hidden_states, attention_mask, head_mask=None): + attention_output = self.attention(hidden_states, attention_mask, head_mask) if self.output_attentions: attentions, attention_output = attention_output intermediate_output = self.intermediate(attention_output) @@ -418,11 +436,11 @@ def __init__(self, config, output_attentions=False): layer = BertLayer(config, output_attentions=output_attentions) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) - def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None): all_encoder_layers = [] all_attentions = [] for layer_module in self.layer: - hidden_states = layer_module(hidden_states, attention_mask) + hidden_states = layer_module(hidden_states, attention_mask, head_mask) if self.output_attentions: attentions, hidden_states = hidden_states all_attentions.append(attentions) @@ -731,7 +749,7 @@ def __init__(self, config, output_attentions=False): self.pooler = BertPooler(config) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None): if attention_mask is None: attention_mask = torch.ones_like(input_ids) if token_type_ids is None: @@ -755,7 +773,8 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al embedding_output = self.embeddings(input_ids, token_type_ids) encoded_layers = self.encoder(embedding_output, extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers) + output_all_encoded_layers=output_all_encoded_layers, + head_mask=head_mask) if self.output_attentions: all_attentions, encoded_layers = encoded_layers sequence_output = 
encoded_layers[-1] @@ -824,9 +843,9 @@ def __init__(self, config, output_attentions=False): self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + output_all_encoded_layers=False, head_mask=head_mask) if self.output_attentions: all_attentions, sequence_output, pooled_output = outputs else: @@ -893,9 +912,10 @@ def __init__(self, config, output_attentions=False): self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + output_all_encoded_layers=False, + head_mask=head_mask) if self.output_attentions: all_attentions, sequence_output, _ = outputs else: @@ -961,9 +981,10 @@ def __init__(self, config, output_attentions=False): self.cls = BertOnlyNSPHead(config) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + output_all_encoded_layers=False, + head_mask=head_mask) if self.output_attentions: all_attentions, _, pooled_output = outputs else: @@ -1033,8 +1054,8 @@ def __init__(self, config, num_labels=2, output_attentions=False): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask) if self.output_attentions: all_attentions, _, pooled_output = outputs else: @@ -1104,11 +1125,11 @@ def __init__(self, config, num_choices=2, output_attentions=False): self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, head_mask=head_mask) if self.output_attentions: all_attentions, _, 
pooled_output = outputs else: @@ -1180,8 +1201,8 @@ def __init__(self, config, num_labels=2, output_attentions=False): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask) if self.output_attentions: all_attentions, sequence_output, _ = outputs else: @@ -1259,8 +1280,10 @@ def __init__(self, config, output_attentions=False): self.qa_outputs = nn.Linear(config.hidden_size, 2) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) if self.output_attentions: all_attentions, sequence_output, _ = outputs else: diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 26c172dc69160e..9a700cef0f1f3b 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -35,6 +35,8 @@ 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 'bert-base-uncased': 512, @@ -45,6 +47,8 @@ 'bert-base-multilingual-cased': 512, 'bert-base-chinese': 512, 'bert-base-german-cased': 512, + 'bert-large-uncased-whole-word-masking': 512, + 'bert-large-cased-whole-word-masking': 512, } VOCAB_NAME = 'vocab.txt' From 96c4d3d9885a09340a10869949c7c9bea4bfb5c4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 12:17:26 +0200 Subject: [PATCH 093/144] add head masking tests --- pytorch_pretrained_bert/modeling.py | 128 ++++++++++++++++++++++------ tests/modeling_test.py | 42 +++++++++ 2 files changed, 142 insertions(+), 28 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 11a7191df5d782..950f96744ced5d 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -51,6 +51,32 @@ BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' +def prune_linear_layer(layer, index, dim=-1): + """ Prune a linear layer (a model parameters) to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. 
+ """ + dim = (dim+100) % 2 + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + def load_tf_weights_in_bert(model, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model """ @@ -329,12 +355,7 @@ def forward(self, hidden_states, attention_mask, head_mask=None): attention_probs = self.dropout(attention_probs) # Mask heads if we want to - # attention_probs has shape bsz x n_heads x N x N if head_mask is not None: - if head_mask.dim() == 1: - head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) - elif head_mask.dim() == 2: - head_mask.unsqueeze(-1).unsqueeze(-1) # We can define heads to mask for each instance in the batch attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) @@ -365,12 +386,28 @@ def forward(self, hidden_states, input_tensor): class BertAttention(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertAttention, self).__init__() self.output_attentions = output_attentions - self.self = BertSelfAttention(config, output_attentions=output_attentions) + self.self = BertSelfAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.output = BertSelfOutput(config) + def prune_heads(self, heads): + mask = torch.ones(self.self.n_heads, self.self.d_head) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=0) + # Update hyper params + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + def forward(self, input_tensor, attention_mask, head_mask=None): self_output = self.self(input_tensor, attention_mask, head_mask) if self.output_attentions: @@ -411,10 +448,11 @@ def forward(self, hidden_states, input_tensor): class BertLayer(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertLayer, self).__init__() self.output_attentions = output_attentions - self.attention = BertAttention(config, output_attentions=output_attentions) + self.attention = BertAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) @@ -430,10 +468,11 @@ def forward(self, hidden_states, attention_mask, head_mask=None): class BertEncoder(nn.Module): - def __init__(self, config, output_attentions=False): + def __init__(self, 
config, output_attentions=False, keep_multihead_output=False): super(BertEncoder, self).__init__() self.output_attentions = output_attentions - layer = BertLayer(config, output_attentions=output_attentions) + layer = BertLayer(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None): @@ -741,14 +780,28 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertModel, self).__init__(config) self.output_attentions = output_attentions self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config, output_attentions=output_attentions) + self.encoder = BertEncoder(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.pooler = BertPooler(config) self.apply(self.init_bert_weights) + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. + Return: list (layers) of multihead module outputs with gradients + """ + return [layer.attention.self.multihead_output for layer in self.encoder.layer] + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None): if attention_mask is None: attention_mask = torch.ones_like(input_ids) @@ -770,6 +823,17 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + # Prepare head mask if needed + # 1 in head_mask indicate we need to mask the head + # attention_probs has shape bsz x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = (1.0 - head_mask) + embedding_output = self.embeddings(input_ids, token_type_ids) encoded_layers = self.encoder(embedding_output, extended_attention_mask, @@ -836,10 +900,11 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForPreTraining, self).__init__(config) self.output_attentions = output_attentions - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) @@ -905,10 +970,11 @@ class 
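A minimal usage sketch for the head-pruning API introduced above. The toy config and the layer/head indices are illustrative, and the sketch assumes the final form of the pruning helpers from later in this series (PATCH 095 adjusts the `prune_linear_layer` default and the attribute names used in `prune_heads`):

```python
import torch
from pytorch_pretrained_bert.modeling import BertConfig, BertModel

config = BertConfig(vocab_size_or_config_json_file=1000, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=4, intermediate_size=64)
model = BertModel(config)
model.eval()

# drop heads 0 and 1 in layer 0, and head 3 in layer 1
model.prune_heads({0: [0, 1], 1: [3]})
assert model.encoder.layer[0].attention.self.num_attention_heads == 2
assert model.encoder.layer[1].attention.self.num_attention_heads == 3

# the pruned model still runs end to end
input_ids = torch.randint(0, 1000, (1, 8), dtype=torch.long)
with torch.no_grad():
    encoded_layers, pooled_output = model(input_ids)
```

A 1-D `head_mask` of length `num_attention_heads` can likewise be passed to `forward()`; as the comments added in this series note, a 1.0 entry marks a head to mask and the mask is inverted internally before being multiplied onto the attention probabilities.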
BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForMaskedLM, self).__init__(config) self.output_attentions = output_attentions - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) @@ -974,10 +1040,11 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForNextSentencePrediction, self).__init__(config) self.output_attentions = output_attentions - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.cls = BertOnlyNSPHead(config) self.apply(self.init_bert_weights) @@ -1045,11 +1112,12 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels=2, output_attentions=False): + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): super(BertForSequenceClassification, self).__init__(config) self.output_attentions = output_attentions self.num_labels = num_labels - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) @@ -1116,11 +1184,12 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_choices=2, output_attentions=False): + def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False): super(BertForMultipleChoice, self).__init__(config) self.output_attentions = output_attentions self.num_choices = num_choices - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) @@ -1192,11 +1261,12 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels=2, output_attentions=False): + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): super(BertForTokenClassification, self).__init__(config) self.output_attentions = output_attentions self.num_labels = num_labels - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = 
nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) @@ -1273,14 +1343,16 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(BertForQuestionAnswering, self).__init__(config) self.output_attentions = output_attentions - self.bert = BertModel(config, output_attentions=output_attentions) + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.qa_outputs = nn.Linear(config.hidden_size, 2) self.apply(self.init_bert_weights) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, + end_positions=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask) diff --git a/tests/modeling_test.py b/tests/modeling_test.py index 79993ed84020f9..4c78ead7679933 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -293,6 +293,47 @@ def create_and_check_bert_for_attentions(self, config, input_ids, token_type_ids [self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length]) + def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification): + if model_class in [BertForSequenceClassification, + BertForTokenClassification]: + model = model_class(config=config, + num_labels=self.num_labels, + keep_multihead_output=True) + else: + model = model_class(config=config, keep_multihead_output=True) + model.eval() + head_mask = torch.ones(self.num_attention_heads).to(input_ids.device) + head_mask[0] = 0.0 + head_mask[-1] = 0.0 # Mask all but the first and last heads + output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask) + + if isinstance(model, BertModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) + output = output.sum() + output.backward() + multihead_outputs = (model if isinstance(model, BertModel) else model.bert).get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertEqual( + len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[0][:, 0, :, :].nonzero()), + self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + self.parent.assertEqual( + len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()), + self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + + def test_default(self): self.run_tester(BertModelTest.BertModelTester(self)) @@ -352,6 +393,7 @@ def run_tester(self, tester): tester.check_loss_output(output_result) 
tester.create_and_check_bert_for_attentions(*config_and_inputs) + tester.create_and_check_bert_for_headmasking(*config_and_inputs) @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): From 8415a38b23502d8fafdfa40cc082f9482645d968 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 13:03:48 +0200 Subject: [PATCH 094/144] better error messages --- pytorch_pretrained_bert/modeling.py | 19 ++++++++++------- pytorch_pretrained_bert/modeling_gpt2.py | 19 ++++++++++------- pytorch_pretrained_bert/modeling_openai.py | 19 ++++++++++------- .../modeling_transfo_xl.py | 21 ++++++++++++------- pytorch_pretrained_bert/tokenization.py | 19 ++++++++++------- pytorch_pretrained_bert/tokenization_gpt2.py | 21 ++++++++++++------- .../tokenization_openai.py | 21 ++++++++++++------- .../tokenization_transfo_xl.py | 21 ++++++++++++------- 8 files changed, 100 insertions(+), 60 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 950f96744ced5d..b40e5825b37aee 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -646,13 +646,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - archive_file)) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) return None if resolved_archive_file == archive_file: logger.info("loading archive file {}".format(archive_file)) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 3d227a391d5cb3..02bd44318b831e 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -446,14 +446,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - archive_file, config_file + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file, config_file + ) ) - ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 2b448035842742..333cd88f65c5ba 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -472,14 +472,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - archive_file, config_file + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file, config_file + ) ) - ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index e70a29af57b6b5..12a1535ff641af 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -926,14 +926,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - archive_file, config_file)) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + archive_file, config_file)) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 9a700cef0f1f3b..1aa4c01bde4ec3 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -181,13 +181,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) return None if resolved_vocab_file == vocab_file: logger.info("loading vocabulary file {}".format(vocab_file)) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index af75cac4dc231e..78f7f59d656e20 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -113,14 +113,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - vocab_file, merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index c68e247e1e1fe2..52d735efa812ae 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -101,14 +101,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - vocab_file, merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py index ddebc57c1068e6..6a882e0a7f9c4b 100644 --- a/pytorch_pretrained_bert/tokenization_transfo_xl.py +++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -71,14 +71,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download vocabulary.".format( + vocab_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file)) return None if resolved_vocab_file == vocab_file: logger.info("loading vocabulary file {}".format(vocab_file)) From 7220d47a1c0d6b6c535e27bd1392a885eea842fd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 13:20:45 +0200 Subject: [PATCH 095/144] adding head pruning and tests --- pytorch_pretrained_bert/modeling.py | 7 +++-- tests/modeling_test.py | 42 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index b40e5825b37aee..9cf02d363c2704 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -51,12 +51,11 @@ BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' -def prune_linear_layer(layer, index, dim=-1): +def prune_linear_layer(layer, index, dim=0): """ Prune a linear layer (a model parameters) to keep only entries in index. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ - dim = (dim+100) % 2 index = index.to(layer.weight.device) W = layer.weight.index_select(dim, index).clone().detach() if layer.bias is not None: @@ -394,7 +393,7 @@ def __init__(self, config, output_attentions=False, keep_multihead_output=False) self.output = BertSelfOutput(config) def prune_heads(self, heads): - mask = torch.ones(self.self.n_heads, self.self.d_head) + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) for head in heads: mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) @@ -403,7 +402,7 @@ def prune_heads(self, heads): self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=0) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads diff --git a/tests/modeling_test.py b/tests/modeling_test.py index 4c78ead7679933..b23edf1aea5afd 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -334,6 +334,47 @@ def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_id self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification): + if model_class in [BertForSequenceClassification, + BertForTokenClassification]: + model = model_class(config=config, + num_labels=self.num_labels, + keep_multihead_output=True) + else: + model = model_class(config=config, keep_multihead_output=True) + model.eval() + bert_model = model if isinstance(model, BertModel) else model.bert + heads_to_prune = {0: list(range(1, self.num_attention_heads)), + -1: [0]} + bert_model.prune_heads(heads_to_prune) + output = model(input_ids, token_type_ids, input_mask) + + if isinstance(model, 
BertModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) + output = output.sum() + output.backward() + multihead_outputs = bert_model.get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size, 1, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size, self.num_attention_heads-1, + self.seq_length, self.hidden_size // self.num_attention_heads]) + + def test_default(self): self.run_tester(BertModelTest.BertModelTester(self)) @@ -394,6 +435,7 @@ def run_tester(self, tester): tester.create_and_check_bert_for_attentions(*config_and_inputs) tester.create_and_check_bert_for_headmasking(*config_and_inputs) + tester.create_and_check_bert_for_head_pruning(*config_and_inputs) @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): From b860e47cf5a61b76b480657504de2588a9385b53 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 14:12:10 +0200 Subject: [PATCH 096/144] add head masking and pruning to gpt-2 --- pytorch_pretrained_bert/modeling_gpt2.py | 123 +++++++++++++++++++---- tests/modeling_gpt2_test.py | 70 +++++++++++++ 2 files changed, 172 insertions(+), 21 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 02bd44318b831e..e9fc1c5f98a08c 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -44,6 +44,30 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"} +def prune_conv1d_layer(layer, index, dim=1): + """ Prune a Conv1D layer (a model parameters) to keep only entries in index. + A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. 
+ """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if dim == 0: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = Conv1D(new_size[1], new_size[0]) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model """ @@ -223,7 +247,7 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Attention, self).__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] @@ -232,13 +256,31 @@ def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): self.n_head = config.n_head self.split_size = n_state self.scale = scale + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) - def _attn(self, q, k, v): + def prune_heads(self, heads): + mask = torch.ones(self.n_head, self.split_size // self.n_head) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + # Update hyper params + self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) + self.n_head = self.n_head - len(heads) + + def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) @@ -248,6 +290,11 @@ def _attn(self, q, k, v): w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + if self.output_attentions: return w, torch.matmul(w, v) return torch.matmul(w, v) @@ -265,7 +312,7 @@ def split_heads(self, x, k=False): else: return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - def forward(self, x, layer_past=None): + def forward(self, x, layer_past=None, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) @@ -276,7 +323,12 @@ def forward(self, x, layer_past=None): key = torch.cat((past_key, key), dim=-1) value = torch.cat((past_value, value), dim=-2) present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking - a = self._attn(query, key, value) + + a = self._attn(query, key, value, head_mask) + if self.keep_multihead_output: + self.multihead_output = a + self.multihead_output.retain_grad() + if self.output_attentions: attentions, a = a a = self.merge_heads(a) @@ -303,17 +355,17 @@ def forward(self, x): class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False, 
output_attentions=False): + def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Block, self).__init__() nx = config.n_embd self.output_attentions = output_attentions self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) - self.attn = Attention(nx, n_ctx, config, scale, output_attentions) + self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) - def forward(self, x, layer_past=None): - output_attn = self.attn(self.ln_1(x), layer_past=layer_past) + def forward(self, x, layer_past=None, head_mask=None): + output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask) if self.output_attentions: attentions, a, present = output_attn else: @@ -593,13 +645,14 @@ class GPT2Model(GPT2PreTrainedModel): ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(GPT2Model, self).__init__(config) self.output_attentions = output_attentions self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) - block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) @@ -619,7 +672,20 @@ def set_num_special_tokens(self, num_special_tokens): # Copy word embeddings from the previous weights self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] - def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. 
+ Return: list (layers) of multihead module outputs with gradients + """ + return [h.attn.multihead_output for h in self.h] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): if past is None: past_length = 0 past = [None] * len(self.h) @@ -629,6 +695,17 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + # Prepare head mask if needed + # 1.0 in head_mask indicate we mask the head + # attention_probs has shape bsz x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = (1.0 - head_mask) + input_shape = input_ids.size() input_ids = input_ids.view(-1, input_ids.size(-1)) position_ids = position_ids.view(-1, position_ids.size(-1)) @@ -646,11 +723,12 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): presents = [] all_attentions = [] for block, layer_past in zip(self.h, past): + outputs = block(hidden_states, layer_past, head_mask) if self.output_attentions: - attentions, hidden_states, present = block(hidden_states, layer_past) + attentions, hidden_states, present = outputs all_attentions.append(attentions) else: - hidden_states, present = block(hidden_states, layer_past) + hidden_states, present = outputs presents.append(present) hidden_states = self.ln_f(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) @@ -703,9 +781,10 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(GPT2LMHeadModel, self).__init__(config) - self.transformer = GPT2Model(config, output_attentions=output_attentions) + self.transformer = GPT2Model(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.apply(self.init_weights) @@ -717,8 +796,8 @@ def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True self.transformer.set_num_special_tokens(num_special_tokens) self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) - def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None): - transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask) if self.transformer.output_attentions: all_attentions, hidden_states, presents = transformer_output else: @@ -787,9 +866,10 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(GPT2DoubleHeadsModel, self).__init__(config) - self.transformer = GPT2Model(config, output_attentions=output_attentions) + 
self.transformer = GPT2Model(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) self.multiple_choice_head = GPT2MultipleChoiceHead(config) self.apply(self.init_weights) @@ -802,8 +882,9 @@ def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True self.transformer.set_num_special_tokens(num_special_tokens) self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) - def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None): - transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None, past=None, head_mask=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask) if self.transformer.output_attentions: all_attentions, hidden_states, presents = transformer_output else: diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index 7817b988759bc7..aaa88d54e8c983 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -209,6 +209,73 @@ def check_gpt2_double_heads_loss_output(self, result): [list(l.size()) for l in result["loss"]], [[], []]) + def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + head_mask = torch.ones(self.n_head).to(input_ids.device) + head_mask[0] = 0.0 + head_mask[-1] = 0.0 # Mask all but the first and last heads + if isinstance(model, GPT2DoubleHeadsModel): + output = model(input_ids, mc_token_ids, head_mask=head_mask) + else: + output = model(input_ids, head_mask=head_mask) + + output = sum(t.sum() for t in output[:-1]) + output = output.sum() + output.backward() + multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[0][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + self.parent.assertEqual( + len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + transformer = model if isinstance(model, GPT2Model) else model.transformer + heads_to_prune = {0: list(range(1, self.n_head)), + -1: [0]} + transformer.prune_heads(heads_to_prune) + if isinstance(model, GPT2DoubleHeadsModel): + output = model(input_ids, mc_token_ids) + else: + output = model(input_ids) + + output = sum(t.sum() for t in output[:-1]) + output = 
output.sum() + output.backward() + multihead_outputs = transformer.get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, 1, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head-1, + self.seq_length, self.n_embd // self.n_head]) + + def test_default(self): self.run_tester(GPT2ModelTest.GPT2ModelTester(self)) @@ -247,6 +314,9 @@ def run_tester(self, tester): tester.check_gpt2_double_heads_output(output_result) tester.check_gpt2_double_heads_loss_output(output_result) + tester.create_and_check_gpt2_for_headmasking(*config_and_inputs) + tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs) + @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" From f12007e4216e2bdb278b40c731eb62626181cd28 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 14:19:40 +0200 Subject: [PATCH 097/144] add head masking and pruning to openai GPT --- pytorch_pretrained_bert/modeling_openai.py | 100 ++++++++++++++++----- tests/modeling_openai_test.py | 70 +++++++++++++++ 2 files changed, 149 insertions(+), 21 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 333cd88f65c5ba..bbc60ffd2c57f4 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -36,6 +36,7 @@ from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME from .modeling import BertLayerNorm as LayerNorm +from .modeling_gpt2 import prune_conv1d_layer logger = logging.getLogger(__name__) @@ -256,7 +257,7 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Attention, self).__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] @@ -265,13 +266,31 @@ def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): self.n_head = config.n_head self.split_size = n_state self.scale = scale + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + self.c_attn = Conv1D(n_state * 3, 1, nx) self.c_proj = Conv1D(n_state, 1, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) - def _attn(self, q, k, v): + def prune_heads(self, heads): + mask = torch.ones(self.n_head, self.split_size // self.n_head) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + # Update hyper params + self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) + self.n_head = self.n_head - len(heads) + + def 
_attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) @@ -282,6 +301,11 @@ def _attn(self, q, k, v): w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + if self.output_attentions: return w, torch.matmul(w, v) return torch.matmul(w, v) @@ -299,13 +323,18 @@ def split_heads(self, x, k=False): else: return x.permute(0, 2, 1, 3) - def forward(self, x): + def forward(self, x, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) - a = self._attn(query, key, value) + + a = self._attn(query, key, value, head_mask) + if self.keep_multihead_output: + self.multihead_output = a + self.multihead_output.retain_grad() + if self.output_attentions: attentions, a = a a = self.merge_heads(a) @@ -332,17 +361,17 @@ def forward(self, x): class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False, output_attentions=False): + def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False): super(Block, self).__init__() nx = config.n_embd self.output_attentions = output_attentions - self.attn = Attention(nx, n_ctx, config, scale, output_attentions) + self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output) self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) - def forward(self, x): - a = self.attn(x) + def forward(self, x, head_mask=None): + a = self.attn(x, head_mask=head_mask) if self.output_attentions: attentions, a = a n = self.ln_1(x + a) @@ -614,13 +643,14 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(OpenAIGPTModel, self).__init__(config) self.output_attentions = output_attentions self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) - block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) self.apply(self.init_weights) @@ -639,7 +669,20 @@ def set_num_special_tokens(self, num_special_tokens): # Copy word embeddings from the previous weights self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] - def forward(self, input_ids, position_ids=None, token_type_ids=None): + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. 
+ Return: list (layers) of multihead module outputs with gradients + """ + return [h.attn.multihead_output for h in self.h] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None): if position_ids is None: # This was used when we had a single embedding matrice from position and token embeddings # start = self.config.vocab_size + self.config.n_special @@ -648,6 +691,17 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None): position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + # Prepare head mask if needed + # 1.0 in head_mask indicate we mask the head + # attention_probs has shape bsz x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = (1.0 - head_mask) + input_shape = input_ids.size() input_ids = input_ids.view(-1, input_ids.size(-1)) position_ids = position_ids.view(-1, position_ids.size(-1)) @@ -664,11 +718,12 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None): all_attentions = [] for block in self.h: + outputs = block(hidden_states, head_mask) if self.output_attentions: - attentions, hidden_states = block(hidden_states) + attentions, hidden_states = outputs all_attentions.append(attentions) else: - hidden_states = block(hidden_states) + hidden_states = outputs output_shape = input_shape + (hidden_states.size(-1),) if self.output_attentions: return all_attentions, hidden_states.view(*output_shape) @@ -731,9 +786,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(OpenAIGPTLMHeadModel, self).__init__(config) - self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.apply(self.init_weights) @@ -745,8 +801,8 @@ def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True self.transformer.set_num_special_tokens(num_special_tokens) self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) - def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None): - hidden_states = self.transformer(input_ids, position_ids, token_type_ids) + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask) if self.transformer.output_attentions: all_attentions, hidden_states = hidden_states lm_logits = self.lm_head(hidden_states) @@ -825,9 +881,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): ``` """ - def __init__(self, config, output_attentions=False): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): super(OpenAIGPTDoubleHeadsModel, self).__init__(config) - self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) 
+ self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config) self.apply(self.init_weights) @@ -840,8 +897,9 @@ def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True self.transformer.set_num_special_tokens(num_special_tokens) self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) - def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None): - hidden_states = self.transformer(input_ids, position_ids, token_type_ids) + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None, head_mask=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask) if self.transformer.output_attentions: all_attentions, hidden_states = hidden_states lm_logits = self.lm_head(hidden_states) diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index 4e7d9d542b0ba4..08353cdd18b7fa 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -182,6 +182,73 @@ def check_openai_double_heads_loss_output(self, result): [list(l.size()) for l in result["loss"]], [[], []]) + def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + head_mask = torch.ones(self.n_head).to(input_ids.device) + head_mask[0] = 0.0 + head_mask[-1] = 0.0 # Mask all but the first and last heads + if isinstance(model, OpenAIGPTDoubleHeadsModel): + output = model(input_ids, mc_token_ids, head_mask=head_mask) + else: + output = model(input_ids, head_mask=head_mask) + + output = sum(t.sum() for t in output[:-1]) + output = output.sum() + output.backward() + multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[0][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + self.parent.assertEqual( + len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): + model = model_class(config=config, keep_multihead_output=True) + model.eval() + transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer + heads_to_prune = {0: list(range(1, self.n_head)), + -1: [0]} + transformer.prune_heads(heads_to_prune) + if isinstance(model, OpenAIGPTDoubleHeadsModel): + output = model(input_ids, mc_token_ids) + else: + output = 
model(input_ids) + + output = sum(t.sum() for t in output[:-1]) + output = output.sum() + output.backward() + multihead_outputs = transformer.get_multihead_outputs() + + self.parent.assertEqual(len(multihead_outputs), self.n_layer) + self.parent.assertListEqual( + list(multihead_outputs[0].size()), + [self.batch_size * self.n_choices, 1, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head-1, + self.seq_length, self.n_embd // self.n_head]) + + def test_default(self): self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self)) @@ -220,6 +287,9 @@ def run_tester(self, tester): tester.check_openai_double_heads_output(output_result) tester.check_openai_double_heads_loss_output(output_result) + tester.create_and_check_openai_for_headmasking(*config_and_inputs) + tester.create_and_check_openai_for_head_pruning(*config_and_inputs) + @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" From 965f172de6aa52369500a4c6bc76244f69272c0f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 14:34:12 +0200 Subject: [PATCH 098/144] output all hidden layers states in GPT/GPT-2 --- pytorch_pretrained_bert/modeling_gpt2.py | 15 ++++++++++++--- pytorch_pretrained_bert/modeling_openai.py | 14 +++++++++++--- tests/modeling_gpt2_test.py | 13 ++++++++++--- tests/modeling_openai_test.py | 13 ++++++++++--- 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index e9fc1c5f98a08c..9240ea2bd018dd 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -720,9 +720,13 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + presents = [] all_attentions = [] + all_hidden_states = [] for block, layer_past in zip(self.h, past): + all_hidden_states.append(hidden_states.view(*output_shape)) outputs = block(hidden_states, layer_past, head_mask) if self.output_attentions: attentions, hidden_states, present = outputs @@ -731,10 +735,11 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, hidden_states, present = outputs presents.append(present) hidden_states = self.ln_f(hidden_states) - output_shape = input_shape + (hidden_states.size(-1),) + all_hidden_states.append(hidden_states.view(*output_shape)) + if self.output_attentions: - return all_attentions, hidden_states.view(*output_shape), presents - return hidden_states.view(*output_shape), presents + return all_attentions, all_hidden_states, presents + return all_hidden_states, presents class GPT2LMHeadModel(GPT2PreTrainedModel): @@ -802,6 +807,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N all_attentions, hidden_states, presents = transformer_output else: hidden_states, presents = transformer_output + hidden_states = hidden_states[-1] + lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n @@ -889,6 +896,8 @@ def forward(self, input_ids, mc_token_ids, 
lm_labels=None, mc_labels=None, token all_attentions, hidden_states, presents = transformer_output else: hidden_states, presents = transformer_output + hidden_states = hidden_states[-1] + lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index bbc60ffd2c57f4..32c0978dd0f22d 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -716,7 +716,10 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + all_attentions = [] + all_hidden_states = [hidden_states.view(*output_shape)] for block in self.h: outputs = block(hidden_states, head_mask) if self.output_attentions: @@ -724,10 +727,11 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N all_attentions.append(attentions) else: hidden_states = outputs - output_shape = input_shape + (hidden_states.size(-1),) + all_hidden_states.append(hidden_states.view(*output_shape)) + if self.output_attentions: - return all_attentions, hidden_states.view(*output_shape) - return hidden_states.view(*output_shape) + return all_attentions, all_hidden_states + return all_hidden_states class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): @@ -805,6 +809,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask) if self.transformer.output_attentions: all_attentions, hidden_states = hidden_states + hidden_states = hidden_states[-1] + lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n @@ -902,6 +908,8 @@ def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask) if self.transformer.output_attentions: all_attentions, hidden_states = hidden_states + hidden_states = hidden_states[-1] + lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index aaa88d54e8c983..c9a7a64b5ac0b3 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -115,8 +115,9 @@ def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids, return outputs def check_gpt2_model_output(self, result): + self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1) self.parent.assertListEqual( - list(result["hidden_states"].size()), + list(result["hidden_states"][0].size()), [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) @@ -222,7 +223,10 @@ def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_id else: output = model(input_ids, head_mask=head_mask) - output = sum(t.sum() for t in output[:-1]) + if isinstance(model, GPT2Model): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output[:-1]) output = output.sum() output.backward() multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs() @@ -256,7 +260,10 @@ def create_and_check_gpt2_for_head_pruning(self, config, 
input_ids, token_type_i else: output = model(input_ids) - output = sum(t.sum() for t in output[:-1]) + if isinstance(model, GPT2Model): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output[:-1]) output = output.sum() output.backward() multihead_outputs = transformer.get_multihead_outputs() diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index 08353cdd18b7fa..86234e57caf5a7 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -125,8 +125,9 @@ def create_openai_model(self, config, input_ids, token_type_ids, position_ids, return outputs def check_openai_model_output(self, result): + self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1) self.parent.assertListEqual( - list(result["hidden_states"].size()), + list(result["hidden_states"][0].size()), [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) @@ -195,7 +196,10 @@ def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ else: output = model(input_ids, head_mask=head_mask) - output = sum(t.sum() for t in output[:-1]) + if isinstance(model, OpenAIGPTModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) output = output.sum() output.backward() multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs() @@ -229,7 +233,10 @@ def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type else: output = model(input_ids) - output = sum(t.sum() for t in output[:-1]) + if isinstance(model, OpenAIGPTModel): + output = sum(t.sum() for t in output[0]) + elif isinstance(output, (list, tuple)): + output = sum(t.sum() for t in output) output = output.sum() output.backward() multihead_outputs = transformer.get_multihead_outputs() From 33d3db5c4306e19c65d44fe52b48e65925289e8d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 15:51:28 +0200 Subject: [PATCH 099/144] updating head masking, readme and docstrings --- README.md | 35 +++++++++- pytorch_pretrained_bert/modeling.py | 78 ++++++++++++++++++---- pytorch_pretrained_bert/modeling_gpt2.py | 37 +++++++--- pytorch_pretrained_bert/modeling_openai.py | 37 +++++++--- tests/modeling_gpt2_test.py | 25 ++++++- tests/modeling_openai_test.py | 26 +++++++- tests/modeling_test.py | 25 ++++++- 7 files changed, 220 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index b8a4a9d5a4ebd2..fa186adf1ededf 100644 --- a/README.md +++ b/README.md @@ -474,7 +474,7 @@ Here is a detailed documentation of the classes in the package and how to use th To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as ```python -model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None) +model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs) ``` where @@ -505,7 +505,12 @@ where - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`) If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a 
cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`). + - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information). +- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint +- `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models +- `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) + `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository. @@ -631,6 +636,13 @@ These configuration classes contains a few utilities to load and save configurat `BertModel` is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large). +Instantiation: +The model can be instantiated with the following arguments: + +- `config`: a `BertConfig` class instance with the configuration to build a new model. +- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False +- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False + The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: @@ -639,6 +651,7 @@ We detail them here. This model takes as *inputs*: - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. - `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
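
As an illustration of the new argument, here is a minimal sketch (not part of the original README) that masks one attention head in every layer of `BertModel`, following the convention in the `head_mask` bullet above (1.0 masks a head, 0.0 keeps it); the example sentence is a placeholder and `bert-base-uncased` is just one of the shortcut names listed earlier:

```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

tokens = tokenizer.tokenize("Hello, how are you?")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# One row per layer, one column per head; 1.0 masks a head, 0.0 keeps it active.
head_mask = torch.zeros(model.config.num_hidden_layers, model.config.num_attention_heads)
head_mask[:, 0] = 1.0  # mask head 0 in every layer

# encoded_layers is a list of per-layer hidden states, pooled_output the [CLS] summary.
encoded_layers, pooled_output = model(input_ids, head_mask=head_mask)
```

A 1D `head_mask` of shape [num_heads] applies the same mask to every layer; the 2D form used above lets each layer be masked differently.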
This model *outputs* a tuple composed of: @@ -756,6 +769,13 @@ where total_tokens_embeddings can be obtained as config.total_tokens_embeddings `total_tokens_embeddings = config.vocab_size + config.n_special` You should use the associate indices to index the embeddings. +Instantiation: +The model can be instantiated with the following arguments: + +- `config`: a `OpenAIConfig` class instance with the configuration to build a new model. +- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False +- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False + The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: @@ -766,9 +786,10 @@ We detail them here. This model takes as *inputs*: - `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids You can use it to add a third type of embedding to each input token in the sequence (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. This model *outputs*: -- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) +- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) #### 10. `OpenAIGPTLMHeadModel` @@ -848,6 +869,13 @@ all_hidden_states = lower_hidden_states + [hidden_states] `GPT2Model` is the OpenAI GPT-2 Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks. +Instantiation: +The model can be instantiated with the following arguments: + +- `config`: a `GPT2Config` class instance with the configuration to build a new model. +- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False +- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False + The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: @@ -859,9 +887,10 @@ We detail them here. This model takes as *inputs*: You can use it to add a third type of embedding to each input token in the sequence (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. - `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the `presents` output of the model, cf. below). 
+- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. This model *outputs*: -- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) +- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example). #### 15. `GPT2LMHeadModel` diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 9cf02d363c2704..fa2cba9aa7e0ce 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -477,8 +477,8 @@ def __init__(self, config, output_attentions=False, keep_multihead_output=False) def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None): all_encoder_layers = [] all_attentions = [] - for layer_module in self.layer: - hidden_states = layer_module(hidden_states, attention_mask, head_mask) + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask, head_mask[i]) if self.output_attentions: attentions, hidden_states = hidden_states all_attentions.append(attentions) @@ -618,6 +618,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): . `bert-base-multilingual-uncased` . `bert-base-multilingual-cased` . `bert-base-chinese` + . `bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance @@ -744,7 +747,10 @@ class BertModel(BertPreTrainedModel): """BERT model ("Bidirectional Embedding Representations from a Transformer"). Params: - config: a BertConfig class instance with the configuration to build a new model + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -758,6 +764,9 @@ class BertModel(BertPreTrainedModel): input sequence length in the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 
1.0 => head is fully masked, 0.0 => head is not masked. + Outputs: Tuple of (encoded_layers, pooled_output) `encoded_layers`: controled by `output_all_encoded_layers` argument: @@ -828,15 +837,19 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed - # 1 in head_mask indicate we need to mask the head + # 1.0 in head_mask indicate we mask the head # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape num_hidden_layers x batch x n_heads x N x N if head_mask is not None: if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility head_mask = (1.0 - head_mask) + else: + head_mask = [None] * self.config.num_hidden_layers embedding_output = self.embeddings(input_ids, token_type_ids) encoded_layers = self.encoder(embedding_output, @@ -861,7 +874,10 @@ class BertForPreTraining(BertPreTrainedModel): - the next sentence classification head. Params: - config: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -880,6 +896,8 @@ class BertForPreTraining(BertPreTrainedModel): `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `masked_lm_labels` and `next_sentence_label` are not `None`: @@ -937,7 +955,10 @@ class BertForMaskedLM(BertPreTrainedModel): This module comprises the BERT model followed by the masked language modeling head. Params: - config: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -953,6 +974,12 @@ class BertForMaskedLM(BertPreTrainedModel): `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] + `head_mask`: an optional torch.LongTensor of shape [num_heads] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `masked_lm_labels` is not `None`: @@ -1006,7 +1033,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel): This module comprises the BERT model followed by the next sentence classification head. Params: - config: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -1022,6 +1052,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel): `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `next_sentence_label` is not `None`: @@ -1077,7 +1109,10 @@ class BertForSequenceClassification(BertPreTrainedModel): the pooled output. Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False `num_labels`: the number of classes for the classifier. Default = 2. Inputs: @@ -1093,6 +1128,8 @@ class BertForSequenceClassification(BertPreTrainedModel): a batch has varying length sentences. `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. 
Outputs: if `labels` is not `None`: @@ -1150,7 +1187,10 @@ class BertForMultipleChoice(BertPreTrainedModel): the pooled output. Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False `num_choices`: the number of classes for the classifier. Default = 2. Inputs: @@ -1166,6 +1206,8 @@ class BertForMultipleChoice(BertPreTrainedModel): a batch has varying length sentences. `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `labels` is not `None`: @@ -1226,7 +1268,10 @@ class BertForTokenClassification(BertPreTrainedModel): the full hidden state of the last layer. Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False `num_labels`: the number of classes for the classifier. Default = 2. Inputs: @@ -1242,6 +1287,8 @@ class BertForTokenClassification(BertPreTrainedModel): a batch has varying length sentences. `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `labels` is not `None`: @@ -1306,7 +1353,10 @@ class BertForQuestionAnswering(BertPreTrainedModel): the sequence output that computes start_logits and end_logits Params: - `config`: a BertConfig class instance with the configuration to build a new model. + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -1325,6 +1375,8 @@ class BertForQuestionAnswering(BertPreTrainedModel): `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. Positions are clamped to the length of the sequence and position outside of the sequence are not taken into account for computing the loss. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. 
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `start_positions` and `end_positions` are not `None`: diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 9240ea2bd018dd..e3ae47402eacf4 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -610,7 +610,10 @@ class GPT2Model(GPT2PreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a GPT2Config class instance with the configuration to build a new model + `config`: a GPT2Config class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -625,10 +628,12 @@ class GPT2Model(GPT2PreTrainedModel): `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the presents output of the model, cf. below). + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs a tuple consisting of: - `hidden_states`: the encoded-hidden-states at the top of the model - as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) + as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as torch.FloatTensors. They can be reused to speed up sequential decoding. 
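
To make the new pruning and head-output hooks concrete, here is a hedged sketch (not taken from the patch) mirroring the tests added in this series: it builds a randomly initialised `GPT2Model` from a default `GPT2Config`, prunes a few heads, and inspects the retained per-head outputs. The token ids and the pruned head indices are arbitrary placeholders:

```python
import torch
from pytorch_pretrained_bert import GPT2Config, GPT2Model

config = GPT2Config()                                   # default GPT-2 small hyper-parameters
model = GPT2Model(config, keep_multihead_output=True)
model.eval()

# Remove heads 0 and 1 of the first block and head 2 of the last block; their weights are dropped.
model.prune_heads({0: [0, 1], config.n_layer - 1: [2]})

input_ids = torch.tensor([[464, 3290, 318, 13779]])     # arbitrary token ids
hidden_states, presents = model(input_ids)              # hidden_states: list of n_layer + 1 tensors

# Any scalar works for this sketch; backward populates the retained gradients.
hidden_states[-1].sum().backward()

# One tensor per block, shaped [batch, n_remaining_heads, seq_len, head_dim]; each keeps
# its gradient (multihead_outputs[i].grad), which can feed a head-importance score.
multihead_outputs = model.get_multihead_outputs()
```

`OpenAIGPTModel` gains the same `prune_heads` / `get_multihead_outputs` interface in the GPT patch earlier in this series.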
@@ -698,13 +703,17 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, # Prepare head mask if needed # 1.0 in head_mask indicate we mask the head # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N if head_mask is not None: if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility head_mask = (1.0 - head_mask) + else: + head_mask = [None] * self.config.n_layer input_shape = input_ids.size() input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -725,9 +734,9 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, presents = [] all_attentions = [] all_hidden_states = [] - for block, layer_past in zip(self.h, past): + for i, (block, layer_past) in enumerate(zip(self.h, past)): all_hidden_states.append(hidden_states.view(*output_shape)) - outputs = block(hidden_states, layer_past, head_mask) + outputs = block(hidden_states, layer_past, head_mask[i]) if self.output_attentions: attentions, hidden_states, present = outputs all_attentions.append(attentions) @@ -746,7 +755,10 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners"). Params: - config: a GPT2Config class instance with the configuration to build a new model + `config`: a GPT2Config class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -764,6 +776,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the presents output of the model, cf. below). + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` is not `None`: @@ -828,7 +842,10 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners"). Params: - config: a GPT2Config class instance with the configuration to build a new model + `config`: a GPT2Config class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. 
+ This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token @@ -850,6 +867,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the presents output of the model, cf. below). + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` and `multiple_choice_labels` are not `None`: diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 32c0978dd0f22d..257c2f985c2cfb 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -613,7 +613,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a OpenAIGPTConfig class instance with the configuration to build a new model + `config`: a OpenAIGPTConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -625,10 +628,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: - `hidden_states`: the encoded-hidden-states at the top of the model - as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) + as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... 
d_n are the dimension of input_ids) Example usage: @@ -694,13 +699,17 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N # Prepare head mask if needed # 1.0 in head_mask indicate we mask the head # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N if head_mask is not None: if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility head_mask = (1.0 - head_mask) + else: + head_mask = [None] * self.config.n_layer input_shape = input_ids.size() input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -720,8 +729,8 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N all_attentions = [] all_hidden_states = [hidden_states.view(*output_shape)] - for block in self.h: - outputs = block(hidden_states, head_mask) + for i, block in enumerate(self.h): + outputs = block(hidden_states, head_mask[i]) if self.output_attentions: attentions, hidden_states = outputs all_attentions.append(attentions) @@ -755,7 +764,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a OpenAIGPTConfig class instance with the configuration to build a new model + `config`: a OpenAIGPTConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] @@ -770,6 +782,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` is not `None`: @@ -847,7 +861,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): You should use the associate indices to index the embeddings. Params: - config: a OpenAIGPTConfig class instance with the configuration to build a new model + `config`: a OpenAIGPTConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token @@ -866,6 +883,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): is only computed for the labels set in [0, ..., total_tokens_embeddings] `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. Outputs: if `lm_labels` and `multiple_choice_labels` are not `None`: diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index c9a7a64b5ac0b3..122b9c7913ccfa 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -215,9 +215,9 @@ def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_id for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): model = model_class(config=config, keep_multihead_output=True) model.eval() - head_mask = torch.ones(self.n_head).to(input_ids.device) - head_mask[0] = 0.0 - head_mask[-1] = 0.0 # Mask all but the first and last heads + head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device) + head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer if isinstance(model, GPT2DoubleHeadsModel): output = model(input_ids, mc_token_ids, head_mask=head_mask) else: @@ -246,6 +246,25 @@ def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_id len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()), self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[1].nonzero()), + multihead_outputs[1].numel()) + + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 1:, :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids): for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index 86234e57caf5a7..adb3671ef5f033 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -188,9 +188,9 @@ def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): model = model_class(config=config, keep_multihead_output=True) model.eval() - head_mask = torch.ones(self.n_head).to(input_ids.device) - head_mask[0] = 0.0 - head_mask[-1] = 0.0 # Mask all but the first and last heads + head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device) + head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer + 
head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer if isinstance(model, OpenAIGPTDoubleHeadsModel): output = model(input_ids, mc_token_ids, head_mask=head_mask) else: @@ -219,6 +219,26 @@ def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()), self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[1].nonzero()), + multihead_outputs[1].numel()) + + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size * self.n_choices, self.n_head, + self.seq_length, self.n_embd // self.n_head]) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 1:, :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 0, :, :].nonzero()), + self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head) + + def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids): for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): diff --git a/tests/modeling_test.py b/tests/modeling_test.py index b23edf1aea5afd..80bb3d3c95688e 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -305,9 +305,9 @@ def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_id else: model = model_class(config=config, keep_multihead_output=True) model.eval() - head_mask = torch.ones(self.num_attention_heads).to(input_ids.device) - head_mask[0] = 0.0 - head_mask[-1] = 0.0 # Mask all but the first and last heads + head_mask = torch.zeros(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device) + head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask) if isinstance(model, BertModel): @@ -333,6 +333,25 @@ def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_id len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()), self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + self.parent.assertListEqual( + list(multihead_outputs[1].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertEqual( + len(multihead_outputs[1].nonzero()), + multihead_outputs[1].numel()) + + self.parent.assertListEqual( + list(multihead_outputs[-1].size()), + [self.batch_size, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 1:, :, :].nonzero()), + 0) + self.parent.assertEqual( + len(multihead_outputs[-1][:, 0, :, :].nonzero()), + self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads) + def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction, From 4447f270b2ad30eeb374b5171913f3050340c506 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 16:21:28 +0200 Subject: [PATCH 100/144] updating 
hub --- README.md | 45 ++++++++++++++++++++++++++++++++++++++++ hubconfs/bert_hubconf.py | 12 +++++++++++ hubconfs/gpt2_hubconf.py | 14 ++++++++----- hubconfs/gpt_hubconf.py | 17 ++++++++------- 4 files changed, 76 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index fa186adf1ededf..9ddeba808efbbe 100644 --- a/README.md +++ b/README.md @@ -309,6 +309,28 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] assert predicted_token == '.' ``` +And how to use `OpenAIGPTDoubleHeadsModel` + +```python +# Load pre-trained model (weights) +model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') +model.eval() + +# Prepare tokenized input +text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" +text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" +tokenized_text1 = tokenizer.tokenize(text1) +tokenized_text2 = tokenizer.tokenize(text2) +indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) +indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) +tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) +mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + +# Predict hidden states features for each layer +with torch.no_grad(): + lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) +``` + ### Transformer-XL Here is a quick-start example using `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLModelLMHeadModel` class with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes. @@ -456,6 +478,29 @@ predicted_index = torch.argmax(predictions_2[0, -1, :]).item() predicted_token = tokenizer.decode([predicted_index]) ``` +And how to use `GPT2DoubleHeadsModel` + +```python +# Load pre-trained model (weights) +model = GPT2DoubleHeadsModel.from_pretrained('gpt2') +model.eval() + +# Prepare tokenized input +text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" +text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" +tokenized_text1 = tokenizer.tokenize(text1) +tokenized_text2 = tokenizer.tokenize(text2) +indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) +indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) +tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) +mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + +# Predict hidden states features for each layer +with torch.no_grad(): + lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids) +``` + + ## Doc Here is a detailed documentation of the classes in the package and how to use them: diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 0595bdeccb963e..3769c2567f51cc 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -23,6 +23,9 @@ . `bert-base-multilingual-uncased` . `bert-base-multilingual-cased` . `bert-base-chinese` + . `bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining @@ -81,6 +84,7 @@ def bertTokenizer(*args, **kwargs): Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] Example: + >>> import torch >>> sentence = 'Hello, World!' 
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) >>> toks = tokenizer.tokenize(sentence) @@ -101,6 +105,7 @@ def bertModel(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -129,6 +134,7 @@ def bertForNextSentencePrediction(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -158,6 +164,7 @@ def bertForPreTraining(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -181,6 +188,7 @@ def bertForMaskedLM(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -222,6 +230,7 @@ def bertForSequenceClassification(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -256,6 +265,7 @@ def bertForMultipleChoice(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -288,6 +298,7 @@ def bertForQuestionAnswering(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -326,6 +337,7 @@ def bertForTokenClassification(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py index 26b53e8b0333f4..3ac8bc72ab57fc 100644 --- a/hubconfs/gpt2_hubconf.py +++ b/hubconfs/gpt2_hubconf.py @@ -11,7 +11,7 @@ Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: - . `gpt2` + . `gpt2`, `gpt2-medium` - a path or url to a pretrained model archive containing: . `gpt2_config.json` a configuration file for the model . 
`pytorch_model.bin` a PyTorch dump of a GPT2Model instance @@ -147,10 +147,14 @@ def gpt2DoubleHeadsModel(*args, **kwargs): >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') # Prepare tokenized input - >>> text = "Who was Jim Henson ?" - >>> indexed_tokens = tokenizer.encode(text) - >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> mc_token_ids = torch.LongTensor([ [len(indexed_tokens)] ]) + >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" + >>> tokenized_text1 = tokenizer.tokenize(text1) + >>> tokenized_text2 = tokenizer.tokenize(text2) + >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) + >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) + >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) + >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) # Load gpt2DoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py index 77162dc244f986..f3d03888aec137 100644 --- a/hubconfs/gpt_hubconf.py +++ b/hubconfs/gpt_hubconf.py @@ -126,7 +126,7 @@ def openAIGPTLMHeadModel(*args, **kwargs): Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') # Prepare tokenized input @@ -161,15 +161,18 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs): Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') # Prepare tokenized input - >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" - >>> tokenized_text = tokenizer.tokenize(text) - >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) + >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> text2 = "Who was Jim Henson ? 
Jim Henson was a mysterious young man" + >>> tokenized_text1 = tokenizer.tokenize(text1) + >>> tokenized_text2 = tokenizer.tokenize(text2) + >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) + >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) + >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) + >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) # Load openAIGPTDoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt') From 382e2d1e50e5ea95c4393b5758e0b0907f43e1c5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 10:37:16 +0200 Subject: [PATCH 101/144] spliting config and weight files for bert also --- README.md | 19 ++++ examples/bertology.py | 92 +++++++++++++++++++ pytorch_pretrained_bert/modeling.py | 78 +++++++++------- pytorch_pretrained_bert/modeling_gpt2.py | 3 - pytorch_pretrained_bert/modeling_openai.py | 3 - .../modeling_transfo_xl.py | 3 - 6 files changed, 158 insertions(+), 40 deletions(-) create mode 100644 examples/bertology.py diff --git a/README.md b/README.md index 9ddeba808efbbe..a596dc4c649ca0 100644 --- a/README.md +++ b/README.md @@ -1432,6 +1432,25 @@ The results were similar to the above FP32 results (actually slightly higher): {"exact_match": 84.65468306527909, "f1": 91.238669287002} ``` +Here is an example with the recent `bert-large-uncased-whole-word-masking`: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 \ + run_squad.py \ + --bert_model bert-large-uncased-whole-word-masking \ + --do_train \ + --do_predict \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ +``` + ## Notebooks We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. 
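As an illustrative sketch (not part of the patch), the `bert-large-uncased-whole-word-masking` checkpoint referenced above can be loaded like any other BERT model for extractive QA; the span decoding below is deliberately simplified:

```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForQuestionAnswering

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking', do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking')
model.eval()

text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokens = tokenizer.tokenize(text)
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# Segment ids: 0 for the question up to the first [SEP], 1 for the context
sep_index = tokens.index('[SEP]')
segment_ids = torch.tensor([[0] * (sep_index + 1) + [1] * (len(tokens) - sep_index - 1)])

with torch.no_grad():
    # Without start/end positions the model returns the two sets of span logits
    start_logits, end_logits = model(input_ids, segment_ids)

start, end = torch.argmax(start_logits).item(), torch.argmax(end_logits).item()
print(" ".join(tokens[start:end + 1]))
```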
diff --git a/examples/bertology.py b/examples/bertology.py new file mode 100644 index 00000000000000..0ceac28ff93c45 --- /dev/null +++ b/examples/bertology.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +import argparse +import logging +from tqdm import trange + +import torch +import torch.nn.functional as F +import numpy as np + +from pytorch_pretrained_bert import BertModel, BertTokenizer + +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) + +def run_model(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased', + help='pretrained model name or path to local checkpoint') + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--batch_size", type=int, default=-1) + parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.') + args = parser.parse_args() + print(args) + + if args.batch_size == -1: + args.batch_size = 1 + assert args.nsamples % args.batch_size == 0 + + np.random.seed(args.seed) + torch.random.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path) + model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path) + model.to(device) + model.eval() + + if args.length == -1: + args.length = model.config.n_ctx // 2 + elif args.length > model.config.n_ctx: + raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) + + while True: + context_tokens = [] + if not args.unconditional: + raw_text = input("Model prompt >>> ") + while not raw_text: + print('Prompt should not be empty!') + raw_text = input("Model prompt >>> ") + context_tokens = enc.encode(raw_text) + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=context_tokens, + start_token=None, + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:, len(context_tokens):].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) + else: + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=None, + start_token=enc.encoder['<|endoftext|>'], + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:,1:].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) + +if __name__ == '__main__': + run_model() + + diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index fa2cba9aa7e0ce..006e6a1c735937 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -22,9 +22,6 @@ import logging import math import os -import shutil -import tarfile -import tempfile import sys from io import open @@ -37,16 +34,28 @@ logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", - 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased.tar.gz", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking.tar.gz", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", +} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", } BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' @@ -642,11 +651,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] else: - archive_file = pretrained_model_name_or_path + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: logger.error( @@ -661,22 +673,26 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file)) return None - if resolved_archive_file == archive_file: - logger.info("loading archive file {}".format(archive_file)) + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) else: - logger.info("loading archive file {} from cache at {}".format( + logger.info("loading weights file {} from cache at {}".format( archive_file, resolved_archive_file)) - tempdir = None - if os.path.isdir(resolved_archive_file) or from_tf: - serialization_dir = resolved_archive_file - else: - # Extract archive to temp dir - tempdir = tempfile.mkdtemp() - logger.info("extracting archive file {} to temp dir {}".format( - resolved_archive_file, tempdir)) - with tarfile.open(resolved_archive_file, 'r:gz') as archive: - archive.extractall(tempdir) - serialization_dir = tempdir + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + ### Switching to split config/weight files configuration + # tempdir = None + # if os.path.isdir(resolved_archive_file) or from_tf: + # serialization_dir = resolved_archive_file + # else: + # # Extract archive to temp dir + # tempdir = tempfile.mkdtemp() + # logger.info("extracting archive file {} to temp dir {}".format( + # resolved_archive_file, tempdir)) + # with tarfile.open(resolved_archive_file, 'r:gz') as archive: + # archive.extractall(tempdir) + # serialization_dir = tempdir # Load config config_file = os.path.join(serialization_dir, CONFIG_NAME) if not os.path.exists(config_file): @@ -689,9 +705,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load(weights_path, map_location='cpu') - if tempdir: - # Clean up temp dir - shutil.rmtree(tempdir) + # if tempdir: + # # Clean up temp dir + # shutil.rmtree(tempdir) if from_tf: # Directly load from a TensorFlow checkpoint weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index e3ae47402eacf4..caa9cf809ca957 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -23,9 +23,6 @@ import logging import math 
import os -import shutil -import tarfile -import tempfile import sys from io import open diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 257c2f985c2cfb..d525c96e7722be 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -23,9 +23,6 @@ import logging import math import os -import shutil -import tarfile -import tempfile import sys from io import open diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index 12a1535ff641af..b3e829670ae781 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -25,9 +25,6 @@ import json import math import logging -import tarfile -import tempfile -import shutil import collections import sys from io import open From 64e0adda81cc8fca8de9b9a3639d02925b8fdffe Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 10:51:31 +0200 Subject: [PATCH 102/144] better error message --- pytorch_pretrained_bert/modeling.py | 17 ++++++++++- pytorch_pretrained_bert/modeling_gpt2.py | 22 +++++++++++++-- pytorch_pretrained_bert/modeling_openai.py | 22 +++++++++++++-- .../modeling_transfo_xl.py | 28 +++++++++++++++---- 4 files changed, 76 insertions(+), 13 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 006e6a1c735937..25f9fe79cf832c 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -658,7 +658,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: logger.error( @@ -673,6 +672,22 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file)) return None + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), + config_file)) + return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) logger.info("loading configuration file {}".format(config_file)) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index caa9cf809ca957..dd195fc8806bb1 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -493,7 +493,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: logger.error( @@ -502,10 +501,27 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): else: logger.error( "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " + "We assumed '{}' was a path or url but couldn't find file {} " "at this path or url.".format( pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - archive_file, config_file + archive_file + ) + ) + return None + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + config_file ) ) return None diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index d525c96e7722be..91848f3c6804e2 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -496,7 +496,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: logger.error( @@ -505,10 +504,27 @@ def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, else: logger.error( "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find files {} and {} " + "We assumed '{}' was a path or url but couldn't find file {} " "at this path or url.".format( pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - archive_file, config_file + archive_file + ) + ) + return None + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + config_file ) ) return None diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index b3e829670ae781..534a111c7781fc 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -921,7 +921,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: logger.error( @@ -930,12 +929,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): else: logger.error( "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} and {} " + "We assumed '{}' was a path or url but couldn't find file {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file + ) + ) + return None + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find file {} " "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - pretrained_model_name_or_path, - archive_file, config_file)) + pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + config_file + ) + ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) From 868de8d1d7c227cd30d470509c4737b5ce8c083d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 10:58:20 +0200 Subject: [PATCH 103/144] updating weights loading --- pytorch_pretrained_bert/modeling.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 25f9fe79cf832c..50742406852e95 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -653,8 +653,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] else: - archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + if from_tf: + # Directly load from a TensorFlow checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, BERT_CONFIG_NAME) + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) @@ -708,24 +713,24 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # with tarfile.open(resolved_archive_file, 'r:gz') as archive: # archive.extractall(tempdir) # serialization_dir = tempdir + # config_file = os.path.join(serialization_dir, CONFIG_NAME) + # if not os.path.exists(config_file): + # # Backward compatibility with old naming format + # config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME) # Load config - config_file = os.path.join(serialization_dir, CONFIG_NAME) - if not os.path.exists(config_file): - # Backward compatibility with old naming format - config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME) - config = BertConfig.from_json_file(config_file) + config = BertConfig.from_json_file(resolved_config_file) logger.info("Model config {}".format(config)) # Instantiate model. 
model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: - weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load(weights_path, map_location='cpu') + # weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(resolved_archive_file, map_location='cpu') # if tempdir: # # Clean up temp dir # shutil.rmtree(tempdir) if from_tf: # Directly load from a TensorFlow checkpoint - weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) + # weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) return load_tf_weights_in_bert(model, weights_path) # Load from a PyTorch state_dict old_keys = [] From f964753090cc72df6817e57104d777d77613c865 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 11:36:28 +0200 Subject: [PATCH 104/144] explanation on the current location of the caching folder --- README.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a596dc4c649ca0..b4c0ab9f01b7ca 100644 --- a/README.md +++ b/README.md @@ -516,7 +516,9 @@ Here is a detailed documentation of the classes in the package and how to use th ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump -To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as +### `from_pretrained()` method + +To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method: ```python model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs) @@ -581,6 +583,22 @@ model = GPT2Model.from_pretrained('gpt2') ``` +#### Cache directory + +`pytorch_pretrained_bert` save the pretrained weights in a cache directory which is located at (in this order of priority): + +- `cache_dir` optional arguments to the `from_pretrained()` method (see above), +- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`, +- PyTorch cache home + `/pytorch_pretrained_bert/` + where PyTorch cache home is defined by (in this order): + - shell environment variable `ENV_TORCH_HOME` + - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`) + - default: `~/.cache/torch/` + +Usually, if you don't set any specific environment variable, `pytorch_pretrained_bert` cache will be at `~/.cache/torch/pytorch_pretrained_bert/`. + +You can alsways safely delete `pytorch_pretrained_bert` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3. + ### Serialization best-practices This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL). @@ -590,6 +608,13 @@ There are three types of files you need to save to be able to reload a fine-tune - the configuration file of the model which is saved as a JSON file, and - the vocabulary (and the merges for the BPE-based models GPT and GPT-2). 
+The defaults files names of these files are as follow: + +- the model weights file: `pytorch_model.bin`, +- the configuration file: `config.json`, +- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary), +- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`. + Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards: ```python From a40955f0710478b141a7bbbca7edea45701eb833 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 11:46:14 +0200 Subject: [PATCH 105/144] no need to duplicate models anymore --- examples/run_squad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index a3525b1ee55709..c3ae8c23b8fce9 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -895,8 +895,8 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model - model = BertForQuestionAnswering.from_pretrained(args.bert_model, - cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) + model = BertForQuestionAnswering.from_pretrained(args.bert_model), + # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) if args.fp16: model.half() From 9ce37af99b39fcc0dcb245fbcb63d9d0510f6834 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 11:47:54 +0200 Subject: [PATCH 106/144] oups --- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index c3ae8c23b8fce9..d4bfc025563654 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -895,7 +895,7 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model - model = BertForQuestionAnswering.from_pretrained(args.bert_model), + model = BertForQuestionAnswering.from_pretrained(args.bert_model) # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) if args.fp16: From 2ef5e0de871e108cbd8d52c26fb47efeca6ee087 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 12:03:13 +0200 Subject: [PATCH 107/144] switch to pytorch DistributedDataParallel --- examples/run_squad.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index d4bfc025563654..4704b7d4e824d5 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -902,12 +902,12 @@ def main(): model.half() model.to(device) if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + # try: + # from apex.parallel import DistributedDataParallel as DDP + # except ImportError: + # raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - model = DDP(model) + model = torch.nn.parallel.DistributedDataParallel(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) From a59abedfb5301ede6923ef0312de2ae5fa34fc97 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 12:06:26 +0200 Subject: [PATCH 108/144] DDP update --- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 4704b7d4e824d5..6378f443e4f32a 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -907,7 +907,7 @@ def main(): # except ImportError: # raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - model = torch.nn.parallel.DistributedDataParallel(model) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) From d82e5deeb126480b47f4368141c9fc7c0d733d45 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 12:13:14 +0200 Subject: [PATCH 109/144] set find_unused_parameters=True in DDP --- README.md | 7 ++++--- examples/run_squad.py | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b4c0ab9f01b7ca..cd6abcacdd9807 100644 --- a/README.md +++ b/README.md @@ -1468,12 +1468,13 @@ python -m torch.distributed.launch --nproc_per_node=8 \ --do_lower_case \ --train_file $SQUAD_DIR/train-v1.1.json \ --predict_file $SQUAD_DIR/dev-v1.1.json \ - --train_batch_size 12 \ --learning_rate 3e-5 \ - --num_train_epochs 2.0 \ + --num_train_epochs 2 \ --max_seq_length 384 \ --doc_stride 128 \ - --output_dir /tmp/debug_squad/ + --output_dir /tmp/debug_squad/ \ + --train_batch_size 24 \ + --gradient_accumulation_steps 2 ``` ## Notebooks diff --git a/examples/run_squad.py b/examples/run_squad.py index 6378f443e4f32a..313cb453aff1fa 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -907,7 +907,10 @@ def main(): # except ImportError: # raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) From 326944d627ad166f0e6a6921b7168a2caf31dd1e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 14:02:42 +0200 Subject: [PATCH 110/144] add tensorboard to run_squad --- examples/run_squad.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 313cb453aff1fa..775e93e4dbdbea 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -34,6 +34,8 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange +from tensorboardX import SummaryWriter + from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule @@ -915,9 +917,8 @@ def main(): model = torch.nn.DataParallel(model) if args.do_train: - + writer = SummaryWriter() # Prepare data loader - train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( @@ -999,7 +1000,7 @@ def main(): logger.info(" Num steps = %d", num_train_optimization_steps) model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch"): + for epoch in trange(int(args.num_train_epochs), desc="Epoch"): for step, 
batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self @@ -1015,6 +1016,8 @@ def main(): else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: + writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + writer.add_scalar('loss', loss.item(), global_step) if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically From 335f57baf86094907a14de7ddc9f3e791ae3519b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 14:03:46 +0200 Subject: [PATCH 111/144] only on main process --- examples/run_squad.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 775e93e4dbdbea..e6afeab1c2d012 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -917,7 +917,8 @@ def main(): model = torch.nn.DataParallel(model) if args.do_train: - writer = SummaryWriter() + if args.local_rank in [-1, 0]: + writer = SummaryWriter() # Prepare data loader train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) @@ -1016,8 +1017,9 @@ def main(): else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: - writer.add_scalar('lr', optimizer.get_lr()[0], global_step) - writer.add_scalar('loss', loss.item(), global_step) + if args.local_rank in [-1, 0]: + writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + writer.add_scalar('loss', loss.item(), global_step) if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically From c5407f343f428957e44b0898a9d3f7347c10b63f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 14:29:03 +0200 Subject: [PATCH 112/144] split squad example in two --- examples/run_squad.py | 729 +-------------------------- examples/run_squad_dataset_utils.py | 740 ++++++++++++++++++++++++++++ 2 files changed, 746 insertions(+), 723 deletions(-) create mode 100644 examples/run_squad_dataset_utils.py diff --git a/examples/run_squad.py b/examples/run_squad.py index e6afeab1c2d012..e904187500ee7e 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -36,12 +36,12 @@ from tensorboardX import SummaryWriter -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME -from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig +from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForQuestionAnswering from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule -from pytorch_pretrained_bert.tokenization import (BasicTokenizer, - BertTokenizer, - whitespace_tokenize) +from pytorch_pretrained_bert.tokenization import BertTokenizer + +from run_squad_dataset_utils import read_squad_examples, convert_examples_to_features, RawResult, write_predictions if sys.version_info[0] == 2: import cPickle as pickle @@ -51,717 +51,6 @@ logger = logging.getLogger(__name__) -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. 
- actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. 
- _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - start_position = None - end_position = None - if is_training and not example.is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. 
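# Worked example with the same hypothetical sizes as above and a 10-token query, so that
# doc_offset = len(query_tokens) + 2 = 12: if the answer occupies sub-tokens 400-402, the first
# window (tokens 0-383) does not contain it and the positions are set to 0 (the [CLS] token);
# in the second window (doc_start = 128) they become 400 - 128 + 12 = 284 and
# 402 - 128 + 12 = 286.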
- doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if is_training and example.is_impossible: - start_position = 0 - end_position = 0 - if example_index < 20: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if is_training and example.is_impossible: - logger.info("impossible example") - if is_training and not example.is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - logger.info("start_position: %d" % (start_position)) - logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible)) - unique_id += 1 - - return features - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. 
- tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - - -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): - """Write final predictions to the json file and log-odds of null if needed.""" - logger.info("Writing predictions to: %s" % (output_prediction_file)) - logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min null score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, 
n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant - if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - # if we didn't include the empty option in the n-best, include it - if version_2_with_negative: - if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. 
- tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - def main(): parser = argparse.ArgumentParser() @@ -898,17 +187,11 @@ def main(): # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model) - # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) if args.fp16: model.half() model.to(device) if args.local_rank != -1: - # try: - # from apex.parallel import DistributedDataParallel as DDP - # except ImportError: - # raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, @@ -939,6 +222,7 @@ def main(): logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = 
torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) @@ -956,7 +240,6 @@ def main(): num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer - param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used diff --git a/examples/run_squad_dataset_utils.py b/examples/run_squad_dataset_utils.py new file mode 100644 index 00000000000000..4043ee57f81401 --- /dev/null +++ b/examples/run_squad_dataset_utils.py @@ -0,0 +1,740 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Load SQuAD dataset. """ + +from __future__ import absolute_import, division, print_function + +import json +import logging +import math +import collections +from io import open + +from pytorch_pretrained_bert.tokenization import BasicTokenizer, whitespace_tokenize + +logger = logging.getLogger(__name__) + + +class SquadExample(object): + """ + A single training/test example for the Squad dataset. + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (self.qas_id) + s += ", question_text: %s" % ( + self.question_text) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.end_position: + s += ", end_position: %d" % (self.end_position) + if self.is_impossible: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training, version_2_with_negative): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r", encoding='utf-8') as reader: + 
input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
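# Tiny hypothetical example of the padding below: if this window holds 7 real tokens
# ([CLS] q1 q2 [SEP] d1 d2 [SEP]) and max_seq_length were 10, the loop appends three zeros,
# giving input_mask = [1, 1, 1, 1, 1, 1, 1, 0, 0, 0] and
# segment_ids = [0, 0, 0, 0, 1, 1, 1, 0, 0, 0].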
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + if example_index < 20: + logger.info("*** Example ***") + logger.info("unique_id: %s" % (unique_id)) + logger.info("example_index: %s" % (example_index)) + logger.info("doc_span_index: %s" % (doc_span_index)) + logger.info("tokens: %s" % " ".join(tokens)) + logger.info("token_to_orig_map: %s" % " ".join([ + "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + logger.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + ])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + logger.info("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + logger.info("start_position: %d" % (start_position)) + logger.info("end_position: %d" % (end_position)) + logger.info( + "answer: %s" % (answer_text)) + + features.append( + InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible)) + unique_id += 1 + + return features + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". 
Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, verbose_logging, + version_2_with_negative, null_score_diff_threshold): + """Write final predictions to the json file and log-odds of null if needed.""" + logger.info("Writing predictions to: %s" % (output_prediction_file)) + logger.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for 
(feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. 
+ if len(nbest)==1: + nbest.insert(0, + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. 
If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs From a432b3d466132e446e2c452a9012bb576cf9f361 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 14:39:09 +0200 Subject: [PATCH 113/144] distributed traing t_total --- examples/run_squad.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index e904187500ee7e..fb3b4b7d34af9d 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -234,10 +234,11 @@ def main(): train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() + # if args.local_rank != -1: + # num_train_optimization_steps = num_train_optimization_steps // 
torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) From e6e5f1925764dbec2d5bdeec60a3216995bacc49 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 14:45:14 +0200 Subject: [PATCH 114/144] fix --- examples/run_squad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index fb3b4b7d34af9d..ce17b789e269db 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -201,7 +201,7 @@ def main(): if args.do_train: if args.local_rank in [-1, 0]: - writer = SummaryWriter() + tb_writer = SummaryWriter() # Prepare data loader train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) @@ -302,8 +302,8 @@ def main(): loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.local_rank in [-1, 0]: - writer.add_scalar('lr', optimizer.get_lr()[0], global_step) - writer.add_scalar('loss', loss.item(), global_step) + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically From 15ebd67d4e1bd596ad0d2d54bbf3ccc1ae799d27 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 15:58:22 +0200 Subject: [PATCH 115/144] cache in run_classifier + various fixes to the examples --- README.md | 56 ++- examples/bertology.py | 53 +- examples/run_classifier.py | 600 ++--------------------- examples/run_classifier_dataset_utils.py | 571 +++++++++++++++++++++ examples/run_squad.py | 9 +- 5 files changed, 665 insertions(+), 624 deletions(-) create mode 100644 examples/run_classifier_dataset_utils.py diff --git a/README.md b/README.md index cd6abcacdd9807..7d7fea41824583 100644 --- a/README.md +++ b/README.md @@ -541,6 +541,7 @@ where - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert) - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) + - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869* - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters @@ -608,13 +609,15 @@ There are three types of files you need to save to be able to reload a fine-tune - the configuration file of the model which is saved as a JSON file, and - the vocabulary (and the merges for the BPE-based models GPT and GPT-2). 
-The defaults files names of these files are as follow: +The *default filenames* of these files are as follow: - the model weights file: `pytorch_model.bin`, - the configuration file: `config.json`, - the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary), - for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`. +**If you save a model using these *default filenames*, you can then re-load the model and tokenizer using the `from_pretrained()` method.** + Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards: ```python @@ -1268,6 +1271,30 @@ python run_classifier.py \ --fp16 ``` +**Distributed training** +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 93 on SQuAD: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 \ + run_classifier.py \ + --bert_model bert-large-cased-whole-word-masking \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --max_seq_length 128 \ + --train_batch_size 64 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ +``` + +Training with these hyper-parameters gave us the following results: +```bash +{"exact_match": 86.91579943235573, "f1": 93.1532499015869} +``` + #### SQuAD This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. @@ -1298,9 +1325,36 @@ python run_squad.py \ Training with the previous hyper-parameters gave us the following results: ```bash +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json {"f1": 88.52381567990474, "exact_match": 81.22043519394512} ``` +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 93 on SQuAD: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 \ + run_squad.py \ + --bert_model bert-large-cased-whole-word-masking \ + --do_train \ + --do_predict \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir ../models/train_squad_large_cased_wwm/ \ + --train_batch_size 24 \ + --gradient_accumulation_steps 12 +``` + +Training with these hyper-parameters gave us the following results: +```bash +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/train_squad_large_cased_wwm/predictions.json +{"exact_match": 86.91579943235573, "f1": 93.1532499015869} +``` + #### SWAG The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf) diff --git a/examples/bertology.py b/examples/bertology.py index 0ceac28ff93c45..7db2f9e51e4f2b 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -20,8 +20,6 @@ def run_model(): parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased', help='pretrained model name or path to local checkpoint') parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--batch_size", type=int, default=-1) - parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.') args = parser.parse_args() print(args) @@ -34,57 +32,12 @@ def run_model(): torch.cuda.manual_seed(args.seed) device 
= torch.device("cuda" if torch.cuda.is_available() else "cpu") - enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path) - model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path) + tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + model = BertModel.from_pretrained(args.model_name_or_path) model.to(device) model.eval() - if args.length == -1: - args.length = model.config.n_ctx // 2 - elif args.length > model.config.n_ctx: - raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) - - while True: - context_tokens = [] - if not args.unconditional: - raw_text = input("Model prompt >>> ") - while not raw_text: - print('Prompt should not be empty!') - raw_text = input("Model prompt >>> ") - context_tokens = enc.encode(raw_text) - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=context_tokens, - start_token=None, - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:, len(context_tokens):].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) - else: - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=None, - start_token=enc.encoder['<|endoftext|>'], - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:,1:].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) + if __name__ == '__main__': run_model() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 94099204deef3f..20bc59ded8b63c 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -18,548 +18,29 @@ from __future__ import absolute_import, division, print_function import argparse -import csv import logging import os import random -import sys +from tqdm import tqdm, trange import numpy as np -import math + import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange - from torch.nn import CrossEntropyLoss, MSELoss -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME -from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig +from tensorboardX import SummaryWriter + +from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForSequenceClassification from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule -logger = logging.getLogger(__name__) +from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. 
The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI 
Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - 
continue - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, output_mode): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = {label : i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. 
- # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - segment_ids = [0] * len(tokens) - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - segment_ids += [1] * (len(tokens_b) + 1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - input_mask += padding - segment_ids += padding - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - logger.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def simple_accuracy(preds, labels): - return (preds == labels).mean() - - -def acc_and_f1(preds, labels): - acc = simple_accuracy(preds, labels) - f1 = f1_score(y_true=labels, y_pred=preds) - return { - "acc": acc, - "f1": f1, - "acc_and_f1": (acc + f1) / 2, - } - - -def pearson_and_spearman(preds, labels): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2, - } - - -def compute_metrics(task_name, preds, labels): - assert len(preds) == len(labels) - if task_name == "cola": - return {"mcc": matthews_corrcoef(labels, preds)} - elif task_name == "sst-2": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mrpc": - return acc_and_f1(preds, labels) - elif task_name == "sts-b": - return pearson_and_spearman(preds, labels) - elif task_name == "qqp": - return acc_and_f1(preds, labels) - elif task_name == "mnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mnli-mm": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "qnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "rte": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "wnli": - return {"acc": simple_accuracy(preds, labels)} - else: - raise KeyError(task_name) +logger = logging.getLogger(__name__) def main(): @@ -661,31 +142,6 @@ def main(): ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() - processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, - } - - output_modes = { - "cola": "classification", - "mnli": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", - } - if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() @@ -737,30 +193,39 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model - cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) - model = BertForSequenceClassification.from_pretrained(args.bert_model, - cache_dir=cache_dir, - num_labels=num_labels) + model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: + if args.local_rank in [-1, 0]: + 
tb_writer = SummaryWriter() # Prepare data loader - train_examples = processor.get_train_examples(args.data_dir) - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode) + cached_train_features_file = args.data_dir + '_{0}_{1}_{2}'.format( + list(filter(None, args.bert_model.split('/'))).pop(), + str(args.max_seq_length), + str(task_name)) + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) + except: + train_features = convert_examples_to_features( + train_examples, label_list, args.max_seq_length, tokenizer, output_mode) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving train features into cached file %s", cached_train_features_file) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) @@ -778,8 +243,6 @@ def main(): train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer @@ -863,6 +326,9 @@ def main(): optimizer.step() optimizer.zero_grad() global_step += 1 + if args.local_rank in [-1, 0]: + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer diff --git a/examples/run_classifier_dataset_utils.py b/examples/run_classifier_dataset_utils.py new file mode 100644 index 00000000000000..4924afaea25192 --- /dev/null +++ b/examples/run_classifier_dataset_utils.py @@ -0,0 +1,571 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT classification fine-tuning: utilities to work with GLUE tasks """ + +from __future__ import absolute_import, division, print_function + +import csv +import logging +import os +import sys + +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score + +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. 
For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + 
def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), + "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + 
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label : i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
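+ # For example, with max_length = 6, len(tokens_a) = 5 and len(tokens_b) = 3,
+ # tokens_a is popped twice (total length 8 -> 7 -> 6), leaving three tokens
+ # from each sequence.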
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} + +output_modes = { + "cola": "classification", + "mnli": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} diff --git a/examples/run_squad.py b/examples/run_squad.py index ce17b789e269db..32e20f9c9457d4 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -18,10 +18,7 @@ from __future__ import absolute_import, division, print_function import argparse -import collections -import json import logging -import math import os import random import sys @@ -301,9 +298,6 @@ def main(): else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: - if args.local_rank in [-1, 0]: - tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) - tb_writer.add_scalar('loss', loss.item(), global_step) if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically @@ -313,6 +307,9 @@ def main(): optimizer.step() optimizer.zero_grad() global_step += 1 + if args.local_rank in [-1, 0]: + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer From 9710b68dbcdfc2b94264a68d35885ee70faaeb2f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:01:15 +0200 Subject: [PATCH 116/144] fix pickles --- examples/run_classifier.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/run_classifier.py 
b/examples/run_classifier.py index 20bc59ded8b63c..45131103abb76f 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -40,6 +40,12 @@ from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + + logger = logging.getLogger(__name__) From 97277232432f3188e6d8f5b75f428ada21773254 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:02:42 +0200 Subject: [PATCH 117/144] fix pickle --- examples/run_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 45131103abb76f..6166cd71948a5b 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -20,6 +20,7 @@ import argparse import logging import os +import sys import random from tqdm import tqdm, trange From 7388c83b60e97c65e399fbb88b0da1ade9897dc0 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:32:49 +0200 Subject: [PATCH 118/144] update run_classifier for distributed eval --- examples/run_classifier.py | 47 +++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 6166cd71948a5b..49fb3954b32d23 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -50,6 +50,15 @@ logger = logging.getLogger(__name__) +def average_distributed_scalar(scalar, args): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if args.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + def main(): parser = argparse.ArgumentParser() @@ -158,6 +167,7 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') + args.device = device logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', @@ -337,6 +347,8 @@ def main(): tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) + ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() + ### Example: if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self @@ -352,11 +364,21 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) - else: - model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) - model.to(device) - if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + # Distributed/fp16/parallel settings (optional) + model.to(device) + if args.fp16: + model.half() + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + elif n_gpu > 
1: + model = torch.nn.DataParallel(model) + + ### Evaluation + if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) @@ -374,7 +396,10 @@ def main(): eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) + if args.local_rank == -1: + eval_sampler = SequentialSampler(eval_data) + else: + eval_sampler = DistributedSampler(eval_data) # Note that this sampler samples randomly eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() @@ -398,7 +423,7 @@ def main(): elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) - + eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: @@ -414,6 +439,11 @@ def main(): elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) + + if args.local_rank != -1: + # Average over distributed nodes if needed + result = {key: average_distributed_scalar(value, args) for key, value in result.items()} + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss @@ -482,6 +512,11 @@ def main(): preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) + + if args.local_rank != -1: + # Average over distributed nodes if needed + result = {key: average_distributed_scalar(value, args) for key, value in result.items()} + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss From 40dbda6871263067e3cf2030a1e9aaffef7837e5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:45:52 +0200 Subject: [PATCH 119/144] updating classification example --- examples/run_classifier.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 49fb3954b32d23..0add05113fe6c7 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -228,10 +228,10 @@ def main(): # Prepare data loader train_examples = processor.get_train_examples(args.data_dir) - cached_train_features_file = args.data_dir + '_{0}_{1}_{2}'.format( + cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), - str(task_name)) + str(task_name))) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) @@ -311,7 +311,7 @@ def main(): input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes - logits = model(input_ids, segment_ids, input_mask, labels=None) + logits = model(input_ids, segment_ids, input_mask) if output_mode == "classification": loss_fct = CrossEntropyLoss() @@ -380,6 +380,22 @@ def main(): ### Evaluation if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) + cached_train_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( + list(filter(None, args.bert_model.split('/'))).pop(), + str(args.max_seq_length), + str(task_name))) + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) + except: + train_features = convert_examples_to_features( 
+ train_examples, label_list, args.max_seq_length, tokenizer, output_mode) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving train features into cached file %s", cached_train_features_file) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + + eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") @@ -414,7 +430,7 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, segment_ids, input_mask, labels=None) + logits = model(input_ids, segment_ids, input_mask) # create eval loss and other metric required by the task if output_mode == "classification": From aad3a54e9ce1bdc1bcb9309e3ebdea03dbeee588 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:48:04 +0200 Subject: [PATCH 120/144] fix paths --- examples/run_classifier.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 0add05113fe6c7..d945b0dfc84d39 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -380,24 +380,22 @@ def main(): ### Evaluation if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) - cached_train_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( + cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name))) try: - with open(cached_train_features_file, "rb") as reader: + with open(cached_eval_features_file, "rb") as reader: train_features = pickle.load(reader) except: - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_features = convert_examples_to_features( + eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving train features into cached file %s", cached_train_features_file) - with open(cached_train_features_file, "wb") as writer: - pickle.dump(train_features, writer) + logger.info(" Saving eval features into cached file %s", cached_eval_features_file) + with open(cached_eval_features_file, "wb") as writer: + pickle.dump(eval_features, writer) - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) From 3e847449adaeaab6963462c32539f5f17daecf6f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:53:31 +0200 Subject: [PATCH 121/144] fix out_label_ids --- examples/run_classifier.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index d945b0dfc84d39..b1749ec8be3f83 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -420,6 +420,7 @@ def main(): eval_loss = 0 nb_eval_steps = 0 preds = [] + out_label_ids = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -442,9 +443,12 @@ def main(): nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) + 
out_label_ids.append(label_ids.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) + out_label_ids[0] = np.append( + out_label_ids[0], label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] @@ -452,7 +456,7 @@ def main(): preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, all_label_ids.numpy()) + result = compute_metrics(task_name, preds, out_label_ids.numpy()) if args.local_rank != -1: # Average over distributed nodes if needed @@ -501,6 +505,7 @@ def main(): eval_loss = 0 nb_eval_steps = 0 preds = [] + out_label_ids = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -518,14 +523,18 @@ def main(): nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) + out_label_ids.append(label_ids.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) + out_label_ids[0] = np.append( + out_label_ids[0], label_ids.detach().cpu().numpy(), axis=0) + eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) - result = compute_metrics(task_name, preds, all_label_ids.numpy()) + result = compute_metrics(task_name, preds, out_label_ids.numpy()) if args.local_rank != -1: # Average over distributed nodes if needed From 8bd9118294985377fe6fc146abcc908b789cf11c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:54:41 +0200 Subject: [PATCH 122/144] quick fix --- examples/run_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index b1749ec8be3f83..90f9e08ebc91c0 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -386,7 +386,7 @@ def main(): str(task_name))) try: with open(cached_eval_features_file, "rb") as reader: - train_features = pickle.load(reader) + eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) From f55b60b9ee9bc8f7f8ecf04f5d53d0417fbce3d8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:56:52 +0200 Subject: [PATCH 123/144] fixing again --- examples/run_classifier.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 90f9e08ebc91c0..2667b82d7273cd 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -420,7 +420,7 @@ def main(): eval_loss = 0 nb_eval_steps = 0 preds = [] - out_label_ids = [] + out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -443,12 +443,12 @@ def main(): nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) - out_label_ids.append(label_ids.detach().cpu().numpy()) + out_label_ids = label_ids.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) - out_label_ids[0] = np.append( - out_label_ids[0], label_ids.detach().cpu().numpy(), axis=0) + out_label_ids = np.append( + out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] @@ -505,7 +505,7 @@ def main(): eval_loss = 0 nb_eval_steps = 0 preds = [] - out_label_ids = [] + out_label_ids = 
None for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -523,13 +523,12 @@ def main(): nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) - out_label_ids.append(label_ids.detach().cpu().numpy()) + out_label_ids = label_ids.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) - out_label_ids[0] = np.append( - out_label_ids[0], label_ids.detach().cpu().numpy(), axis=0) - + out_label_ids = np.append( + out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] From 4e6edc3274a4aa2dbd86dbe58b8567bf9e40870f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 16:57:15 +0200 Subject: [PATCH 124/144] hop --- examples/run_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 2667b82d7273cd..b2acffe0c35a70 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -443,7 +443,7 @@ def main(): nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) - out_label_ids = label_ids.detach().cpu().numpy()) + out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) @@ -523,7 +523,7 @@ def main(): nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) - out_label_ids = label_ids.detach().cpu().numpy()) + out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) From 92e0ad5aba669a13d37a73dc9efd5faa98ef88ec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 17:00:52 +0200 Subject: [PATCH 125/144] no numpy --- examples/run_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index b2acffe0c35a70..63cc2b4b9ce320 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -456,7 +456,7 @@ def main(): preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, out_label_ids.numpy()) + result = compute_metrics(task_name, preds, out_label_ids) if args.local_rank != -1: # Average over distributed nodes if needed @@ -533,7 +533,7 @@ def main(): eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) - result = compute_metrics(task_name, preds, out_label_ids.numpy()) + result = compute_metrics(task_name, preds, out_label_ids) if args.local_rank != -1: # Average over distributed nodes if needed From 16a1f338c4b0f90e3476a777b06c438e21b87b37 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 17:06:31 +0200 Subject: [PATCH 126/144] fixing --- examples/run_classifier.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 63cc2b4b9ce320..7c00e4833da2d6 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -222,6 +222,10 @@ def main(): elif n_gpu > 1: model = torch.nn.DataParallel(model) + global_step = 0 + nb_tr_steps = 0 + tr_loss = 0 + if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() @@ -293,10 +297,6 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - nb_tr_steps = 0 - tr_loss = 0 - 
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) From 7d2001aa44ac2ac9410d75d71845dbbd87f910e2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 22:13:30 +0200 Subject: [PATCH 127/144] overwrite_output_dir --- examples/run_classifier.py | 5 ++++- examples/run_squad.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 7c00e4833da2d6..c3a16f593d0f12 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -127,6 +127,9 @@ def main(): parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") + parser.add_argument('--overwrite_output_dir', + action='store_true', + help="Overwrite the content of the output directory") parser.add_argument("--local_rank", type=int, default=-1, @@ -191,7 +194,7 @@ def main(): if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) diff --git a/examples/run_squad.py b/examples/run_squad.py index 32e20f9c9457d4..f20dd9d3564f01 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -111,6 +111,9 @@ def main(): parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--overwrite_output_dir', + action='store_true', + help="Overwrite the content of the output directory") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" @@ -175,7 +178,7 @@ def main(): raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) From 29b7b30eaae52d97d4391353c562d8008f036b87 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 22:20:21 +0200 Subject: [PATCH 128/144] updating evaluation on a single gpu --- examples/run_classifier.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index c3a16f593d0f12..27a17d7e3104a8 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -306,10 +306,10 @@ def main(): logger.info(" Num steps = %d", num_train_optimization_steps) model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch"): + for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch @@ -367,21 +367,13 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + else: + model = BertForQuestionAnswering.from_pretrained(args.bert_model) - # Distributed/fp16/parallel settings (optional) - model.to(device) - if args.fp16: - model.half() - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, - device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) + model.to(device) ### Evaluation - if args.do_eval: + if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), From 3359955622a10b46e0360e5040eeb7e3725eecfc Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 22:23:10 +0200 Subject: [PATCH 129/144] updating run_classif --- examples/run_classifier.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 27a17d7e3104a8..47cf43e17cfb6c 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -50,15 +50,6 @@ logger = logging.getLogger(__name__) -def average_distributed_scalar(scalar, args): - """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. 
""" - if args.local_rank == -1: - return scalar - scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size() - torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) - return scalar_t.item() - - def main(): parser = argparse.ArgumentParser() @@ -368,7 +359,7 @@ def main(): model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: - model = BertForQuestionAnswering.from_pretrained(args.bert_model) + model = BertForSequenceClassification.from_pretrained(args.bert_model) model.to(device) @@ -453,10 +444,6 @@ def main(): preds = np.squeeze(preds) result = compute_metrics(task_name, preds, out_label_ids) - if args.local_rank != -1: - # Average over distributed nodes if needed - result = {key: average_distributed_scalar(value, args) for key, value in result.items()} - loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss @@ -510,10 +497,10 @@ def main(): with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) - + loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) - + eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: @@ -530,10 +517,6 @@ def main(): preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, out_label_ids) - if args.local_rank != -1: - # Average over distributed nodes if needed - result = {key: average_distributed_scalar(value, args) for key, value in result.items()} - loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss From 4d8c4337ae384262b48eed646f8586704e1bc530 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 22:41:28 +0200 Subject: [PATCH 130/144] test barrier in distrib training --- README.md | 23 ++++++++--------------- examples/run_classifier.py | 13 +++++++++++-- pytorch_pretrained_bert/modeling.py | 4 ++++ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7d7fea41824583..b0a155f140bfa5 100644 --- a/README.md +++ b/README.md @@ -1272,27 +1272,20 @@ python run_classifier.py \ ``` **Distributed training** -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 93 on SQuAD: +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC: ```bash -python -m torch.distributed.launch --nproc_per_node=8 \ - run_classifier.py \ - --bert_model bert-large-cased-whole-word-masking \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --max_seq_length 128 \ - --train_batch_size 64 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ +python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name MRPC --do_train --do_eval --do_lower_case --data_dir $GLUE_DIR/MRPC/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir /tmp/mrpc_output/ ``` Training with these hyper-parameters gave us the following results: ```bash -{"exact_match": 86.91579943235573, "f1": 93.1532499015869} + acc = 0.8823529411764706 + acc_and_f1 = 0.901702786377709 + eval_loss = 0.3418912578906332 + f1 = 0.9210526315789473 + global_step = 174 + loss = 
0.07231863956341798 ``` #### SQuAD diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 47cf43e17cfb6c..123efb9147d9cc 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -50,6 +50,12 @@ logger = logging.getLogger(__name__) +def barrier(): + t = torch.randn((), device='cuda') + torch.distributed.all_reduce(t) + torch.cuda.synchronize() + + def main(): parser = argparse.ArgumentParser() @@ -201,10 +207,13 @@ def main(): label_list = processor.get_labels() num_labels = len(label_list) + if args.local_rank not in [-1, 0]: + barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - # Prepare model model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) + if args.local_rank == 0: + barrier() + if args.fp16: model.half() model.to(device) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 50742406852e95..4dfffb8e432840 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -44,6 +44,10 @@ 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-uncased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-finetuned-mrpc-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-mrpc-pytorch_model.bin", } PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", From f7e2ac01ea4043cb967fe75789f8e4936324fa50 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 18 Jun 2019 22:43:35 +0200 Subject: [PATCH 131/144] update barrier --- examples/run_classifier.py | 10 ++-------- examples/run_squad.py | 6 ++++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 123efb9147d9cc..e708671e421912 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -50,12 +50,6 @@ logger = logging.getLogger(__name__) -def barrier(): - t = torch.randn((), device='cuda') - torch.distributed.all_reduce(t) - torch.cuda.synchronize() - - def main(): parser = argparse.ArgumentParser() @@ -208,11 +202,11 @@ def main(): num_labels = len(label_list) if args.local_rank not in [-1, 0]: - barrier() # Make sure only the first process in distributed training will download model & vocab + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, 
do_lower_case=args.do_lower_case) model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) if args.local_rank == 0: - barrier() + torch.distributed.barrier() if args.fp16: model.half() diff --git a/examples/run_squad.py b/examples/run_squad.py index f20dd9d3564f01..0d0f52e7605e57 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -183,10 +183,12 @@ def main(): if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model) + if args.local_rank == 0: + torch.distributed.barrier() if args.fp16: model.half() From 68ab9599ce3aefbd25d1c81e3315d1968849b628 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 09:38:38 +0200 Subject: [PATCH 132/144] small fix and updates to readme --- README.md | 24 ++++++++++++++++++---- examples/bertology.py | 31 +++++++++++++++++++---------- examples/run_classifier.py | 6 +++++- examples/run_squad.py | 4 ++++ pytorch_pretrained_bert/modeling.py | 6 ++++-- 5 files changed, 53 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index b0a155f140bfa5..a48f8e3cf51a34 100644 --- a/README.md +++ b/README.md @@ -1322,12 +1322,14 @@ python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/pre {"f1": 88.52381567990474, "exact_match": 81.22043519394512} ``` -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 93 on SQuAD: +**distributed training** + +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: ```bash python -m torch.distributed.launch --nproc_per_node=8 \ run_squad.py \ - --bert_model bert-large-cased-whole-word-masking \ + --bert_model bert-large-uncased-whole-word-masking \ --do_train \ --do_predict \ --do_lower_case \ @@ -1337,17 +1339,31 @@ python -m torch.distributed.launch --nproc_per_node=8 \ --num_train_epochs 2 \ --max_seq_length 384 \ --doc_stride 128 \ - --output_dir ../models/train_squad_large_cased_wwm/ \ + --output_dir ../models/wwm_uncased_finetuned_squad/ \ --train_batch_size 24 \ --gradient_accumulation_steps 12 ``` Training with these hyper-parameters gave us the following results: ```bash -python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/train_squad_large_cased_wwm/predictions.json +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json {"exact_match": 86.91579943235573, "f1": 93.1532499015869} ``` +This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`. 
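+
+A fine-tuned checkpoint like this can then be loaded directly by its shortcut name. A minimal usage sketch (inference setup only, without the SQuAD pre/post-processing):
+
+```python
+from pytorch_pretrained_bert import BertForQuestionAnswering, BertTokenizer
+
+# Load the uncased whole-word-masking model fine-tuned on SQuAD
+tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', do_lower_case=True)
+model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+model.eval()
+```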
+ +And here is the model provided as `bert-large-cased-whole-word-masking-finetuned-squad`: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 run_squad.py --bert_model bert-large-cased-whole-word-masking --do_train --do_predict --do_lower_case --train_file $SQUAD_DIR/train-v1.1.json --predict_file $SQUAD_DIR/dev-v1.1.json --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir ../models/wwm_cased_finetuned_squad/ --train_batch_size 24 --gradient_accumulation_steps 12 +``` + +Training with these hyper-parameters gave us the following results: +```bash +python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json +{"exact_match": 84.18164616840113, "f1": 91.58645594850135} +``` + #### SWAG The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf) diff --git a/examples/bertology.py b/examples/bertology.py index 7db2f9e51e4f2b..b7e73e30d4f8da 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -8,7 +8,7 @@ import torch.nn.functional as F import numpy as np -from pytorch_pretrained_bert import BertModel, BertTokenizer +from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', @@ -17,24 +17,33 @@ def run_model(): parser = argparse.ArgumentParser() - parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased', - help='pretrained model name or path to local checkpoint') + parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased', help='pretrained model name or path to local checkpoint') parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") args = parser.parse_args() - print(args) - - if args.batch_size == -1: - args.batch_size = 1 - assert args.nsamples % args.batch_size == 0 np.random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if args.local_rank == -1 or args.no_cuda: + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + args.device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + + logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + args.device, n_gpu, bool(args.local_rank != -1), args.fp16)) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) - model = BertModel.from_pretrained(args.model_name_or_path) - model.to(device) + model = BertForSequenceClassification.from_pretrained(args.model_name_or_path) + model.to(args.device) model.eval() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index e708671e421912..eda96f81e316d2 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -187,7 +187,7 @@ def main(): if os.path.exists(args.output_dir) and 
os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir): + if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) task_name = args.task_name.lower() @@ -361,6 +361,10 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # Good practice: save your training arguments together with the trained model + output_args_file = os.path.join(args.output_dir, 'training_args.bin') + torch.save(args, output_args_file) else: model = BertForSequenceClassification.from_pretrained(args.bert_model) diff --git a/examples/run_squad.py b/examples/run_squad.py index 0d0f52e7605e57..bf1763e884a463 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -331,6 +331,10 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # Good practice: save your training arguments together with the trained model + output_args_file = os.path.join(args.output_dir, 'training_args.bin') + torch.save(args, output_args_file) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 4dfffb8e432840..d7493f07ca5d00 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -46,8 +46,7 @@ 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-uncased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-finetuned-mrpc-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-mrpc-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", } PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", @@ -60,6 +59,9 @@ 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", } BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' From dc8e0019b7feacd546236dc3361efd05f28b9137 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 13:23:20 +0200 Subject: [PATCH 133/144] updating examples --- README.md | 23 +++ examples/bertology.py | 202 +++++++++++++++++++++--- examples/run_classifier.py | 2 +- pytorch_pretrained_bert/modeling.py | 21 --- pytorch_pretrained_bert/tokenization.py | 6 + 5 files changed, 212 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index a48f8e3cf51a34..287cc207e18e8d 100644 --- a/README.md +++ b/README.md @@ -1288,6 +1288,29 @@ Training with these hyper-parameters gave us the following results: loss = 0.07231863956341798 ``` +Here is an example on MNLI: + +```bash +python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --do_lower_case --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir +``` + +```bash +***** Eval results ***** + acc = 0.8679706601466992 + eval_loss = 0.4911287787382479 + global_step = 18408 + loss = 0.04755385363816904 + +***** Eval results ***** + acc = 0.8747965825874695 + eval_loss = 0.45516540421714036 + global_step = 18408 + loss = 0.04755385363816904 +``` + +This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model + + #### SQuAD This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. 
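The `bertology.py` changes below compute two per-head diagnostics over the dev set: the entropy of each head's attention distribution, and a gradient-based head importance score following http://arxiv.org/abs/1905.10650. A minimal sketch of the two quantities (tensor names here are illustrative, not the script's own variables):

```python
import torch

def attention_entropy(attn_probs):
    # attn_probs: [batch, heads, seq_len, seq_len]; each row is a distribution over positions
    plogp = attn_probs * torch.log(attn_probs)
    plogp[attn_probs == 0] = 0                      # convention: 0 * log(0) = 0
    return -plogp.sum(dim=-1)                       # entropy per (batch, head, query position)

def head_importance(head_output, head_output_grad):
    # Importance of a head = |gradient . output|, accumulated over positions and batch
    # (see http://arxiv.org/abs/1905.10650); inputs are [batch, heads, seq_len, head_dim]
    dot = torch.einsum("bhli,bhli->bhl", [head_output_grad, head_output])
    return dot.abs().sum(-1).sum(0)                 # one score per head
```

Heads with low importance are the candidates for the masking and pruning passes added further below.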
diff --git a/examples/bertology.py b/examples/bertology.py index b7e73e30d4f8da..f7aa4b99708720 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -1,32 +1,108 @@ #!/usr/bin/env python3 - +import os import argparse import logging -from tqdm import trange +from tqdm import tqdm -import torch -import torch.nn.functional as F import numpy as np +import torch +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset +from torch.utils.data.distributed import DistributedSampler +from torch.nn import CrossEntropyLoss, MSELoss + from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics + + logger = logging.getLogger(__name__) + +def entropy(p): + plogp = p * torch.log(p) + plogp[p == 0] = 0 + return -plogp.sum(dim=-1) + +def print_1d_tensor(tensor, prefix=""): + if tensor.dtype != torch.long: + logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data)) + else: + logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data)) + +def print_2d_tensor(tensor): + logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor)))) + for row in range(len(tensor)): + print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t") + +def compute_heads_importance(args, model, eval_dataloader): + """ Example on how to use model outputs to compute: + - head attention entropy (activated by setting output_attentions=True when we created the model + - head importance scores according to http://arxiv.org/abs/1905.10650 + (activated by setting keep_multihead_output=True when we created the model) + """ + for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): + batch = tuple(t.to(args.device) for t in batch) + input_ids, input_mask, segment_ids, label_ids = batch + + # Do a forward pass + all_attentions, logits = model(input_ids, segment_ids, input_mask) + + # Update head attention entropy + for layer, attn in enumerate(all_attentions): + masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1) + attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach() + + # Update head importance scores with regards to our loss + # First backpropagate to populate the gradients + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), label_ids.view(-1)) + loss.backward() + # Second compute importance scores according to http://arxiv.org/abs/1905.10650 + multihead_outputs = model.bert.get_multihead_outputs() + for layer, mh_layer_output in enumerate(multihead_outputs): + dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output]) + head_importance[layer] += dot.abs().sum(-1).sum(0).detach() + + tot_tokens += input_mask.float().detach().sum().data + + # Normalize + attn_entropy /= tot_tokens + head_importance /= tot_tokens + if args.normalize_importance: + head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) + + return attn_entropy, head_importance + def run_model(): parser = argparse.ArgumentParser() - parser.add_argument('--model_name_or_path', type=str, 
default='bert-base-uncased', help='pretrained model name or path to local checkpoint') + parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint') + parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.") + parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") + parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory") + + parser.add_argument("--normalize_importance", action='store_true', help="Whether to normalize importance score between 0 and 1") + + parser.add_argument("--try_pruning", action='store_true', help="Whether to try to prune head until a threshold of accuracy.") + parser.add_argument("--pruning_threshold", default=0.9, type=float, help="Pruning threshold of accuracy.") + + parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") + parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") args = parser.parse_args() - np.random.seed(args.seed) - torch.random.manual_seed(args.seed) - torch.cuda.manual_seed(args.seed) - + # Setup devices and distributed training if args.local_rank == -1 or args.no_cuda: args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() @@ -34,21 +110,107 @@ def run_model(): torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) n_gpu = 1 - # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend + # Setup logging logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - args.device, n_gpu, bool(args.local_rank != -1), args.fp16)) + logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1))) + # Set seeds + np.random.seed(args.seed) + torch.random.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed(args.seed) + + # Prepare GLUE task + task_name = args.task_name.lower() + processor = processors[task_name]() + output_mode = output_modes[task_name] + label_list = processor.get_labels() + num_labels = len(label_list) + + # Prepare output directory + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + if 
not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: + os.makedirs(args.output_dir) + + # Load model & tokenizer + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only one distributed process download model & vocab tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) - model = BertForSequenceClassification.from_pretrained(args.model_name_or_path) + + # Load a model with all BERTology options on: + # output_attentions => will output attention weights + # keep_multihead_output => will store gradient of attention head outputs for head importance computation + # see: http://arxiv.org/abs/1905.10650 + model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, + num_labels=num_labels, + output_attentions=True, + keep_multihead_output=True) + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only one distributed process download model & vocab model.to(args.device) + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) model.eval() - + # Prepare dataset for the GLUE task + eval_examples = processor.get_dev_examples(args.data_dir) + cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( + list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name))) + try: + eval_features = torch.load(cached_eval_features_file) + except: + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + if args.local_rank in [-1, 0]: + logger.info("Saving eval features to cache file %s", cached_eval_features_file) + torch.save(eval_features, cached_eval_features_file) -if __name__ == '__main__': - run_model() + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if output_mode == "classification" else torch.float) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + + if args.data_subset > 0: + eval_data = Subset(eval_data, list(range(args.data_subset))) + + eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + + # Print/save training arguments + print(args) + torch.save(args, os.path.join(args.output_dir, 'run_args.bin')) + + # To showcase some BERTology methods, we will compute: + # - the average entropy of each head over the dev set + # - the importance score of each head over the dev set as explained in http://arxiv.org/abs/1905.10650 + n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads + head_importance = torch.zeros(n_layers, n_heads).to(args.device) + attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) + tot_tokens = 0.0 + # Compute head entropy and importance score + attn_entropy, head_importance = compute_heads_importance(args, model, eval_dataloader) + # Print/save matrices + np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy) + np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance) + + 
logger.info("Attention entropies") + print_2d_tensor(attn_entropy) + logger.info("Head importance scores") + print_2d_tensor(head_importance) + logger.info("Head ranked by importance scores") + head_ranks = torch.zeros(n_layers * n_heads, dtype=torch.long, device=args.device) + head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel()) + print_2d_tensor(head_ranks.view_as(head_importance)) + + # Do pruning if we want to + if args.try_pruning and args.pruning_threshold > 0.0 and args.pruning_threshold < 1.0: + + + +if __name__ == '__main__': + run_model() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index eda96f81e316d2..0885d7b1453df0 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -366,7 +366,7 @@ def main(): output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: - model = BertForSequenceClassification.from_pretrained(args.bert_model) + model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) model.to(device) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index d7493f07ca5d00..f5156d7d956801 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -707,36 +707,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): archive_file, resolved_archive_file)) logger.info("loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) - ### Switching to split config/weight files configuration - # tempdir = None - # if os.path.isdir(resolved_archive_file) or from_tf: - # serialization_dir = resolved_archive_file - # else: - # # Extract archive to temp dir - # tempdir = tempfile.mkdtemp() - # logger.info("extracting archive file {} to temp dir {}".format( - # resolved_archive_file, tempdir)) - # with tarfile.open(resolved_archive_file, 'r:gz') as archive: - # archive.extractall(tempdir) - # serialization_dir = tempdir - # config_file = os.path.join(serialization_dir, CONFIG_NAME) - # if not os.path.exists(config_file): - # # Backward compatibility with old naming format - # config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME) # Load config config = BertConfig.from_json_file(resolved_config_file) logger.info("Model config {}".format(config)) # Instantiate model. 
model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: - # weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load(resolved_archive_file, map_location='cpu') - # if tempdir: - # # Clean up temp dir - # shutil.rmtree(tempdir) if from_tf: # Directly load from a TensorFlow checkpoint - # weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) return load_tf_weights_in_bert(model, weights_path) # Load from a PyTorch state_dict old_keys = [] diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 1aa4c01bde4ec3..d37165d888135d 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -37,6 +37,9 @@ 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 'bert-base-uncased': 512, @@ -49,6 +52,9 @@ 'bert-base-german-cased': 512, 'bert-large-uncased-whole-word-masking': 512, 'bert-large-cased-whole-word-masking': 512, + 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, + 'bert-large-cased-whole-word-masking-finetuned-squad': 512, + 'bert-base-cased-finetuned-mrpc': 512, } VOCAB_NAME = 'vocab.txt' From 34d706a0e18b42927749b78e41347dcfadc27458 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 15:25:49 +0200 Subject: [PATCH 134/144] pruning in bertology --- README.md | 6 +- examples/bertology.py | 161 +++++++++++++++------ examples/run_classifier.py | 6 +- pytorch_pretrained_bert/modeling.py | 8 +- pytorch_pretrained_bert/modeling_gpt2.py | 5 +- pytorch_pretrained_bert/modeling_openai.py | 5 +- tests/modeling_gpt2_test.py | 6 +- tests/modeling_openai_test.py | 6 +- tests/modeling_test.py | 6 +- 9 files changed, 140 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 287cc207e18e8d..5d5148d529565a 100644 --- a/README.md +++ b/README.md @@ -724,7 +724,7 @@ We detail them here. This model takes as *inputs*: - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. - `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. 
-- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked. This model *outputs* a tuple composed of: @@ -859,7 +859,7 @@ We detail them here. This model takes as *inputs*: - `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids You can use it to add a third type of embedding to each input token in the sequence (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. -- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked. This model *outputs*: - `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) @@ -960,7 +960,7 @@ We detail them here. This model takes as *inputs*: You can use it to add a third type of embedding to each input token in the sequence (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. - `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the `presents` output of the model, cf. below). -- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. +- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked. This model *outputs*: - `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... 
d_n are the dimension of input_ids) diff --git a/examples/bertology.py b/examples/bertology.py index f7aa4b99708720..888d95e5c64007 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -2,6 +2,7 @@ import os import argparse import logging +from datetime import timedelta, datetime from tqdm import tqdm import numpy as np @@ -35,38 +36,56 @@ def print_2d_tensor(tensor): for row in range(len(tensor)): print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t") -def compute_heads_importance(args, model, eval_dataloader): +def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None): """ Example on how to use model outputs to compute: - head attention entropy (activated by setting output_attentions=True when we created the model - head importance scores according to http://arxiv.org/abs/1905.10650 (activated by setting keep_multihead_output=True when we created the model) """ + # Prepare our tensors + n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads + head_importance = torch.zeros(n_layers, n_heads).to(args.device) + attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) + preds = None + labels = None + tot_tokens = 0.0 + for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch - # Do a forward pass - all_attentions, logits = model(input_ids, segment_ids, input_mask) - - # Update head attention entropy - for layer, attn in enumerate(all_attentions): - masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1) - attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach() - - # Update head importance scores with regards to our loss - # First backpropagate to populate the gradients - if output_mode == "classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) - elif output_mode == "regression": - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), label_ids.view(-1)) - loss.backward() - # Second compute importance scores according to http://arxiv.org/abs/1905.10650 - multihead_outputs = model.bert.get_multihead_outputs() - for layer, mh_layer_output in enumerate(multihead_outputs): - dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output]) - head_importance[layer] += dot.abs().sum(-1).sum(0).detach() + # Do a forward pass (not in torch.no_grad() since we need gradients for importance score - see below) + all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask) + + if compute_entropy: + # Update head attention entropy + for layer, attn in enumerate(all_attentions): + masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1) + attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach() + + if compute_importance: + # Update head importance scores with regards to our loss + # First, backpropagate to populate the gradients + if args.output_mode == "classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1)) + elif args.output_mode == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), label_ids.view(-1)) + loss.backward() + # Second, compute importance scores according to http://arxiv.org/abs/1905.10650 + multihead_outputs = model.bert.get_multihead_outputs() + for 
layer, mh_layer_output in enumerate(multihead_outputs): + dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output]) + head_importance[layer] += dot.abs().sum(-1).sum(0).detach() + + # Also store our logits/labels if we want to compute metrics afterwards + if preds is None: + preds = logits.detach().cpu().numpy() + labels = label_ids.detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0) tot_tokens += input_mask.float().detach().sum().data @@ -76,7 +95,7 @@ def compute_heads_importance(args, model, eval_dataloader): if args.normalize_importance: head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) - return attn_entropy, head_importance + return attn_entropy, head_importance, preds, labels def run_model(): parser = argparse.ArgumentParser() @@ -89,8 +108,11 @@ def run_model(): parser.add_argument("--normalize_importance", action='store_true', help="Whether to normalize importance score between 0 and 1") - parser.add_argument("--try_pruning", action='store_true', help="Whether to try to prune head until a threshold of accuracy.") - parser.add_argument("--pruning_threshold", default=0.9, type=float, help="Pruning threshold of accuracy.") + parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.") + parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics" + "(stop masking when metric < threshold * original metric value).") + parser.add_argument("--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.") + parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, and sequences shorter \n" @@ -125,9 +147,9 @@ def run_model(): # Prepare GLUE task task_name = args.task_name.lower() processor = processors[task_name]() - output_mode = output_modes[task_name] label_list = processor.get_labels() - num_labels = len(label_list) + args.output_mode = output_modes[task_name] + args.num_labels = len(label_list) # Prepare output directory if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: @@ -145,7 +167,7 @@ def run_model(): # keep_multihead_output => will store gradient of attention head outputs for head importance computation # see: http://arxiv.org/abs/1905.10650 model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, - num_labels=num_labels, + num_labels=args.num_labels, output_attentions=True, keep_multihead_output=True) if args.local_rank == 0: @@ -162,7 +184,7 @@ def run_model(): try: eval_features = torch.load(cached_eval_features_file) except: - eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode) if args.local_rank in [-1, 0]: logger.info("Saving eval features to cache file %s", cached_eval_features_file) torch.save(eval_features, cached_eval_features_file) @@ -170,7 +192,7 @@ def run_model(): all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if output_mode == "classification" else torch.float) + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == "classification" else torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.data_subset > 0: @@ -183,16 +205,8 @@ def run_model(): print(args) torch.save(args, os.path.join(args.output_dir, 'run_args.bin')) - # To showcase some BERTology methods, we will compute: - # - the average entropy of each head over the dev set - # - the importance score of each head over the dev set as explained in http://arxiv.org/abs/1905.10650 - n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads - head_importance = torch.zeros(n_layers, n_heads).to(args.device) - attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) - tot_tokens = 0.0 - # Compute head entropy and importance score - attn_entropy, head_importance = compute_heads_importance(args, model, eval_dataloader) + attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader) # Print/save matrices np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy) @@ -203,14 +217,67 @@ def run_model(): logger.info("Head importance scores") print_2d_tensor(head_importance) logger.info("Head ranked by importance scores") - head_ranks = torch.zeros(n_layers * n_heads, dtype=torch.long, device=args.device) + head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel()) - print_2d_tensor(head_ranks.view_as(head_importance)) - - # Do pruning if we want to - if 
args.try_pruning and args.pruning_threshold > 0.0 and args.pruning_threshold < 1.0: - - + head_ranks = head_ranks.view_as(head_importance) + print_2d_tensor(head_ranks) + + # Do masking if we want to + if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: + _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + original_score = compute_metrics(task_name, preds, labels)[args.metric_name] + logger.info("Pruning: original score: %f", original_score) + + new_head_mask = torch.ones_like(head_importance) + num_to_mask = int(new_head_mask.numel() * args.masking_amount) + + current_score = original_score + while current_score >= original_score * args.masking_threshold: + head_mask = new_head_mask + # heads from most important to least + heads_to_mask = head_importance.view(-1).sort(descending=True)[1] + # keep only not-masked heads + heads_to_mask = heads_to_mask[head_mask.view(-1).nonzero()][:, 0] + + if len(heads_to_mask) <= num_to_mask: + break + + # mask heads + heads_to_mask = heads_to_mask[-num_to_mask:] + new_head_mask = head_mask.view(-1) + new_head_mask[heads_to_mask] = 0.0 + new_head_mask = new_head_mask.view_as(head_importance) + print_2d_tensor(new_head_mask) + + # Compute metric and head importance again + _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + current_score = compute_metrics(task_name, preds, labels)[args.metric_name] + logger.info("Masking: current score: %f, remaning heads %.1f percents", current_score, head_mask.sum()/head_mask.numel() * 100) + + # Try pruning and test time speedup + # Pruning is like masking but we actually remove the masked weights + before_time = datetime.now() + _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, + compute_entropy=False, compute_importance=False, head_mask=head_mask) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + score_masking = compute_metrics(task_name, preds, labels)[args.metric_name] + original_time = datetime.now() - before_time + + heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask))) + assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item() + model.bert.prune_heads(heads_to_prune) + + before_time = datetime.now() + _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, + compute_entropy=False, compute_importance=False, head_mask=None) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name] + new_time = datetime.now() - before_time + + logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) + logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) if __name__ == '__main__': run_model() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 0885d7b1453df0..5a359ad2622712 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -308,7 +308,7 @@ def main(): input_ids, input_mask, segment_ids, label_ids = batch 
# define a new function to compute loss values for both output_modes - logits = model(input_ids, segment_ids, input_mask) + logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) if output_mode == "classification": loss_fct = CrossEntropyLoss() @@ -422,7 +422,7 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, segment_ids, input_mask) + logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) # create eval loss and other metric required by the task if output_mode == "classification": @@ -503,7 +503,7 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, segment_ids, input_mask, labels=None) + logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index f5156d7d956801..409956a141381b 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -408,6 +408,8 @@ def __init__(self, config, output_attentions=False, keep_multihead_output=False) self.output = BertSelfOutput(config) def prune_heads(self, heads): + if len(heads) == 0: + return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) for head in heads: mask[head] = 0 @@ -858,9 +860,10 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed - # 1.0 in head_mask indicate we mask the head + # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N - # head_mask has shape num_hidden_layers x batch x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) @@ -868,7 +871,6 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_al elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility - head_mask = (1.0 - head_mask) else: head_mask = [None] * self.config.num_hidden_layers diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index dd195fc8806bb1..0a51af811893a4 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -264,6 +264,8 @@ def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep self.resid_dropout = nn.Dropout(config.resid_pdrop) def prune_heads(self, heads): + if len(heads) == 0: + return mask = torch.ones(self.n_head, self.split_size // self.n_head) for head in heads: mask[head] = 0 @@ -714,7 +716,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # Prepare head mask if needed - # 1.0 in head_mask indicate we mask the head + # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x 
N x N if head_mask is not None: @@ -724,7 +726,6 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility - head_mask = (1.0 - head_mask) else: head_mask = [None] * self.config.n_layer diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 91848f3c6804e2..f02f016ef1cb58 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -274,6 +274,8 @@ def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep self.resid_dropout = nn.Dropout(config.resid_pdrop) def prune_heads(self, heads): + if len(heads) == 0: + return mask = torch.ones(self.n_head, self.split_size // self.n_head) for head in heads: mask[head] = 0 @@ -710,7 +712,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # Prepare head mask if needed - # 1.0 in head_mask indicate we mask the head + # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N if head_mask is not None: @@ -720,7 +722,6 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility - head_mask = (1.0 - head_mask) else: head_mask = [None] * self.config.n_layer diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index 122b9c7913ccfa..589de22f5d93a8 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -215,9 +215,9 @@ def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_id for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel): model = model_class(config=config, keep_multihead_output=True) model.eval() - head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device) - head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer - head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer + head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device) + head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 0.0 # Mask all but the first head on the last layer if isinstance(model, GPT2DoubleHeadsModel): output = model(input_ids, mc_token_ids, head_mask=head_mask) else: diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index adb3671ef5f033..c8fc8f48fc3be9 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -188,9 +188,9 @@ def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel): model = model_class(config=config, keep_multihead_output=True) model.eval() - head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device) - head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer - head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer + head_mask 
= torch.ones(self.n_layer, self.n_head).to(input_ids.device) + head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 0.0 # Mask all but the first head on the last layer if isinstance(model, OpenAIGPTDoubleHeadsModel): output = model(input_ids, mc_token_ids, head_mask=head_mask) else: diff --git a/tests/modeling_test.py b/tests/modeling_test.py index 80bb3d3c95688e..126c6fad13a32e 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -305,9 +305,9 @@ def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_id else: model = model_class(config=config, keep_multihead_output=True) model.eval() - head_mask = torch.zeros(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device) - head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer - head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer + head_mask = torch.ones(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device) + head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer + head_mask[-1, 1:] = 0.0 # Mask all but the first head on the last layer output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask) if isinstance(model, BertModel): From 14f0e8e55734456b71ad8b3c7d94e9d006d7fb8d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 15:29:28 +0200 Subject: [PATCH 135/144] fix cuda --- examples/bertology.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/bertology.py b/examples/bertology.py index 888d95e5c64007..6997b9e26dedcd 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -209,8 +209,8 @@ def run_model(): attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader) # Print/save matrices - np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy) - np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance) + np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy()) logger.info("Attention entropies") print_2d_tensor(attn_entropy) From 909d4f1af24d354a6e1a45ca6cf58d30fc0fcc07 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 15:32:10 +0200 Subject: [PATCH 136/144] cuda again --- examples/bertology.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/bertology.py b/examples/bertology.py index 6997b9e26dedcd..694328162a4447 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -218,7 +218,7 @@ def run_model(): print_2d_tensor(head_importance) logger.info("Head ranked by importance scores") head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) - head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel()) + head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device) head_ranks = head_ranks.view_as(head_importance) print_2d_tensor(head_ranks) From 0e1e8128bf98d1154712d5725b8c44090821d26c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 15:35:49 +0200 Subject: [PATCH 137/144] more logging --- examples/bertology.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/bertology.py b/examples/bertology.py index 694328162a4447..af7e661a5282f3 100644 --- 
a/examples/bertology.py +++ b/examples/bertology.py @@ -227,7 +227,7 @@ def run_model(): _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) original_score = compute_metrics(task_name, preds, labels)[args.metric_name] - logger.info("Pruning: original score: %f", original_score) + logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) new_head_mask = torch.ones_like(head_importance) num_to_mask = int(new_head_mask.numel() * args.masking_amount) @@ -245,6 +245,7 @@ def run_model(): # mask heads heads_to_mask = heads_to_mask[-num_to_mask:] + logger.info("Heads to mask: %s", str(heads_to_mask.tolist())) new_head_mask = head_mask.view(-1) new_head_mask[heads_to_mask] = 0.0 new_head_mask = new_head_mask.view_as(head_importance) @@ -254,7 +255,7 @@ def run_model(): _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) current_score = compute_metrics(task_name, preds, labels)[args.metric_name] - logger.info("Masking: current score: %f, remaning heads %.1f percents", current_score, head_mask.sum()/head_mask.numel() * 100) + logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) # Try pruning and test time speedup # Pruning is like masking but we actually remove the masked weights From 0f40e8d6a63656fbdbb3a31f91c0eb7c1168d2a8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 15:38:46 +0200 Subject: [PATCH 138/144] debugger --- examples/bertology.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/bertology.py b/examples/bertology.py index af7e661a5282f3..7efdfa459a5d45 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -122,8 +122,17 @@ def run_model(): parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") + parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.wait_for_attach() + # Setup devices and distributed training if args.local_rank == -1 or args.no_cuda: args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") From e4b46d86ce0cbcbc9011375add7f3713eb5ef967 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 22:16:30 +0200 Subject: [PATCH 139/144] update head pruning --- examples/bertology.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/examples/bertology.py b/examples/bertology.py index 7efdfa459a5d45..8c3fcfc443b46a 100644 --- a/examples/bertology.py +++ 
b/examples/bertology.py @@ -92,7 +92,13 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, # Normalize attn_entropy /= tot_tokens head_importance /= tot_tokens - if args.normalize_importance: + # Layerwise importance normalization + if not args.dont_normalize_importance_by_layer: + exponent = 2 + norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent) + head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20 + + if not args.dont_normalize_global_importance: head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) return attn_entropy, head_importance, preds, labels @@ -106,7 +112,8 @@ def run_model(): parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory") - parser.add_argument("--normalize_importance", action='store_true', help="Whether to normalize importance score between 0 and 1") + parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', help="Don't normalize importance score by layers") + parser.add_argument("--dont_normalize_global_importance", action='store_true', help="Don't normalize all importance scores between 0 and 1") parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.") parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics" @@ -243,21 +250,20 @@ def run_model(): current_score = original_score while current_score >= original_score * args.masking_threshold: - head_mask = new_head_mask - # heads from most important to least - heads_to_mask = head_importance.view(-1).sort(descending=True)[1] - # keep only not-masked heads - heads_to_mask = heads_to_mask[head_mask.view(-1).nonzero()][:, 0] + head_mask = new_head_mask # save current head mask + # heads from most important to least - keep only not-masked heads + head_importance = head_importance.view(-1)[head_mask.view(-1).nonzero()][:, 0] + current_heads_to_mask = head_importance.sort()[1] - if len(heads_to_mask) <= num_to_mask: + if len(current_heads_to_mask) <= num_to_mask: break # mask heads - heads_to_mask = heads_to_mask[-num_to_mask:] - logger.info("Heads to mask: %s", str(heads_to_mask.tolist())) + current_heads_to_mask = current_heads_to_mask[:num_to_mask] + logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist())) new_head_mask = head_mask.view(-1) - new_head_mask[heads_to_mask] = 0.0 - new_head_mask = new_head_mask.view_as(head_importance) + new_head_mask[current_heads_to_mask] = 0.0 + new_head_mask = new_head_mask.view_as(head_mask) print_2d_tensor(new_head_mask) # Compute metric and head importance again From 7f00a36e27ce89dea736a2bf52cc3073cf7911ec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 22:23:12 +0200 Subject: [PATCH 140/144] pruning should keep on device --- pytorch_pretrained_bert/modeling.py | 2 +- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 409956a141381b..c0ab2dd681b487 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -80,7 +80,7 @@ def prune_linear_layer(layer, index, dim=0): b = layer.bias[index].clone().detach() new_size = 
list(layer.weight.size()) new_size[dim] = len(index) - new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) new_layer.weight.requires_grad = False new_layer.weight.copy_(W.contiguous()) new_layer.weight.requires_grad = True diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 0a51af811893a4..c17a7e7d268963 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -55,7 +55,7 @@ def prune_conv1d_layer(layer, index, dim=1): b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) - new_layer = Conv1D(new_size[1], new_size[0]) + new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) new_layer.weight.requires_grad = False new_layer.weight.copy_(W.contiguous()) new_layer.weight.requires_grad = True From 7766ce66dd788a780a54f890711ad58cb3d96f5f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 22:29:51 +0200 Subject: [PATCH 141/144] update bertology --- examples/bertology.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/bertology.py b/examples/bertology.py index 8c3fcfc443b46a..bf32e8e17479da 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -281,9 +281,11 @@ def run_model(): score_masking = compute_metrics(task_name, preds, labels)[args.metric_name] original_time = datetime.now() - before_time + original_num_params = sum(p.numel() for p in model.parameters()) heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask))) assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item() model.bert.prune_heads(heads_to_prune) + pruned_num_params = sum(p.numel() for p in model.parameters()) before_time = datetime.now() _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, @@ -292,6 +294,7 @@ def run_model(): score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name] new_time = datetime.now() - before_time + logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100) logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) From edfe91c36e3e322b2d19424f952dbc0a33021f03 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 19 Jun 2019 23:43:04 +0200 Subject: [PATCH 142/144] first version bertology ok --- examples/bertology.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/examples/bertology.py b/examples/bertology.py index bf32e8e17479da..4bb23b8f168324 100644 --- a/examples/bertology.py +++ b/examples/bertology.py @@ -25,17 +25,20 @@ def entropy(p): plogp[p == 0] = 0 return -plogp.sum(dim=-1) + def print_1d_tensor(tensor, prefix=""): if tensor.dtype != torch.long: logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data)) else: logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data)) + def print_2d_tensor(tensor): logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor)))) for row in range(len(tensor)): print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t") + def compute_heads_importance(args, model, 
eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None): """ Example on how to use model outputs to compute: - head attention entropy (activated by setting output_attentions=True when we created the model @@ -54,7 +57,7 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch - # Do a forward pass (not in torch.no_grad() since we need gradients for importance score - see below) + # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask) if compute_entropy: @@ -103,6 +106,7 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, return attn_entropy, head_importance, preds, labels + def run_model(): parser = argparse.ArgumentParser() parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint') @@ -212,7 +216,7 @@ def run_model(): eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.data_subset > 0: - eval_data = Subset(eval_data, list(range(args.data_subset))) + eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data))))) eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) @@ -246,14 +250,14 @@ def run_model(): logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) new_head_mask = torch.ones_like(head_importance) - num_to_mask = int(new_head_mask.numel() * args.masking_amount) + num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount)) current_score = original_score while current_score >= original_score * args.masking_threshold: - head_mask = new_head_mask # save current head mask - # heads from most important to least - keep only not-masked heads - head_importance = head_importance.view(-1)[head_mask.view(-1).nonzero()][:, 0] - current_heads_to_mask = head_importance.sort()[1] + head_mask = new_head_mask.clone() # save current head mask + # heads from least important to most - keep only not-masked heads + head_importance[head_mask == 0.0] = float('Inf') + current_heads_to_mask = head_importance.view(-1).sort()[1] if len(current_heads_to_mask) <= num_to_mask: break @@ -261,7 +265,7 @@ def run_model(): # mask heads current_heads_to_mask = current_heads_to_mask[:num_to_mask] logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist())) - new_head_mask = head_mask.view(-1) + new_head_mask = new_head_mask.view(-1) new_head_mask[current_heads_to_mask] = 0.0 new_head_mask = new_head_mask.view_as(head_mask) print_2d_tensor(new_head_mask) @@ -272,6 +276,10 @@ def run_model(): current_score = compute_metrics(task_name, preds, labels)[args.metric_name] logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) + logger.info("Final head mask") + print_2d_tensor(head_mask) + np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy()) + # Try pruning and test time speedup # Pruning is like masking but we actually remove the masked weights before_time = 
datetime.now() From 411981a08038590ab0ad6b73f31919b6c05c161d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 20 Jun 2019 08:54:18 +0200 Subject: [PATCH 143/144] remove slow circle-ci --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 04adf715e0efb3..3a4bae2984534b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,7 @@ jobs: - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ --runslow --cov + - run: python -m pytest -sv tests/ --cov - run: codecov build_py2: working_directory: ~/pytorch-pretrained-BERT @@ -22,7 +22,7 @@ jobs: - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install spacy ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ --runslow --cov + - run: python -m pytest -sv tests/ --cov - run: codecov workflows: version: 2 From c304593d8fa93f25febe1458c63497a846749c89 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 20 Jun 2019 10:05:06 +0200 Subject: [PATCH 144/144] BERTology details in readme --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 5d5148d529565a..2c67dc65e89140 100644 --- a/README.md +++ b/README.md @@ -1563,6 +1563,22 @@ python -m torch.distributed.launch --nproc_per_node=8 \ --gradient_accumulation_steps 2 ``` +## BERTology + +There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: + +- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341 + +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): + +- accessing all the hidden-states of BERT/GPT/GPT-2, +- accessing all the attention weights for each head of BERT/GPT/GPT-2, +- retrieving head output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650. + +To help you understand and use these features, we have added a specific example script: [`bertology.py`](./examples/bertology.py), which extracts information from and prunes heads of a model pre-trained on MRPC. + ## Notebooks We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
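The BERTology features described in the README hunk above are spread across many commits, so a minimal end-to-end sketch may help tie them together. The snippet below is illustrative only and is not part of the patch series: the `bert-base-cased-finetuned-mrpc` shortcut is borrowed from the `bertology.py` defaults, and the `output_attentions`, `keep_multihead_output`, `head_mask` and `prune_heads` arguments are assumed to behave as shown in the diffs above.

```python
# Illustrative sketch only -- not part of the patch series. Checkpoint name and
# keyword arguments are assumptions taken from the hunks above and bertology.py.
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

model_name = 'bert-base-cased-finetuned-mrpc'  # default checkpoint of bertology.py (assumed shortcut)
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=2,
    output_attentions=True,       # also return the attention weights from forward()
    keep_multihead_output=True)   # keep per-head outputs so their gradients can be inspected
model.eval()

tokens = ["[CLS]"] + tokenizer.tokenize("The cat sat on the mat.") + ["[SEP]"]
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# head_mask has shape [num_hidden_layers, num_attention_heads]; 1.0 keeps a head, 0.0 masks it.
head_mask = torch.ones(model.config.num_hidden_layers, model.config.num_attention_heads)
head_mask[0, 0] = 0.0  # arbitrarily mask the first head of the first layer

# With output_attentions=True the forward pass also returns one attention tensor per layer,
# each of shape [batch, num_heads, seq_len, seq_len].
all_attentions, logits = model(input_ids, head_mask=head_mask)
print(len(all_attentions), all_attentions[0].shape, logits.shape)

# Pruning goes further than masking: it permanently removes the heads' weights.
# The argument maps layer index -> list of head indices to remove.
model.bert.prune_heads({0: [0]})
_, pruned_logits = model(input_ids)
```

In `bertology.py` itself the mask is not chosen by hand: the script runs the evaluation set through the model, backpropagates the loss so that the gradients of the per-head outputs (kept via `keep_multihead_output=True`) can be accumulated into an importance score, iteratively zeroes the least important heads until the metric drops below `masking_threshold`, and finally converts the surviving mask into the `heads_to_prune` dictionary passed to `prune_heads`.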