
Commit

Merge pull request #2 from huggingface/master
Updating my fork to the latest version
Rocketknight1 committed Jun 22, 2019
2 parents b8e2a9c + c304593 commit 7c59e32
Showing 42 changed files with 4,856 additions and 2,006 deletions.
13 changes: 8 additions & 5 deletions .circleci/config.yml
@@ -7,20 +7,23 @@ jobs:
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest ftfy spacy
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install spacy ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/ --runslow
- run: python -m pytest -sv tests/ --cov
- run: codecov
build_py2:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:2.7
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest spacy
- run: sudo pip install ftfy==4.4.3
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install spacy ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/ --runslow
- run: python -m pytest -sv tests/ --cov
- run: codecov
workflows:
version: 2
build_and_test:
8 changes: 8 additions & 0 deletions .coveragerc
@@ -0,0 +1,8 @@
[run]
source=pytorch_pretrained_bert
[report]
exclude_lines =
pragma: no cover
raise
except
register_parameter
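As an aside, a minimal sketch of exercising this coverage configuration locally (assuming the coverage and pytest packages are installed; the script is illustrative and not part of the commit, which relies on pytest --cov plus codecov in CI):

```python
# Illustrative local coverage run against the same .coveragerc as CI.
# CI itself uses `python -m pytest -sv tests/ --cov` followed by `codecov`.
import coverage
import pytest

cov = coverage.Coverage(config_file=".coveragerc")  # picks up source=pytorch_pretrained_bert
cov.start()
pytest.main(["-sv", "tests/"])  # run the test suite under measurement
cov.stop()
cov.save()
cov.report()  # lines matching exclude_lines (raise, except, ...) are skipped at report time
```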
284 changes: 269 additions & 15 deletions README.md

Large diffs are not rendered by default.

Binary file added docs/imgs/warmup_constant_schedule.png
Binary file added docs/imgs/warmup_cosine_schedule.png
Binary file added docs/imgs/warmup_linear_schedule.png
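The three images added above appear to plot the learning-rate schedules exposed by the optimization module. A hedged sketch of the curve shapes they presumably depict (illustrative only; not the library's implementation, and the exact decay formulas are assumptions):

```python
import math

# Learning-rate multiplier as a function of training progress in [0, 1].
def warmup_constant(progress, warmup=0.1):
    # Linear ramp from 0 to 1 during warmup, then flat at 1.
    return progress / warmup if progress < warmup else 1.0

def warmup_cosine(progress, warmup=0.1):
    # Linear ramp, then half-cosine decay from 1 toward 0.
    if progress < warmup:
        return progress / warmup
    return 0.5 * (1.0 + math.cos(math.pi * (progress - warmup) / (1.0 - warmup)))

def warmup_linear(progress, warmup=0.1):
    # Linear ramp, then linear decay from 1 toward 0.
    if progress < warmup:
        return progress / warmup
    return max((1.0 - progress) / (1.0 - warmup), 0.0)
```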
310 changes: 310 additions & 0 deletions examples/bertology.py

Large diffs are not rendered by default.

19 changes: 13 additions & 6 deletions examples/lm_finetuning/finetune_on_pregenerated.py
@@ -1,5 +1,6 @@
from argparse import ArgumentParser
from pathlib import Path
import os
import torch
import logging
import json
@@ -12,9 +13,10 @@
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")

@@ -268,7 +270,8 @@ def main():
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
@@ -314,8 +317,7 @@ def main():
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps,
args.warmup_proportion)
lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
@@ -325,8 +327,13 @@
# Save a trained model
logging.info("** ** * Saving fine-tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = args.output_dir / "pytorch_model.bin"
torch.save(model_to_save.state_dict(), str(output_model_file))

output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)


if __name__ == '__main__':
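The key API change in this file: the bare warmup_linear(progress, warmup) helper is replaced by a stateful WarmupLinearSchedule(warmup, t_total) object queried via get_lr inside the fp16 branch. A minimal sketch of such an object (illustrative only; not the library class — the decay formula is an assumption, and the sketch folds the warmup proportion into the constructor rather than passing it to get_lr as the diff does):

```python
class WarmupLinearSketch:
    """Toy stand-in for a warmup-then-linear-decay schedule object."""

    def __init__(self, warmup, t_total):
        self.warmup = warmup    # fraction of steps spent warming up, e.g. 0.1
        self.t_total = t_total  # total number of optimization steps

    def get_lr(self, step):
        progress = step / self.t_total
        if progress < self.warmup:
            return progress / self.warmup  # ramp 0 -> 1
        return max((1.0 - progress) / (1.0 - self.warmup), 0.0)  # decay 1 -> 0

# Usage mirroring the fp16 branch above (hypothetical values):
schedule = WarmupLinearSketch(warmup=0.1, t_total=10000)
lr_this_step = 3e-5 * schedule.get_lr(500)
```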
84 changes: 60 additions & 24 deletions examples/lm_finetuning/pregenerate_training_data.py
@@ -4,11 +4,11 @@
from tempfile import TemporaryDirectory
import shelve

from random import random, randrange, randint, shuffle, choice, sample
from random import random, randrange, randint, shuffle, choice
from pytorch_pretrained_bert.tokenization import BertTokenizer
import numpy as np
import json

import collections

class DocumentDatabase:
def __init__(self, reduce_memory=False):
@@ -98,42 +98,77 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
else:
trunc_tokens.pop()

MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])

def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list):
def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
with several refactors to clean it up and remove a lot of unnecessary variables."""
cand_indices = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indices.append(i)
# Whole Word Masking means that we mask all of the wordpieces
# corresponding to an original word. When a word has been split into
# WordPieces, the first token does not have any marker and any subsequent
# tokens are prefixed with ##. So whenever we see the ## token, we
# append it to the previous set of word indexes.
#
# Note that Whole Word Masking does *not* change the training code
# at all -- we still predict each WordPiece independently, softmaxed
# over the entire vocabulary.
if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
cand_indices[-1].append(i)
else:
cand_indices.append([i])

num_to_mask = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
shuffle(cand_indices)
mask_indices = sorted(sample(cand_indices, num_to_mask))
masked_token_labels = []
for index in mask_indices:
# 80% of the time, replace with [MASK]
if random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
masked_lms = []
covered_indexes = set()
for index_set in cand_indices:
if len(masked_lms) >= num_to_mask:
break
# If adding a whole-word mask would exceed the maximum number of
# predictions, then just skip this candidate.
if len(masked_lms) + len(index_set) > num_to_mask:
continue
is_any_index_covered = False
for index in index_set:
if index in covered_indexes:
is_any_index_covered = True
break
if is_any_index_covered:
continue
for index in index_set:
covered_indexes.add(index)

masked_token = None
# 80% of the time, replace with [MASK]
if random() < 0.8:
masked_token = "[MASK]"
else:
masked_token = choice(vocab_list)
masked_token_labels.append(tokens[index])
# Once we've saved the true label for that token, we can overwrite it with the masked version
tokens[index] = masked_token
# 10% of the time, keep original
if random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = choice(vocab_list)
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
tokens[index] = masked_token

assert len(masked_lms) <= num_to_mask
masked_lms = sorted(masked_lms, key=lambda x: x.index)
mask_indices = [p.index for p in masked_lms]
masked_token_labels = [p.label for p in masked_lms]

return tokens, mask_indices, masked_token_labels


def create_instances_from_document(
doc_database, doc_idx, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_list):
masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""This code is mostly a duplicate of the equivalent function from Google BERT's repo.
However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
@@ -213,7 +248,7 @@ def create_instances_from_document(
segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]

tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_list)
tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)

instance = {
"tokens": tokens,
@@ -235,9 +270,10 @@ def main():
parser.add_argument("--output_dir", type=Path, required=True)
parser.add_argument("--bert_model", type=str, required=True,
choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
"bert-base-multilingual", "bert-base-chinese"])
"bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
parser.add_argument("--do_lower_case", action="store_true")

parser.add_argument("--do_whole_word_mask", action="store_true",
help="Whether to use whole word masking rather than per-WordPiece masking.")
parser.add_argument("--reduce_memory", action="store_true",
help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")

@@ -284,7 +320,7 @@ def main():
doc_instances = create_instances_from_document(
docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
vocab_list=vocab_list)
whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)
doc_instances = [json.dumps(instance) for instance in doc_instances]
for instance in doc_instances:
epoch_file.write(instance + '\n')
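To make the new whole_word_mask grouping concrete, here is a small standalone worked example of how WordPiece tokens are collected into per-word candidate index sets (token values are made up):

```python
# Mirrors the grouping logic added to create_masked_lm_predictions above:
# "##"-prefixed pieces join the previous word's index set, so a whole word is
# masked/kept/replaced as a unit while predictions remain per-WordPiece.
tokens = ["[CLS]", "un", "##believ", "##able", "results", "[SEP]"]

cand_indices = []
for i, token in enumerate(tokens):
    if token in ("[CLS]", "[SEP]"):
        continue
    if cand_indices and token.startswith("##"):
        cand_indices[-1].append(i)   # extend the current word's index set
    else:
        cand_indices.append([i])     # start a new word

print(cand_indices)  # [[1, 2, 3], [4]] -> "unbelievable" is one candidate, "results" another
```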
65 changes: 36 additions & 29 deletions examples/lm_finetuning/simple_lm_finetuning.py
@@ -29,9 +29,10 @@
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
@@ -534,34 +535,37 @@ def main():
model = torch.nn.DataParallel(model)

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
if args.do_train:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)

optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)

global_step = 0
if args.do_train:
@@ -601,7 +605,7 @@ def main():
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
@@ -611,9 +615,12 @@
# Save a trained model
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
if args.do_train:
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
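Both fine-tuning scripts now write WEIGHTS_NAME, CONFIG_NAME, and the vocabulary into args.output_dir. A hedged sketch of how such a directory is typically reloaded (the path is hypothetical, and this assumes from_pretrained accepts a local directory, as documented elsewhere in this repository):

```python
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer

output_dir = "finetuned_lm/"  # hypothetical path, matching args.output_dir above

# Reads the saved config and weight files from the directory.
model = BertForPreTraining.from_pretrained(output_dir)
# Reads the vocab.txt written by tokenizer.save_vocabulary(args.output_dir).
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)

model.eval()  # fine-tuned LM ready for evaluation or further training
```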