GPT inference long context (#6687)
* deb infer

Signed-off-by: Evelina <[email protected]>

* deb infer

Signed-off-by: Evelina <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* clean up

Signed-off-by: Evelina <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* dont do maxlen trunc for non abs pos emb

Signed-off-by: Evelina <[email protected]>

* dont do maxlen trunc for non abs pos emb

Signed-off-by: Evelina <[email protected]>

* convert for training only

Signed-off-by: Evelina <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add eval test, add save .nemo for sft model

Signed-off-by: Evelina <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* jenkins format fix

Signed-off-by: Evelina <[email protected]>

* update jenkins

Signed-off-by: Evelina <[email protected]>

* update jenkins

Signed-off-by: Evelina <[email protected]>

* fix jenkins

Signed-off-by: Evelina <[email protected]>

* remove test, ci timeout

Signed-off-by: Evelina <[email protected]>

* fix for m_gpt_eval.py

Signed-off-by: Evelina <[email protected]>

* jenkins test

Signed-off-by: Evelina <[email protected]>

* fix gpt_eval with sft model

Signed-off-by: Evelina <[email protected]>

* revert jenkins

Signed-off-by: Evelina <[email protected]>

* keep float conversion for model.generate()

Signed-off-by: Evelina <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix inference dtype

Signed-off-by: Evelina <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Evelina <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
ekmb and pre-commit-ci[bot] authored Jun 2, 2023
1 parent a420f90 commit 9827c9b
Showing 10 changed files with 62 additions and 15 deletions.
25 changes: 24 additions & 1 deletion Jenkinsfile
@@ -3407,7 +3407,30 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.num_nodes=1"
}
}

stage('L2: Megatron GPT SFT Eval (inference seq len > training seq len)') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps{
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \
model.peft.restore_from_path=null \
model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \
model.data.test_ds.names=['test'] \
model.data.test_ds.global_batch_size=1 \
model.data.test_ds.micro_batch_size=1 \
model.data.test_ds.tokens_to_generate=30 \
model.data.test_ds.max_seq_length=6000 \
inference.greedy=True \
inference.repetition_penalty=1.0 \
inference.outfile_path='examples/nlp/language_modeling/out.jsonl' && \
rm -rf examples/nlp/language_modeling/out.jsonl"
}
}
stage('L2: Megatron GPT Prompt Tuning TP1 PP1') {
when {
anyOf {
2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -263,7 +263,7 @@ def main(cfg) -> None:
print(response)
print("***************************")

# Second method of running text generation, call trainer.predict
# Second method of running text generation, call trainer.predict [recommended]
ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
request_dl = DataLoader(dataset=ds, batch_size=2)
config = OmegaConf.to_container(cfg.inference)
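The hunk is truncated here; as a hedged sketch (not the verbatim file), the recommended trainer.predict path continues roughly like this, reusing the ds, request_dl, and config objects built above:

# Sketch only: the inference config set here is read back inside predict_step,
# which is where the list-vs-dict batch handling added further below comes in.
model.set_inference_config(config)
response = trainer.predict(model, request_dl)
print(response)
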
@@ -129,4 +129,4 @@ inference:
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False
outfile_path: /home/adithyare/exp/foo.txt
outfile_path: output.txt
@@ -29,7 +29,7 @@ exp_manager:
monitor: validation_${model.data.validation_ds.metric.name}
save_top_k: 2
mode: max
save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below,
save_nemo_on_train_end: False
filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
save_best_model: True
7 changes: 7 additions & 0 deletions examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
@@ -61,6 +61,8 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
gpt_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.0)
gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0)
gpt_cfg.ffn_dropout = cfg.model.ffn_dropout
sft_cls = MegatronGPTSFTModel
gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}"

# This is needed when modifying a hparam file directly to load `.ckpt` files.
# This is not needed to modify the cfg in `.nemo` files.
@@ -167,6 +169,10 @@ def main(cfg) -> None:

trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision

if cfg.model.restore_from_path:
save_restore_connector = NLPSaveRestoreConnector()
if os.path.isdir(cfg.model.restore_from_path):
@@ -177,6 +183,7 @@ def main(cfg) -> None:
return_config=True,
save_restore_connector=save_restore_connector,
)
gpt_cfg = _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False)
model = load_from_nemo(MegatronGPTSFTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config)
else:
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
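As a rough illustration of what the new target override buys (the printed class path is an assumption about NeMo's module layout, not taken from this diff): a .nemo saved with this config restores as the SFT class rather than the base GPT model, which is what lets the new Jenkins eval stage load megatron_gpt_rope_sft.nemo directly.

# Illustrative only: what the override stores in the saved config.
sft_cls = MegatronGPTSFTModel
print(f"{sft_cls.__module__}.{sft_cls.__name__}")
# -> something like
# nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model.MegatronGPTSFTModel
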
@@ -169,7 +169,9 @@ def _process_example(self, example):
tokenized_text = pre_pad + self.tokenizer.text_to_ids(text)
context_ids = pre_pad + self.tokenizer.text_to_ids(context)
answer_ids = tokenized_text[len(context_ids) :]
total_ids = len(context_ids) + len(answer_ids)

# for the long context cases, collate_fn includes self.tokens_to_generate for padding
total_ids = len(context_ids) + max(len(answer_ids), self.tokens_to_generate)
if self.add_bos:
total_ids += 1
if self.add_sep:
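A small, hypothetical sketch of the length budget this enforces (helper and numbers are illustrative, not NeMo code): reserving tokens_to_generate in total_ids keeps the later max_seq_length truncation from eating the room needed for generation on long contexts.

def length_budget(context_len, answer_len, tokens_to_generate, add_bos=False, add_eos=False):
    # Mirror of the rule above: budget the answer side up to tokens_to_generate.
    total = context_len + max(answer_len, tokens_to_generate)
    return total + int(add_bos) + int(add_eos)

length_budget(5000, 10, tokens_to_generate=30)  # 5030 -> room for generation is reserved
length_budget(5000, 10, tokens_to_generate=0)   # 5010 -> old behaviour, context + answer only
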
@@ -550,7 +550,13 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int]
return compute_prob_response
else:
del inference_config['compute_logprob']
inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda())

# for megatron_gpt_eval.py
if isinstance(batch, list):
inference_config['inputs'] = batch
else:
# peft_eval.py
inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda())
return generate(self, **inference_config)

def write_predictions_to_file(self, outputs, output_file_path_prefix):
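For context, the two batch shapes this branch now distinguishes look roughly like the following (tensor sizes and prompts are made up):

import torch

# megatron_gpt_eval.py (trainer.predict over a prompt dataset): a plain list of strings.
batch_from_gpt_eval = ["Tell me a joke.", "What is the capital of France?"]

# megatron_gpt_peft_eval.py: a dict of padded context token ids plus their true lengths.
batch_from_peft_eval = {
    "contexts": torch.zeros(2, 6000, dtype=torch.long),
    "context_lengths": torch.tensor([5120, 4096], dtype=torch.long),
}
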
2 changes: 1 addition & 1 deletion nemo/collections/nlp/modules/common/megatron/module.py
@@ -290,7 +290,7 @@ def forward(self, *inputs, **kwargs):
if getattr(self.module, 'pre_process', True):
inputs = fp32_to_float16(inputs, self.float16_converter)
outputs = self.module(*inputs, **kwargs)
if parallel_state.is_pipeline_last_stage():
if parallel_state.is_pipeline_last_stage() and self.training:
outputs = float16_to_fp32(outputs)
return outputs

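A minimal sketch (not the NeMo helper) of the rule after this change: the fp32 upcast on the last pipeline stage now only happens during training, so model.generate() hands half-precision logits to the dtype-matched buffers added in text_generation_utils.py below.

import torch

def maybe_upcast(outputs: torch.Tensor, training: bool) -> torch.Tensor:
    # Training: cast fp16/bf16 logits back to fp32 for the loss computation.
    # Inference (generate()): keep the half-precision outputs as-is.
    if training and outputs.dtype in (torch.float16, torch.bfloat16):
        return outputs.float()
    return outputs
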
@@ -181,8 +181,11 @@ def __init__(self, model):

def clip_max_len(self, maxlen: int) -> int:
""" clip the max len based on the LM model max sequence length"""
if maxlen > self.model.cfg.encoder_seq_length + 1:
maxlen = self.model.cfg.encoder_seq_length + 1

# for positional embedding types that allow length extrapolation, don't clip the max length
if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute":
if maxlen > self.model.cfg.encoder_seq_length + 1:
maxlen = self.model.cfg.encoder_seq_length + 1
return maxlen

def init_batch(self, context_tokens: torch.Tensor, context_length: int):
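A standalone sketch of the new clipping rule (function name and numbers are illustrative): only learned absolute position embeddings get clipped, since their embedding table has no rows past the training length; RoPE/ALiBi-style embeddings can extrapolate, which is what the new Jenkins stage exercises with max_seq_length=6000.

def clip_max_len_sketch(maxlen, encoder_seq_length, position_embedding_type="learned_absolute"):
    # Clip only when the embedding table cannot represent longer positions.
    if position_embedding_type == "learned_absolute":
        maxlen = min(maxlen, encoder_seq_length + 1)
    return maxlen

clip_max_len_sketch(6030, 2048)                                  # -> 2049
clip_max_len_sketch(6030, 2048, position_embedding_type="rope")  # -> 6030
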
18 changes: 12 additions & 6 deletions nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -421,9 +421,15 @@ def synced_generate(
if parallel_state.is_pipeline_first_stage():
src = parallel_state.get_pipeline_model_parallel_last_rank()
group = parallel_state.get_embedding_group()
output_logits = torch.empty(
tokens.size(0), context_length - 1, dtype=torch.float32, device=torch.device("cuda")
)

precision = model._trainer.precision
if precision in [16, "16"]:
dtype = torch.float16
elif precision == "bf16":
dtype = torch.bfloat16
else:
dtype = torch.float32
output_logits = torch.empty(tokens.size(0), context_length - 1, dtype=dtype, device=torch.device("cuda"))
torch.distributed.broadcast(output_logits, src, group)

if all_probs:
@@ -433,7 +439,7 @@
tokens.size(0),
context_length - 1,
model.padded_vocab_size,
dtype=torch.float32,
dtype=dtype,
device=torch.device("cuda"),
)
torch.distributed.broadcast(full_logits, src, group)
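A hedged sketch of the precision-to-dtype mapping introduced in this hunk (the helper is hypothetical; the precision values mirror the diff): because the fp32 upcast is now skipped at inference, the rank receiving the broadcast has to allocate its buffers in the same dtype the last pipeline stage emits.

import torch

def logits_dtype(trainer_precision):
    # Hypothetical helper mirroring the mapping above.
    if trainer_precision in (16, "16"):
        return torch.float16
    if trainer_precision == "bf16":
        return torch.bfloat16
    return torch.float32

dtype = logits_dtype("bf16")
output_logits = torch.empty(4, 511, dtype=dtype, device="cuda")  # must match the sender's dtype
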
@@ -667,10 +673,10 @@ def sample_sequence_batch(
output = inference_strategy.forward_step(batch, tensor_shape)

if parallel_state.is_pipeline_last_stage():
output = output[0]['logits'].float()
output = output[0]['logits']

output = tensor_parallel.gather_from_tensor_model_parallel_region(output)
assert output is not None
output = output.float()
logits = output[:, -1].view(batch_size, -1).contiguous()

# make sure it will generate at least min_length