Skip to content

Commit

Permalink
Merge pull request #537 from allenai/AkshitaB-tokenizer-patch
Browse files Browse the repository at this point in the history
Tokenizer patch
  • Loading branch information
AkshitaB authored Apr 9, 2024
2 parents 62c7954 + ecfe090 commit 9d40898
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added commonsense_qa and social_iqa downstream evaluation tasks
- Makes it possible to read from http/https the same way we read from s3/r2.
- Added MMLU multiple choice (A/B/C/D) 5-shot variant downstream tasks
- Fixed the tokenizer configuration used during checkpoint conversion (corrected tokenizer identifier and EOS token id)

### Changed

Expand Down
10 changes: 10 additions & 0 deletions hf_olmo/convert_olmo_to_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil

import torch
from omegaconf import OmegaConf as om

from hf_olmo.configuration_olmo import OLMoConfig
from hf_olmo.modeling_olmo import OLMoForCausalLM
Expand Down Expand Up @@ -91,6 +92,14 @@ def download_remote_checkpoint_and_convert_to_hf(checkpoint_dir: str, local_dir:
return local_model_path


def fix_bad_tokenizer(checkpoint_dir: str):
    """Patch a checkpoint's ``config.yaml`` to reference the corrected tokenizer.

    Rewrites the tokenizer identifier and the model's EOS token id in place so
    that the subsequent HF conversion picks up the right vocabulary.

    :param checkpoint_dir: Directory containing the checkpoint's ``config.yaml``.
    """
    config_path = os.path.join(checkpoint_dir, "config.yaml")
    config = om.load(config_path)
    # Point at the fixed tokenizer and its matching end-of-sequence token id.
    config["tokenizer"]["identifier"] = "allenai/gpt-neox-olmo-dolma-v1_5"
    config["model"]["eos_token_id"] = 50279
    # Persist the patched config back to the same file.
    om.save(config, config_path)


def main():
parser = argparse.ArgumentParser(
description="Adds a config.json to the checkpoint directory, and creates pytorch_model.bin, "
Expand All @@ -109,6 +118,7 @@ def main():
)

args = parser.parse_args()
fix_bad_tokenizer(args.checkpoint_dir)
convert_checkpoint(args.checkpoint_dir, args.ignore_olmo_compatibility)


Expand Down

0 comments on commit 9d40898

Please sign in to comment.