Skip to content

Commit

Permalink
Merge pull request #537 from allenai/AkshitaB-tokenizer-patch
Browse files Browse the repository at this point in the history
Tokenizer patch
  • Loading branch information
AkshitaB authored Apr 9, 2024
2 parents 62c7954 + ecfe090 commit 9d40898
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added commonsense_qa and social_iqa downstream evaluation tasks
- Makes it possible to read from http/https the same way we read from s3/r2.
- Added MMLU multiple choice (A/B/C/D) 5-shot variant downstream tasks
- Fixed the tokenizer configuration used during checkpoint conversion (corrected tokenizer identifier and EOS token id)

### Changed

Expand Down
10 changes: 10 additions & 0 deletions hf_olmo/convert_olmo_to_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil

import torch
from omegaconf import OmegaConf as om

from hf_olmo.configuration_olmo import OLMoConfig
from hf_olmo.modeling_olmo import OLMoForCausalLM
Expand Down Expand Up @@ -91,6 +92,14 @@ def download_remote_checkpoint_and_convert_to_hf(checkpoint_dir: str, local_dir:
return local_model_path


def fix_bad_tokenizer(checkpoint_dir: str):
    """Patch a checkpoint's ``config.yaml`` to reference the corrected tokenizer.

    Rewrites the tokenizer identifier and the model's EOS token id in place so
    that the subsequent HF conversion picks up the right vocabulary.

    :param checkpoint_dir: Directory containing the checkpoint's ``config.yaml``.
    """
    config_path = os.path.join(checkpoint_dir, "config.yaml")
    config = om.load(config_path)
    # Point at the fixed tokenizer and its matching end-of-sequence token id.
    config["tokenizer"]["identifier"] = "allenai/gpt-neox-olmo-dolma-v1_5"
    config["model"]["eos_token_id"] = 50279
    # Persist the patched config back to the same file.
    om.save(config, config_path)


def main():
parser = argparse.ArgumentParser(
description="Adds a config.json to the checkpoint directory, and creates pytorch_model.bin, "
Expand All @@ -109,6 +118,7 @@ def main():
)

args = parser.parse_args()
fix_bad_tokenizer(args.checkpoint_dir)
convert_checkpoint(args.checkpoint_dir, args.ignore_olmo_compatibility)


Expand Down

0 comments on commit 9d40898

Please sign in to comment.