Load HF datasets from olmo_data #646

Merged: 11 commits, Jul 10, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   shared memory implementation can be used by passing `use_legacy_shared_mem_impl` to `unshard.py`.
 - Refactor weight initialization. IMPORTANT: this does not maintain backwards-compatibility with older configs; the jobs will still run, but may produce different outputs.
 - Changed the behavior of the Lion optimizer to only record the update cosine similarity when `optimizer.record_update_metrics` is `True` in order to be consistent with the API.
+- Added HF datasets into `olmo_data`, and changed downstream eval to load from the package.

 ### Fixed

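The diff below shows call sites moving from a cache-directory argument to a bare `load_hf_dataset(path, name, split)` call, but the helper itself is not part of this excerpt. As a rough sketch of what a package-data loader along these lines could look like (the `hf_datasets/<path>/<name>/<split>` layout and the `'none'` placeholder are assumptions inferred from the changelog entry, not code from this PR):

```python
# A minimal sketch, not the PR's actual helper: load a dataset that was saved
# with `datasets.save_to_disk` into the olmo_data package. The directory
# layout below is an assumption inferred from the changelog entry.
from importlib.resources import files
from typing import Optional

import datasets


def load_hf_dataset(path: str, name: Optional[str], split: str):
    rel_path = f"hf_datasets/{path}/{name or 'none'}/{split}"
    data_dir = files("olmo_data").joinpath(rel_path)
    if not data_dir.is_dir():
        raise NotADirectoryError(f"{rel_path} is not packaged in olmo_data")
    # Works when olmo_data is installed as a plain directory on disk; a
    # zip-packaged install would need the files extracted first.
    return datasets.load_from_disk(str(data_dir))
```

Whatever the exact implementation, the effect visible throughout the diff is that `datasets_cache_dir` no longer needs to be threaded through every task constructor.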
2 changes: 2 additions & 0 deletions olmo/config.py
@@ -1176,6 +1176,8 @@ class TrainConfig(BaseConfig):

     hf_datasets_cache_dir: Optional[str] = None
     """
+    Deprecated: HF datasets are now stored in `olmo_data.hf_datasets`.
+
     Path to cache directory of HF datasets saved with `datasets.save_to_disk`.
     """

4 changes: 1 addition & 3 deletions olmo/eval/__init__.py
@@ -32,9 +32,7 @@ def build_downstream_evaluator(
     task_class = label_to_task_map[eval_cfg.label]
     if isinstance(task_class, tuple):
         task_class, task_kwargs = task_class
-    ds_eval_dataset = task_class(
-        tokenizer=tokenizer, datasets_cache_dir=train_config.hf_datasets_cache_dir, **task_kwargs
-    ) # type: ignore
+    ds_eval_dataset = task_class(tokenizer=tokenizer, **task_kwargs) # type: ignore
     data_config = eval_cfg.data
     if is_unit_test:
         ds_eval_sampler = None
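For orientation, the resolution logic in the hunk above amounts to the following sketch (`label_to_task_map` is defined in `olmo/eval/downstream.py`; the wrapper function here is illustrative, not part of the PR):

```python
from olmo.eval.downstream import label_to_task_map


def build_task_dataset(label: str, tokenizer):
    """Resolve an eval label the way build_downstream_evaluator now does."""
    task_class = label_to_task_map[label]
    task_kwargs = {}
    # Some entries are (class, kwargs) tuples, e.g. for task variants.
    if isinstance(task_class, tuple):
        task_class, task_kwargs = task_class
    # No datasets_cache_dir is passed: each task loads its packaged data.
    return task_class(tokenizer=tokenizer, **task_kwargs)
```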
95 changes: 54 additions & 41 deletions olmo/eval/downstream.py
@@ -159,7 +159,6 @@ def __init__(
         tokenizer: Tokenizer,
         dataset_path: str,
         dataset_name: Union[str, Sequence[str], None] = None,
-        datasets_cache_dir: Optional[str] = None,
         model_ctx_len: int = 2048,
         split="validation",
         prompts=[None], # List of prompt variants to use
@@ -183,7 +182,7 @@ def __init__(

         dataset_list = []
         for ds_name in dataset_names:
-            dataset = load_hf_dataset(self.dataset_path, ds_name, split, datasets_cache_dir)
+            dataset = load_hf_dataset(self.dataset_path, ds_name, split)
             dataset_list.append(dataset)
         self.dataset = datasets.concatenate_datasets(dataset_list)

@@ -400,13 +399,15 @@ class PIQA(ICLMultiChoiceTaskDataset):
     metric_type = "len_norm"

     def __init__(
-        self, tokenizer, dataset_path="piqa", dataset_name="plain_text", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="piqa",
+        dataset_name="plain_text",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -441,13 +442,15 @@ class HellaSwag(ICLMultiChoiceTaskDataset):
     metric_type = "len_norm"

     def __init__(
-        self, tokenizer, dataset_path="hellaswag", dataset_name=None, datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="hellaswag",
+        dataset_name=None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     @classmethod
@@ -506,14 +509,12 @@ def __init__(
         tokenizer,
         dataset_path="winogrande",
         dataset_name="winogrande_xl",
-        datasets_cache_dir: Optional[str] = None,
     ):
         # all winogrande datasets have same val set
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def prep_examples(self):
@@ -604,13 +605,15 @@ class OpenBookQA(ICLMultiChoiceTaskDataset):
     metric_type = "len_norm"

     def __init__(
-        self, tokenizer, dataset_path="openbookqa", dataset_name="main", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="openbookqa",
+        dataset_name="main",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -642,13 +645,15 @@ class BoolQ(ICLMultiChoiceTaskDataset):
     metric_type = "acc"

     def __init__(
-        self, tokenizer, dataset_path="boolq", dataset_name=None, datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="boolq",
+        dataset_name=None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -690,13 +695,15 @@ class SciQ(ICLMultiChoiceTaskDataset):
     metric_type = "acc"

     def __init__(
-        self, tokenizer, dataset_path="sciq", dataset_name=None, datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="sciq",
+        dataset_name=None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -735,13 +742,15 @@ class ArcEasy(ICLMultiChoiceTaskDataset):
     metric_type = "acc"

     def __init__(
-        self, tokenizer, dataset_path="ai2_arc", dataset_name="ARC-Easy", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="ai2_arc",
+        dataset_name="ARC-Easy",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -777,13 +786,11 @@ def __init__(
         tokenizer,
         dataset_path="ai2_arc",
         dataset_name="ARC-Challenge",
-        datasets_cache_dir: Optional[str] = None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )


@@ -819,13 +826,11 @@ def __init__(
         tokenizer,
         dataset_path="allenai/basic_arithmetic",
         dataset_name=None,
-        datasets_cache_dir: Optional[str] = None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )


@@ -847,13 +852,11 @@ def __init__(
         tokenizer,
         dataset_path="tau/commonsense_qa",
         dataset_name=None,
-        datasets_cache_dir: Optional[str] = None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )


@@ -870,13 +873,15 @@ class SocialIQa(ICLMultiChoiceTaskDataset):
     metric_type = "len_norm"

     def __init__(
-        self, tokenizer, dataset_path="social_i_qa", dataset_name=None, datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="social_i_qa",
+        dataset_name=None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -920,13 +925,15 @@ class COPA(ICLMultiChoiceTaskDataset):
     metric_type = "acc"

     def __init__(
-        self, tokenizer, dataset_path="super_glue", dataset_name="copa", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="super_glue",
+        dataset_name="copa",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -965,13 +972,15 @@ class RTE(ICLMultiChoiceTaskDataset):
     metric_type = "len_norm"

     def __init__(
-        self, tokenizer, dataset_path="glue", dataset_name="rte", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="glue",
+        dataset_name="rte",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -1007,13 +1016,15 @@ class CommitmentBank(ICLMultiChoiceTaskDataset):
     metric_type = "acc"

     def __init__(
-        self, tokenizer, dataset_path="super_glue", dataset_name="cb", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="super_glue",
+        dataset_name="cb",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -1047,13 +1058,15 @@ class MRPC(ICLMultiChoiceTaskDataset):
     metric_type = "f1"

     def __init__(
-        self, tokenizer, dataset_path="glue", dataset_name="mrpc", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="glue",
+        dataset_name="mrpc",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     @classmethod
@@ -1115,13 +1128,15 @@ class SST2(ICLMultiChoiceTaskDataset):
     metric_type = "acc"

     def __init__(
-        self, tokenizer, dataset_path="glue", dataset_name="sst2", datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="glue",
+        dataset_name="sst2",
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     @classmethod
@@ -1243,7 +1258,6 @@ def __init__(
         split="validation",
         prompt_variations=None,
         mc_labels=False,
-        datasets_cache_dir: Optional[str] = None,
     ):
         dataset_names = []
         # Collect the relevant categories
@@ -1270,15 +1284,14 @@ def __init__(
             raise ValueError(f"Unknown prompt variations: {prompt_variations}")
         # Need to grab the dev set for the few-shot prompts
         for name in dataset_names:
-            dev_set = load_hf_dataset(dataset_path, name, "dev", datasets_cache_dir)
+            dev_set = load_hf_dataset(dataset_path, name, "dev")
             self.dev_set[name] = dev_set
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_names,
             split=split,
             prompts=prompts,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -1353,13 +1366,11 @@ def __init__(
         tokenizer,
         dataset_path="trivia_qa",
         dataset_name="rc.wikipedia.nocontext",
-        datasets_cache_dir: Optional[str] = None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
@@ -1388,13 +1399,15 @@ class NaturalQuestionsCELoss(ICLMultiChoiceTaskDataset):
     metric_type = "ce_loss"

     def __init__(
-        self, tokenizer, dataset_path="nq_open", dataset_name=None, datasets_cache_dir: Optional[str] = None
+        self,
+        tokenizer,
+        dataset_path="nq_open",
+        dataset_name=None,
     ):
         super().__init__(
             tokenizer=tokenizer,
             dataset_path=dataset_path,
             dataset_name=dataset_name,
-            datasets_cache_dir=datasets_cache_dir,
         )

     def doc_to_text(self, doc):
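End to end, constructing one of these tasks now needs only a tokenizer; the packaged data is located automatically. A hypothetical usage sketch (the tokenizer file path, and `Tokenizer.from_file` as the construction method, are assumptions, not taken from this PR):

```python
from olmo.eval.downstream import PIQA
from olmo.tokenizer import Tokenizer

# Hypothetical tokenizer setup; use whatever your training config specifies.
tokenizer = Tokenizer.from_file("tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json")
piqa = PIQA(tokenizer=tokenizer)  # note: no datasets_cache_dir argument
print(len(piqa.dataset))  # rows in the packaged validation split
```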