diff --git a/CHANGELOG.md b/CHANGELOG.md index f23af2e02..d1b546191 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 shared memory implementation can be used by passing `use_legacy_shared_mem_impl` to `unshard.py`. - Refactor weight initialization. IMPORTANT: this does not maintain backwards-compatibility with older configs; the jobs will still run, but may produce different outputs. - Changed the behavior of the Lion optimizer to only record the update cosine similarity when `optimizer.record_update_metrics` is `True` in order to be consistent with the API. +- Added HF datasets to `olmo_data`, and changed the downstream evals to load data from the package. ### Fixed diff --git a/olmo/config.py b/olmo/config.py index 3b49e3250..a0b09a21f 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -1176,6 +1176,8 @@ class TrainConfig(BaseConfig): hf_datasets_cache_dir: Optional[str] = None """ + Deprecated; HF datasets are now stored in `olmo_data.hf_datasets`. + Path to cache directory of HF datasets saved with `datasets.save_to_disk`. """ diff --git a/olmo/eval/__init__.py b/olmo/eval/__init__.py index b2d203012..bc8313c78 100644 --- a/olmo/eval/__init__.py +++ b/olmo/eval/__init__.py @@ -32,9 +32,7 @@ def build_downstream_evaluator( task_class = label_to_task_map[eval_cfg.label] if isinstance(task_class, tuple): task_class, task_kwargs = task_class - ds_eval_dataset = task_class( - tokenizer=tokenizer, datasets_cache_dir=train_config.hf_datasets_cache_dir, **task_kwargs - ) # type: ignore + ds_eval_dataset = task_class(tokenizer=tokenizer, **task_kwargs) # type: ignore data_config = eval_cfg.data if is_unit_test: ds_eval_sampler = None diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index d340d7786..a8da366e9 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -159,7 +159,6 @@ def __init__( tokenizer: Tokenizer, dataset_path: str, dataset_name: Union[str, Sequence[str], None] = None, - datasets_cache_dir: Optional[str] = None, model_ctx_len: int = 2048, split="validation", prompts=[None], # List of prompt variants to use @@ -183,7 +182,7 @@ def __init__( dataset_list = [] for ds_name in dataset_names: - dataset = load_hf_dataset(self.dataset_path, ds_name, split, datasets_cache_dir) + dataset = load_hf_dataset(self.dataset_path, ds_name, split) dataset_list.append(dataset) self.dataset = datasets.concatenate_datasets(dataset_list) @@ -400,13 +399,15 @@ class PIQA(ICLMultiChoiceTaskDataset): metric_type = "len_norm" def __init__( - self, tokenizer, dataset_path="piqa", dataset_name="plain_text", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="piqa", + dataset_name="plain_text", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -441,13 +442,15 @@ class HellaSwag(ICLMultiChoiceTaskDataset): metric_type = "len_norm" def __init__( - self, tokenizer, dataset_path="hellaswag", dataset_name=None, datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="hellaswag", + dataset_name=None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) @classmethod @@ -506,14 +509,12 @@ def __init__( tokenizer, dataset_path="winogrande", dataset_name="winogrande_xl", - datasets_cache_dir: Optional[str] = None, ): # all winogrande datasets have same
val set super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def prep_examples(self): @@ -604,13 +605,15 @@ class OpenBookQA(ICLMultiChoiceTaskDataset): metric_type = "len_norm" def __init__( - self, tokenizer, dataset_path="openbookqa", dataset_name="main", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="openbookqa", + dataset_name="main", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -642,13 +645,15 @@ class BoolQ(ICLMultiChoiceTaskDataset): metric_type = "acc" def __init__( - self, tokenizer, dataset_path="boolq", dataset_name=None, datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="boolq", + dataset_name=None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -690,13 +695,15 @@ class SciQ(ICLMultiChoiceTaskDataset): metric_type = "acc" def __init__( - self, tokenizer, dataset_path="sciq", dataset_name=None, datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="sciq", + dataset_name=None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -735,13 +742,15 @@ class ArcEasy(ICLMultiChoiceTaskDataset): metric_type = "acc" def __init__( - self, tokenizer, dataset_path="ai2_arc", dataset_name="ARC-Easy", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="ai2_arc", + dataset_name="ARC-Easy", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -777,13 +786,11 @@ def __init__( tokenizer, dataset_path="ai2_arc", dataset_name="ARC-Challenge", - datasets_cache_dir: Optional[str] = None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) @@ -819,13 +826,11 @@ def __init__( tokenizer, dataset_path="allenai/basic_arithmetic", dataset_name=None, - datasets_cache_dir: Optional[str] = None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) @@ -847,13 +852,11 @@ def __init__( tokenizer, dataset_path="tau/commonsense_qa", dataset_name=None, - datasets_cache_dir: Optional[str] = None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) @@ -870,13 +873,15 @@ class SocialIQa(ICLMultiChoiceTaskDataset): metric_type = "len_norm" def __init__( - self, tokenizer, dataset_path="social_i_qa", dataset_name=None, datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="social_i_qa", + dataset_name=None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -920,13 +925,15 @@ class COPA(ICLMultiChoiceTaskDataset): metric_type = "acc" def __init__( - self, tokenizer, dataset_path="super_glue", dataset_name="copa", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="super_glue", + dataset_name="copa", ): super().__init__( tokenizer=tokenizer, 
dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -965,13 +972,15 @@ class RTE(ICLMultiChoiceTaskDataset): metric_type = "len_norm" def __init__( - self, tokenizer, dataset_path="glue", dataset_name="rte", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="glue", + dataset_name="rte", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -1007,13 +1016,15 @@ class CommitmentBank(ICLMultiChoiceTaskDataset): metric_type = "acc" def __init__( - self, tokenizer, dataset_path="super_glue", dataset_name="cb", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="super_glue", + dataset_name="cb", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -1047,13 +1058,15 @@ class MRPC(ICLMultiChoiceTaskDataset): metric_type = "f1" def __init__( - self, tokenizer, dataset_path="glue", dataset_name="mrpc", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="glue", + dataset_name="mrpc", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) @classmethod @@ -1115,13 +1128,15 @@ class SST2(ICLMultiChoiceTaskDataset): metric_type = "acc" def __init__( - self, tokenizer, dataset_path="glue", dataset_name="sst2", datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="glue", + dataset_name="sst2", ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) @classmethod @@ -1243,7 +1258,6 @@ def __init__( split="validation", prompt_variations=None, mc_labels=False, - datasets_cache_dir: Optional[str] = None, ): dataset_names = [] # Collect the relevant categories @@ -1270,7 +1284,7 @@ def __init__( raise ValueError(f"Unknown prompt variations: {prompt_variations}") # Need to grab the dev set for the few-shot prompts for name in dataset_names: - dev_set = load_hf_dataset(dataset_path, name, "dev", datasets_cache_dir) + dev_set = load_hf_dataset(dataset_path, name, "dev") self.dev_set[name] = dev_set super().__init__( tokenizer=tokenizer, @@ -1278,7 +1292,6 @@ def __init__( dataset_name=dataset_names, split=split, prompts=prompts, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -1353,13 +1366,11 @@ def __init__( tokenizer, dataset_path="trivia_qa", dataset_name="rc.wikipedia.nocontext", - datasets_cache_dir: Optional[str] = None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): @@ -1388,13 +1399,15 @@ class NaturalQuestionsCELoss(ICLMultiChoiceTaskDataset): metric_type = "ce_loss" def __init__( - self, tokenizer, dataset_path="nq_open", dataset_name=None, datasets_cache_dir: Optional[str] = None + self, + tokenizer, + dataset_path="nq_open", + dataset_name=None, ): super().__init__( tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name, - datasets_cache_dir=datasets_cache_dir, ) def doc_to_text(self, doc): diff --git a/olmo/util.py b/olmo/util.py index 56bffd26a..c320b7a26 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -26,6 +26,8 @@ from rich.text import Text from rich.traceback import 
Traceback +from olmo_data.data import get_data_path + +from .aliases import PathOrStr from .exceptions import ( OLMoCliError, @@ -34,14 +36,7 @@ OLMoNetworkError, OLMoThreadError, ) -from .torch_util import ( - barrier, - get_fs_local_rank, - get_global_rank, - get_local_rank, - get_node_rank, - is_distributed, -) +from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed try: from functools import cache @@ -656,67 +651,44 @@ def _http_get_bytes_range(scheme: str, host_name: str, path: str, bytes_start: i return result -def _load_hf_dataset_from_disk(hf_path: str, name: Optional[str], split: str, datasets_dir: str): - dataset_path = os.path.join(datasets_dir, hf_path, name or "none", split) - return datasets.load_from_disk(dataset_path) - - -def _save_hf_dataset_to_disk( +def save_hf_dataset_to_disk( dataset: datasets.DatasetDict | datasets.Dataset, hf_path: str, name: Optional[str], split: str, - datasets_dir: str, + datasets_dir: PathOrStr, ): - dataset_path = os.path.join(datasets_dir, hf_path, name or "none", split) - return dataset.save_to_disk(dataset_path) + """ + Saves an HF dataset to disk under `datasets_dir`. It can be used to add an HF dataset + to `olmo_data` as follows: + ``` + import datasets -def load_hf_dataset(path: str, name: Optional[str], split: str, datasets_cache_dir: Optional[str] = None): - dataset = None + from olmo.util import save_hf_dataset_to_disk - # First try to load dataset on only FS rank 0, to avoid unnecessary network load. - # This will hopefully cache the dataset for use in other FS ranks. - if get_fs_local_rank() == 0: - # Try get dataset from disk. - if datasets_cache_dir is not None: - try: - dataset = _load_hf_dataset_from_disk(path, name, split, datasets_cache_dir) - except FileNotFoundError: - log.info( - "Path %s name %s split %s not present in local dir %s, loading from online", - path, - name, - split, - datasets_cache_dir, - ) + path, name, split = ... - # Get dataset from online if not available on disk - if dataset is None: - dataset = datasets.load_dataset( - path=path, - name=name, - split=split, - trust_remote_code=True, + dataset = datasets.load_dataset(path, name=name, split=split) + save_hf_dataset_to_disk(dataset, path, name, split, "olmo_data/hf_datasets") + ``` + """ + dataset_path = Path(datasets_dir) / hf_path / (name or "none") / split + return dataset.save_to_disk(str(dataset_path)) + + +def load_hf_dataset(path: str, name: Optional[str], split: str): + """ + Loads a HuggingFace dataset. The dataset is assumed to have been saved using + `save_hf_dataset_to_disk` and to be located in `olmo_data/hf_datasets`.
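+ + For example, a minimal usage sketch that loads the ARC-Challenge validation split added to the package in this change (under `olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation`): + ``` + from olmo.util import load_hf_dataset + + dataset = load_hf_dataset("ai2_arc", "ARC-Challenge", "validation") + ```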
+ """ + dataset_rel_path = os.path.join("hf_datasets", path, name or "none", split) + with get_data_path(dataset_rel_path) as dataset_path: + if not dataset_path.is_dir(): + raise NotADirectoryError( + f"HF dataset {path} name {name} split {split} not found in directory {dataset_rel_path}" ) - assert isinstance(dataset, (datasets.DatasetDict, datasets.Dataset)) - if datasets_cache_dir is not None: - _save_hf_dataset_to_disk(dataset, path, name, split, datasets_cache_dir) - barrier() - - # Dataset is loaded in FS rank 0 - if dataset is not None: - return dataset - - # Load dataset on non-zero FS ranks - if datasets_cache_dir is not None: - return _load_hf_dataset_from_disk(path, name, split, datasets_cache_dir) - return datasets.load_dataset( - path=path, - name=name, - split=split, - trust_remote_code=True, - ) + return datasets.load_from_disk(str(dataset_path)) def default_thread_count() -> int: diff --git a/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..f6f429f77 Binary files /dev/null and b/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json b/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json new file mode 100644 index 000000000..50dbf3b97 --- /dev/null +++ b/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json @@ -0,0 +1,79 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "ARC-Challenge", + "dataset_name": "ai2_arc", + "dataset_size": 821931, + "description": "", + "download_checksums": { + "hf://datasets/ai2_arc@210d026faf9955653af8916fad021475a3f00453/ARC-Challenge/train-00000-of-00001.parquet": { + "num_bytes": 189909, + "checksum": null + }, + "hf://datasets/ai2_arc@210d026faf9955653af8916fad021475a3f00453/ARC-Challenge/test-00000-of-00001.parquet": { + "num_bytes": 203808, + "checksum": null + }, + "hf://datasets/ai2_arc@210d026faf9955653af8916fad021475a3f00453/ARC-Challenge/validation-00000-of-00001.parquet": { + "num_bytes": 55743, + "checksum": null + } + }, + "download_size": 449460, + "features": { + "id": { + "dtype": "string", + "_type": "Value" + }, + "question": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "text": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "Sequence" + }, + "answerKey": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1271391, + "splits": { + "train": { + "name": "train", + "num_bytes": 349760, + "num_examples": 1119, + "dataset_name": "ai2_arc" + }, + "test": { + "name": "test", + "num_bytes": 375511, + "num_examples": 1172, + "dataset_name": "ai2_arc" + }, + "validation": { + "name": "validation", + "num_bytes": 96660, + "num_examples": 299, + "dataset_name": "ai2_arc" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json b/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json new file mode 100644 index 000000000..9b1d2ba71 --- /dev/null +++ b/olmo_data/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } 
+ ], + "_fingerprint": "5336412f86bd5bc2", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..e060f69d3 Binary files /dev/null and b/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/dataset_info.json b/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/dataset_info.json new file mode 100644 index 000000000..5018cf4d8 --- /dev/null +++ b/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/dataset_info.json @@ -0,0 +1,79 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "ARC-Easy", + "dataset_name": "ai2_arc", + "dataset_size": 1433908, + "description": "", + "download_checksums": { + "hf://datasets/ai2_arc@210d026faf9955653af8916fad021475a3f00453/ARC-Easy/train-00000-of-00001.parquet": { + "num_bytes": 330598, + "checksum": null + }, + "hf://datasets/ai2_arc@210d026faf9955653af8916fad021475a3f00453/ARC-Easy/test-00000-of-00001.parquet": { + "num_bytes": 346257, + "checksum": null + }, + "hf://datasets/ai2_arc@210d026faf9955653af8916fad021475a3f00453/ARC-Easy/validation-00000-of-00001.parquet": { + "num_bytes": 86080, + "checksum": null + } + }, + "download_size": 762935, + "features": { + "id": { + "dtype": "string", + "_type": "Value" + }, + "question": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "text": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "Sequence" + }, + "answerKey": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2196843, + "splits": { + "train": { + "name": "train", + "num_bytes": 619000, + "num_examples": 2251, + "dataset_name": "ai2_arc" + }, + "test": { + "name": "test", + "num_bytes": 657514, + "num_examples": 2376, + "dataset_name": "ai2_arc" + }, + "validation": { + "name": "validation", + "num_bytes": 157394, + "num_examples": 570, + "dataset_name": "ai2_arc" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/state.json b/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/state.json new file mode 100644 index 000000000..d38e28802 --- /dev/null +++ b/olmo_data/hf_datasets/ai2_arc/ARC-Easy/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e5c473da5a36fe31", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..973f2a774 Binary files /dev/null and b/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/dataset_info.json b/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/dataset_info.json new file mode 
100644 index 000000000..9853ee47f --- /dev/null +++ b/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/dataset_info.json @@ -0,0 +1,66 @@ +{ + "builder_name": "json", + "citation": "", + "config_name": "default", + "dataset_name": "basic_arithmetic", + "dataset_size": 354087, + "description": "", + "download_checksums": { + "hf://datasets/allenai/basic_arithmetic@2d2dd39418ba88d2d4c8dbe6619050229ed4fe2f/validation.jsonl": { + "num_bytes": 564087, + "checksum": null + } + }, + "download_size": 564087, + "features": { + "id": { + "dtype": "string", + "_type": "Value" + }, + "question": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "text": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "label": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + } + }, + "answerKey": { + "dtype": "string", + "_type": "Value" + }, + "type_tag": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 918174, + "splits": { + "validation": { + "name": "validation", + "num_bytes": 354087, + "num_examples": 3000, + "dataset_name": "basic_arithmetic" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/state.json b/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/state.json new file mode 100644 index 000000000..72d1efb7f --- /dev/null +++ b/olmo_data/hf_datasets/allenai/basic_arithmetic/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7d1f788fe546eab0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/boolq/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/boolq/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..9b89d60cf Binary files /dev/null and b/olmo_data/hf_datasets/boolq/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/boolq/none/validation/dataset_info.json b/olmo_data/hf_datasets/boolq/none/validation/dataset_info.json new file mode 100644 index 000000000..22a3ca1cb --- /dev/null +++ b/olmo_data/hf_datasets/boolq/none/validation/dataset_info.json @@ -0,0 +1,56 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "boolq", + "dataset_size": 7827766, + "description": "", + "download_checksums": { + "hf://datasets/boolq@35b264d03638db9f4ce671b711558bf7ff0f80d5/data/train-00000-of-00001.parquet": { + "num_bytes": 3685146, + "checksum": null + }, + "hf://datasets/boolq@35b264d03638db9f4ce671b711558bf7ff0f80d5/data/validation-00000-of-00001.parquet": { + "num_bytes": 1257630, + "checksum": null + } + }, + "download_size": 4942776, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "bool", + "_type": "Value" + }, + "passage": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 12770542, + "splits": { + "train": { + "name": "train", + "num_bytes": 5829584, + "num_examples": 9427, + "dataset_name": "boolq" + }, + "validation": { + "name": "validation", + "num_bytes": 1998182, + "num_examples": 3270, + "dataset_name": "boolq" + } + }, + 
"version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/boolq/none/validation/state.json b/olmo_data/hf_datasets/boolq/none/validation/state.json new file mode 100644 index 000000000..131ee5db0 --- /dev/null +++ b/olmo_data/hf_datasets/boolq/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "479c1c569ebf02ea", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/glue/mrpc/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/glue/mrpc/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..c9cedfdc9 Binary files /dev/null and b/olmo_data/hf_datasets/glue/mrpc/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/glue/mrpc/validation/dataset_info.json b/olmo_data/hf_datasets/glue/mrpc/validation/dataset_info.json new file mode 100644 index 000000000..2a08de533 --- /dev/null +++ b/olmo_data/hf_datasets/glue/mrpc/validation/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "mrpc", + "dataset_name": "glue", + "dataset_size": 1493584, + "description": "", + "download_checksums": { + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/mrpc/train-00000-of-00001.parquet": { + "num_bytes": 649281, + "checksum": null + }, + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/mrpc/validation-00000-of-00001.parquet": { + "num_bytes": 75678, + "checksum": null + }, + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/mrpc/test-00000-of-00001.parquet": { + "num_bytes": 308441, + "checksum": null + } + }, + "download_size": 1033400, + "features": { + "sentence1": { + "dtype": "string", + "_type": "Value" + }, + "sentence2": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "names": [ + "not_equivalent", + "equivalent" + ], + "_type": "ClassLabel" + }, + "idx": { + "dtype": "int32", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2526984, + "splits": { + "train": { + "name": "train", + "num_bytes": 944761, + "num_examples": 3668, + "dataset_name": "glue" + }, + "validation": { + "name": "validation", + "num_bytes": 105981, + "num_examples": 408, + "dataset_name": "glue" + }, + "test": { + "name": "test", + "num_bytes": 442842, + "num_examples": 1725, + "dataset_name": "glue" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/glue/mrpc/validation/state.json b/olmo_data/hf_datasets/glue/mrpc/validation/state.json new file mode 100644 index 000000000..1f12b74f5 --- /dev/null +++ b/olmo_data/hf_datasets/glue/mrpc/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d505079e8646ba79", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/glue/rte/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/glue/rte/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..a8c63242f Binary files /dev/null and 
b/olmo_data/hf_datasets/glue/rte/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/glue/rte/validation/dataset_info.json b/olmo_data/hf_datasets/glue/rte/validation/dataset_info.json new file mode 100644 index 000000000..004d774ed --- /dev/null +++ b/olmo_data/hf_datasets/glue/rte/validation/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "rte", + "dataset_name": "glue", + "dataset_size": 1913545, + "description": "", + "download_checksums": { + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/rte/train-00000-of-00001.parquet": { + "num_bytes": 583976, + "checksum": null + }, + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/rte/validation-00000-of-00001.parquet": { + "num_bytes": 69020, + "checksum": null + }, + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/rte/test-00000-of-00001.parquet": { + "num_bytes": 621413, + "checksum": null + } + }, + "download_size": 1274409, + "features": { + "sentence1": { + "dtype": "string", + "_type": "Value" + }, + "sentence2": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "names": [ + "entailment", + "not_entailment" + ], + "_type": "ClassLabel" + }, + "idx": { + "dtype": "int32", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 3187954, + "splits": { + "train": { + "name": "train", + "num_bytes": 847944, + "num_examples": 2490, + "dataset_name": "glue" + }, + "validation": { + "name": "validation", + "num_bytes": 90798, + "num_examples": 277, + "dataset_name": "glue" + }, + "test": { + "name": "test", + "num_bytes": 974803, + "num_examples": 3000, + "dataset_name": "glue" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/glue/rte/validation/state.json b/olmo_data/hf_datasets/glue/rte/validation/state.json new file mode 100644 index 000000000..fbacb5ef8 --- /dev/null +++ b/olmo_data/hf_datasets/glue/rte/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "38f466bd379e51fb", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/glue/sst2/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/glue/sst2/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..0c6a563b0 Binary files /dev/null and b/olmo_data/hf_datasets/glue/sst2/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/glue/sst2/validation/dataset_info.json b/olmo_data/hf_datasets/glue/sst2/validation/dataset_info.json new file mode 100644 index 000000000..5e145145a --- /dev/null +++ b/olmo_data/hf_datasets/glue/sst2/validation/dataset_info.json @@ -0,0 +1,69 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "sst2", + "dataset_name": "glue", + "dataset_size": 5022007, + "description": "", + "download_checksums": { + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/sst2/train-00000-of-00001.parquet": { + "num_bytes": 3110468, + "checksum": null + }, + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/sst2/validation-00000-of-00001.parquet": { + "num_bytes": 72819, + "checksum": null + }, + "hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/sst2/test-00000-of-00001.parquet": { + "num_bytes": 
147793, + "checksum": null + } + }, + "download_size": 3331080, + "features": { + "sentence": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "names": [ + "negative", + "positive" + ], + "_type": "ClassLabel" + }, + "idx": { + "dtype": "int32", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 8353087, + "splits": { + "train": { + "name": "train", + "num_bytes": 4698441, + "num_examples": 67349, + "dataset_name": "glue" + }, + "validation": { + "name": "validation", + "num_bytes": 106470, + "num_examples": 872, + "dataset_name": "glue" + }, + "test": { + "name": "test", + "num_bytes": 217096, + "num_examples": 1821, + "dataset_name": "glue" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/glue/sst2/validation/state.json b/olmo_data/hf_datasets/glue/sst2/validation/state.json new file mode 100644 index 000000000..c4135cb46 --- /dev/null +++ b/olmo_data/hf_datasets/glue/sst2/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8d45d63f787290f0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..2f7dc66fe Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/dataset_info.json new file mode 100644 index 000000000..45e64a8cc --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "abstract_algebra", + "dataset_name": "mmlu_no_train", + "dataset_size": 24466, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166209426, + "splits": { + "test": { + "name": "test", + "num_bytes": 21316, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": 
"validation", + "num_bytes": 2232, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 918, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/state.json new file mode 100644 index 000000000..8085a89a6 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "20d155736fab0b1c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..3ff75076f Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/dataset_info.json new file mode 100644 index 000000000..45e64a8cc --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "abstract_algebra", + "dataset_name": "mmlu_no_train", + "dataset_size": 24466, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166209426, + "splits": { + "test": { + "name": "test", + "num_bytes": 21316, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2232, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 918, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/state.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/state.json new file mode 100644 index 000000000..ba23793fb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "dca2c38b239e3cda", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..41e0b00dd Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/dataset_info.json new file mode 100644 index 000000000..45e64a8cc --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "abstract_algebra", + "dataset_name": "mmlu_no_train", + "dataset_size": 24466, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166209426, + "splits": { + "test": { + "name": "test", + "num_bytes": 21316, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2232, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 918, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/state.json new file mode 100644 index 000000000..0f9157d87 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/abstract_algebra/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8c88bd434285a816", + "_format_columns": null, + 
"_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..405c8b95b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/dataset_info.json new file mode 100644 index 000000000..551e26d75 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "anatomy", + "dataset_name": "mmlu_no_train", + "dataset_size": 38886, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166223846, + "splits": { + "test": { + "name": "test", + "num_bytes": 34594, + "num_examples": 135, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3282, + "num_examples": 14, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1010, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/state.json new file mode 100644 index 000000000..53f3698f5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "17d827409c29fa1f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..8c3445c0b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/data-00000-of-00001.arrow differ diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/dataset_info.json new file mode 100644 index 000000000..551e26d75 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "anatomy", + "dataset_name": "mmlu_no_train", + "dataset_size": 38886, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166223846, + "splits": { + "test": { + "name": "test", + "num_bytes": 34594, + "num_examples": 135, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3282, + "num_examples": 14, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1010, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/state.json new file mode 100644 index 000000000..6aa49978d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8deee05ec40376c0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..ec6a03ea5 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/dataset_info.json new file mode 100644 index 000000000..551e26d75 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns 
and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "anatomy", + "dataset_name": "mmlu_no_train", + "dataset_size": 38886, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166223846, + "splits": { + "test": { + "name": "test", + "num_bytes": 34594, + "num_examples": 135, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3282, + "num_examples": 14, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1010, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/state.json new file mode 100644 index 000000000..a7a21393c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/anatomy/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ee364dfdfd373b7a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..80b81780e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/dataset_info.json new file mode 100644 index 000000000..0cabb53df --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "astronomy", + "dataset_name": "mmlu_no_train", + "dataset_size": 56087, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, 
law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166241047, + "splits": { + "test": { + "name": "test", + "num_bytes": 48735, + "num_examples": 152, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5223, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2129, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/state.json new file mode 100644 index 000000000..439e0a563 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "00f55ab2bf4be215", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..0b3975b02 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/dataset_info.json new file mode 100644 index 000000000..0cabb53df --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "astronomy", + "dataset_name": "mmlu_no_train", + "dataset_size": 56087, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": 
"ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166241047, + "splits": { + "test": { + "name": "test", + "num_bytes": 48735, + "num_examples": 152, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5223, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2129, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/state.json new file mode 100644 index 000000000..edadd3685 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "281cb9bd08be8d90", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..ace35ff02 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/dataset_info.json new file mode 100644 index 000000000..0cabb53df --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "astronomy", + "dataset_name": "mmlu_no_train", + "dataset_size": 56087, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166241047, + "splits": { + "test": { + "name": "test", + "num_bytes": 48735, + "num_examples": 152, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5223, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2129, + "num_examples": 5, + "dataset_name": 
"mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/state.json new file mode 100644 index 000000000..523591ed8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/astronomy/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "bd1f46d2378d0355", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..816144904 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/dataset_info.json new file mode 100644 index 000000000..6e6dbeb19 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "business_ethics", + "dataset_name": "mmlu_no_train", + "dataset_size": 40648, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166225608, + "splits": { + "test": { + "name": "test", + "num_bytes": 35140, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3235, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2273, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/state.json new file mode 100644 index 000000000..01a36421d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/dev/state.json @@ 
-0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "20224bbf8789ffdc", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..0da142c0d Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/dataset_info.json new file mode 100644 index 000000000..6e6dbeb19 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "business_ethics", + "dataset_name": "mmlu_no_train", + "dataset_size": 40648, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166225608, + "splits": { + "test": { + "name": "test", + "num_bytes": 35140, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3235, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2273, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/state.json new file mode 100644 index 000000000..2e72fd661 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "cbd085d70aa4bcb9", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/data-00000-of-00001.arrow 
b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..a49fcd901 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/dataset_info.json new file mode 100644 index 000000000..6e6dbeb19 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "business_ethics", + "dataset_name": "mmlu_no_train", + "dataset_size": 40648, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166225608, + "splits": { + "test": { + "name": "test", + "num_bytes": 35140, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3235, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2273, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/state.json new file mode 100644 index 000000000..402166ff9 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/business_ethics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "dc36aa2960570bcf", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..582e92d24 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/dataset_info.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/dataset_info.json new file mode 100644 index 000000000..6042fdda4 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "clinical_knowledge", + "dataset_name": "mmlu_no_train", + "dataset_size": 77170, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166262130, + "splits": { + "test": { + "name": "test", + "num_bytes": 68572, + "num_examples": 265, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7290, + "num_examples": 29, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1308, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/state.json new file mode 100644 index 000000000..4eb77dc28 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e060acf395f8ad0b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..2180dde4a Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/dataset_info.json new file mode 100644 index 000000000..6042fdda4 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and 
Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "clinical_knowledge", + "dataset_name": "mmlu_no_train", + "dataset_size": 77170, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166262130, + "splits": { + "test": { + "name": "test", + "num_bytes": 68572, + "num_examples": 265, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7290, + "num_examples": 29, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1308, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/state.json new file mode 100644 index 000000000..b429876e5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "88ca8e31425354e1", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..d3c65161e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/dataset_info.json new file mode 100644 index 000000000..6042fdda4 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "clinical_knowledge", + "dataset_name": "mmlu_no_train", + "dataset_size": 77170, + "description": "This is a massive multitask test consisting of 
multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166262130, + "splits": { + "test": { + "name": "test", + "num_bytes": 68572, + "num_examples": 265, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7290, + "num_examples": 29, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1308, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/state.json new file mode 100644 index 000000000..caef45471 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/clinical_knowledge/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "9cf681398904b421", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..8ad3c4e2a Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/dataset_info.json new file mode 100644 index 000000000..aac7bf2b5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_biology", + "dataset_name": "mmlu_no_train", + "dataset_size": 58247, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + 
"subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166243207, + "splits": { + "test": { + "name": "test", + "num_bytes": 51521, + "num_examples": 144, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5111, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1615, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/state.json new file mode 100644 index 000000000..318d415c1 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1aa3f8d419751552", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..ab54af082 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/dataset_info.json new file mode 100644 index 000000000..aac7bf2b5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_biology", + "dataset_name": "mmlu_no_train", + "dataset_size": 58247, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166243207, + "splits": { + "test": { + "name": "test", + "num_bytes": 51521, + "num_examples": 144, + 
"dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5111, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1615, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/state.json new file mode 100644 index 000000000..869089f51 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e4396961f6635a3a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..383e26c63 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/dataset_info.json new file mode 100644 index 000000000..aac7bf2b5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_biology", + "dataset_name": "mmlu_no_train", + "dataset_size": 58247, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166243207, + "splits": { + "test": { + "name": "test", + "num_bytes": 51521, + "num_examples": 144, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5111, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1615, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/state.json new file mode 100644 index 000000000..36432031a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_biology/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "76a7259023a114a3", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..7c7aaaed5 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/dataset_info.json new file mode 100644 index 000000000..1420d0b78 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_chemistry", + "dataset_name": "mmlu_no_train", + "dataset_size": 30704, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166215664, + "splits": { + "test": { + "name": "test", + "num_bytes": 26796, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2484, + "num_examples": 8, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1424, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/state.json new file mode 100644 index 000000000..9fb7ccc3f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": 
"0c78504a82387e31", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..e25e396fa Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/dataset_info.json new file mode 100644 index 000000000..1420d0b78 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_chemistry", + "dataset_name": "mmlu_no_train", + "dataset_size": 30704, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166215664, + "splits": { + "test": { + "name": "test", + "num_bytes": 26796, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2484, + "num_examples": 8, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1424, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/state.json new file mode 100644 index 000000000..a46ac6344 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1983845a412bd613", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/data-00000-of-00001.arrow new file mode 100644 index 
000000000..4ac55079e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/dataset_info.json new file mode 100644 index 000000000..1420d0b78 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_chemistry", + "dataset_name": "mmlu_no_train", + "dataset_size": 30704, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166215664, + "splits": { + "test": { + "name": "test", + "num_bytes": 26796, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2484, + "num_examples": 8, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1424, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/state.json new file mode 100644 index 000000000..9b9c90b72 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_chemistry/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c04908b072442d3d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..cf3836db9 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/dataset_info.json new file mode 100644 
index 000000000..2ee23d6cd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_computer_science", + "dataset_name": "mmlu_no_train", + "dataset_size": 53281, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166238241, + "splits": { + "test": { + "name": "test", + "num_bytes": 45429, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4959, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2893, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/state.json new file mode 100644 index 000000000..7d0457f8c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a37cb419aa1dee47", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..812c92ab3 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/dataset_info.json new file mode 100644 index 000000000..2ee23d6cd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy 
Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_computer_science", + "dataset_name": "mmlu_no_train", + "dataset_size": 53281, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166238241, + "splits": { + "test": { + "name": "test", + "num_bytes": 45429, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4959, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2893, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/state.json new file mode 100644 index 000000000..ed83d5e94 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "002abde3a5352321", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..80bc02cdc Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/dataset_info.json new file mode 100644 index 000000000..2ee23d6cd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_computer_science", + "dataset_name": "mmlu_no_train", + "dataset_size": 53281, + "description": "This is a massive multitask 
test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166238241, + "splits": { + "test": { + "name": "test", + "num_bytes": 45429, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4959, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2893, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/state.json new file mode 100644 index 000000000..3e4203fd1 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_computer_science/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "948e41ce3a58bfdd", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..5bab031bb Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/dataset_info.json new file mode 100644 index 000000000..5fdd5f1d7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_mathematics", + "dataset_name": "mmlu_no_train", + "dataset_size": 31504, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": 
{ + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166216464, + "splits": { + "test": { + "name": "test", + "num_bytes": 26999, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2909, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1596, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/state.json new file mode 100644 index 000000000..6dc1264aa --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "222fe21e229a3b00", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..5e20ad18e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/dataset_info.json new file mode 100644 index 000000000..5fdd5f1d7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_mathematics", + "dataset_name": "mmlu_no_train", + "dataset_size": 31504, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166216464, 
+ "splits": { + "test": { + "name": "test", + "num_bytes": 26999, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2909, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1596, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/state.json new file mode 100644 index 000000000..56759c80f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "55305f09ac0c7084", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..4c0fc77d0 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/dataset_info.json new file mode 100644 index 000000000..5fdd5f1d7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_mathematics", + "dataset_name": "mmlu_no_train", + "dataset_size": 31504, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166216464, + "splits": { + "test": { + "name": "test", + "num_bytes": 26999, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2909, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1596, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + 
"version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/state.json new file mode 100644 index 000000000..266f3aa69 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_mathematics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4de7c965c7cf800b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..2971f285d Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/dataset_info.json new file mode 100644 index 000000000..627ff0363 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_medicine", + "dataset_name": "mmlu_no_train", + "dataset_size": 95940, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166280900, + "splits": { + "test": { + "name": "test", + "num_bytes": 85845, + "num_examples": 173, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 8337, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1758, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/state.json new file mode 100644 index 000000000..584ee7c86 --- /dev/null +++ 
b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b1126a770a5c857f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..dd3b4728b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/dataset_info.json new file mode 100644 index 000000000..627ff0363 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_medicine", + "dataset_name": "mmlu_no_train", + "dataset_size": 95940, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166280900, + "splits": { + "test": { + "name": "test", + "num_bytes": 85845, + "num_examples": 173, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 8337, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1758, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/state.json new file mode 100644 index 000000000..2cdb5dfb5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c18eeec0949b5285", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..baeb8696d Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/dataset_info.json new file mode 100644 index 000000000..627ff0363 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_medicine", + "dataset_name": "mmlu_no_train", + "dataset_size": 95940, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166280900, + "splits": { + "test": { + "name": "test", + "num_bytes": 85845, + "num_examples": 173, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 8337, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1758, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/state.json new file mode 100644 index 000000000..53e57bca2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_medicine/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ad9e7753bff0ef4c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..412ac350e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/data-00000-of-00001.arrow differ diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/dataset_info.json new file mode 100644 index 000000000..7666ee7a8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_physics", + "dataset_name": "mmlu_no_train", + "dataset_size": 37289, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166222249, + "splits": { + "test": { + "name": "test", + "num_bytes": 32107, + "num_examples": 102, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3687, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1495, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/state.json new file mode 100644 index 000000000..f53903cab --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "832fced8b6b15e40", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..6307d606f Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/dataset_info.json new file mode 100644 index 000000000..7666ee7a8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask 
Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_physics", + "dataset_name": "mmlu_no_train", + "dataset_size": 37289, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166222249, + "splits": { + "test": { + "name": "test", + "num_bytes": 32107, + "num_examples": 102, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3687, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1495, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/state.json new file mode 100644 index 000000000..71f0b5322 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1339dc91a06c2cdb", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..eb320fbeb Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/dataset_info.json new file mode 100644 index 000000000..7666ee7a8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "college_physics", + "dataset_name": "mmlu_no_train", + "dataset_size": 37289, + "description": "This is a massive multitask test 
consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166222249, + "splits": { + "test": { + "name": "test", + "num_bytes": 32107, + "num_examples": 102, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3687, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1495, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/state.json new file mode 100644 index 000000000..095832d79 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/college_physics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "59cdc11245c08838", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..3631399a7 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/dataset_info.json new file mode 100644 index 000000000..d69a642f7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "computer_security", + "dataset_name": "mmlu_no_train", + "dataset_size": 35174, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + 
"_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166220134, + "splits": { + "test": { + "name": "test", + "num_bytes": 29212, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4768, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1194, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/state.json new file mode 100644 index 000000000..2be9053e5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4ff795eaafd7eecd", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..b3c1b89b0 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/dataset_info.json new file mode 100644 index 000000000..d69a642f7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "computer_security", + "dataset_name": "mmlu_no_train", + "dataset_size": 35174, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166220134, + "splits": { + "test": { + "name": "test", + "num_bytes": 
29212, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4768, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1194, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/state.json new file mode 100644 index 000000000..8917ff087 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "62fee97de2c29e76", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..dae1ae264 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/dataset_info.json new file mode 100644 index 000000000..d69a642f7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "computer_security", + "dataset_name": "mmlu_no_train", + "dataset_size": 35174, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166220134, + "splits": { + "test": { + "name": "test", + "num_bytes": 29212, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4768, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1194, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} 
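
The file triple repeated throughout these hunks (`data-00000-of-00001.arrow`, `dataset_info.json`, `state.json`) is the standard layout written by `datasets.save_to_disk`, so each `<config>/<split>` directory added here can be read back as a single split with `datasets.load_from_disk`. Below is a minimal sketch of such a loader; the helper name `load_packaged_mmlu` and the assumption that `olmo_data` is installed as regular (non-zipped) package files are illustrative, not the repository's own API.

```python
# Sketch: read one packaged MMLU split back from the save_to_disk layout above.
# Assumes the `olmo_data` package is importable from the filesystem and the
# `datasets` library is installed; `config`/`split` mirror the directory
# structure in this diff (e.g. computer_security/validation).
from importlib.resources import files

from datasets import load_from_disk


def load_packaged_mmlu(config: str, split: str):
    # Each <config>/<split> directory holds the triple seen in these hunks:
    # data-*.arrow, dataset_info.json, and state.json.
    split_dir = files("olmo_data") / "hf_datasets" / "hails" / "mmlu_no_train" / config / split
    return load_from_disk(str(split_dir))


ds = load_packaged_mmlu("computer_security", "validation")
print(len(ds))  # 11 examples, per the split metadata above
row = ds[0]
# `answer` is a ClassLabel over ["A", "B", "C", "D"], so indices map to letters.
print(row["question"], ds.features["answer"].int2str(row["answer"]))
```

Note that `state.json` pins the Arrow shard list, the split name, and a `_fingerprint` that `datasets` uses as the dataset's cache identity, which is why every split directory in these hunks ships all three files alongside the binary Arrow data.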
\ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/state.json new file mode 100644 index 000000000..ce68af870 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/computer_security/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "47e3149326ffca1a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..58c2abd9f Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/dataset_info.json new file mode 100644 index 000000000..44995d30a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "conceptual_physics", + "dataset_name": "mmlu_no_train", + "dataset_size": 51933, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166236893, + "splits": { + "test": { + "name": "test", + "num_bytes": 45867, + "num_examples": 235, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5034, + "num_examples": 26, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1032, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/state.json new file mode 100644 index 000000000..976e647c9 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": 
"data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b761ecd40d39a1ad", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..f37dda1bc Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/dataset_info.json new file mode 100644 index 000000000..44995d30a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "conceptual_physics", + "dataset_name": "mmlu_no_train", + "dataset_size": 51933, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166236893, + "splits": { + "test": { + "name": "test", + "num_bytes": 45867, + "num_examples": 235, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5034, + "num_examples": 26, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1032, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/state.json new file mode 100644 index 000000000..596df67cc --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a3bda514c3d926ee", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/data-00000-of-00001.arrow 
b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..3e624fa93 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/dataset_info.json new file mode 100644 index 000000000..44995d30a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "conceptual_physics", + "dataset_name": "mmlu_no_train", + "dataset_size": 51933, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166236893, + "splits": { + "test": { + "name": "test", + "num_bytes": 45867, + "num_examples": 235, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5034, + "num_examples": 26, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1032, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/state.json new file mode 100644 index 000000000..2fab56d5b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/conceptual_physics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2f1b2a7b5d4287db", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..559b613cd Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/dataset_info.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/dataset_info.json new file mode 100644 index 000000000..a06db14eb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "econometrics", + "dataset_name": "mmlu_no_train", + "dataset_size": 55218, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166240178, + "splits": { + "test": { + "name": "test", + "num_bytes": 48359, + "num_examples": 114, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5147, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1712, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/state.json new file mode 100644 index 000000000..3fe22c208 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "aa482dc1a13ee3ef", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..59e816624 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/dataset_info.json new file mode 100644 index 000000000..a06db14eb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn 
Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "econometrics", + "dataset_name": "mmlu_no_train", + "dataset_size": 55218, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166240178, + "splits": { + "test": { + "name": "test", + "num_bytes": 48359, + "num_examples": 114, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5147, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1712, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/state.json new file mode 100644 index 000000000..b03f97156 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a1d106cced570179", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..81611f2be Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/dataset_info.json new file mode 100644 index 000000000..a06db14eb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "econometrics", + "dataset_name": "mmlu_no_train", + "dataset_size": 55218, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, 
law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166240178, + "splits": { + "test": { + "name": "test", + "num_bytes": 48359, + "num_examples": 114, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5147, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1712, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/state.json new file mode 100644 index 000000000..d8048c172 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/econometrics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c19f619be72337ae", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..f05b62899 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/dataset_info.json new file mode 100644 index 000000000..ca1aac906 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "electrical_engineering", + "dataset_name": "mmlu_no_train", + "dataset_size": 33297, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + 
"_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166218257, + "splits": { + "test": { + "name": "test", + "num_bytes": 28900, + "num_examples": 145, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3307, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1090, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/state.json new file mode 100644 index 000000000..059c785d2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "584ac4a847a12ab3", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..3d87bc7bd Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/dataset_info.json new file mode 100644 index 000000000..ca1aac906 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "electrical_engineering", + "dataset_name": "mmlu_no_train", + "dataset_size": 33297, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166218257, + "splits": { + "test": { + "name": "test", + "num_bytes": 28900, + "num_examples": 145, + "dataset_name": "mmlu_no_train" + }, + "validation": { 
+ "name": "validation", + "num_bytes": 3307, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1090, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/state.json new file mode 100644 index 000000000..3b0bd713f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c03b70084f477eb9", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..15614dc47 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/dataset_info.json new file mode 100644 index 000000000..ca1aac906 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "electrical_engineering", + "dataset_name": "mmlu_no_train", + "dataset_size": 33297, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166218257, + "splits": { + "test": { + "name": "test", + "num_bytes": 28900, + "num_examples": 145, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3307, + "num_examples": 16, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1090, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff 
--git a/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/state.json new file mode 100644 index 000000000..eb8a54b71 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/electrical_engineering/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1a49fa6022f56e82", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..fbea5f9b2 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/dataset_info.json new file mode 100644 index 000000000..0b905cb3d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "elementary_mathematics", + "dataset_name": "mmlu_no_train", + "dataset_size": 91524, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166276484, + "splits": { + "test": { + "name": "test", + "num_bytes": 79924, + "num_examples": 378, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10042, + "num_examples": 41, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1558, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/state.json new file mode 100644 index 000000000..2172d97c7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { 
+ "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1d7c3cb041ed7523", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..e9ff174b4 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/dataset_info.json new file mode 100644 index 000000000..0b905cb3d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "elementary_mathematics", + "dataset_name": "mmlu_no_train", + "dataset_size": 91524, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166276484, + "splits": { + "test": { + "name": "test", + "num_bytes": 79924, + "num_examples": 378, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10042, + "num_examples": 41, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1558, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/state.json new file mode 100644 index 000000000..ba52c9141 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "799f512b4ea42b80", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/data-00000-of-00001.arrow 
b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..d63de83db Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/dataset_info.json new file mode 100644 index 000000000..0b905cb3d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "elementary_mathematics", + "dataset_name": "mmlu_no_train", + "dataset_size": 91524, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166276484, + "splits": { + "test": { + "name": "test", + "num_bytes": 79924, + "num_examples": 378, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10042, + "num_examples": 41, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1558, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/state.json new file mode 100644 index 000000000..cb912a4ce --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/elementary_mathematics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a3acaaf5ad71135e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..8a7ed0e1c Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/data-00000-of-00001.arrow differ diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/dataset_info.json new file mode 100644 index 000000000..98ea63fcd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "formal_logic", + "dataset_name": "mmlu_no_train", + "dataset_size": 60078, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166245038, + "splits": { + "test": { + "name": "test", + "num_bytes": 51789, + "num_examples": 126, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 6464, + "num_examples": 14, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1825, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/state.json new file mode 100644 index 000000000..fb2669be0 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e5094a5c5c98f9bc", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..76bc5ceb9 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/dataset_info.json new file mode 100644 index 000000000..98ea63fcd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan 
Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "formal_logic", + "dataset_name": "mmlu_no_train", + "dataset_size": 60078, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166245038, + "splits": { + "test": { + "name": "test", + "num_bytes": 51789, + "num_examples": 126, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 6464, + "num_examples": 14, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1825, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/state.json new file mode 100644 index 000000000..edadc7131 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2447d143a40d1c18", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..235c29a2b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/dataset_info.json new file mode 100644 index 000000000..98ea63fcd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "formal_logic", + "dataset_name": "mmlu_no_train", + "dataset_size": 60078, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of 
knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166245038, + "splits": { + "test": { + "name": "test", + "num_bytes": 51789, + "num_examples": 126, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 6464, + "num_examples": 14, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1825, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/state.json new file mode 100644 index 000000000..0066485d2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/formal_logic/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c65a8ac7d0915655", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..6883c5211 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/dataset_info.json new file mode 100644 index 000000000..cf52e606c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "global_facts", + "dataset_name": "mmlu_no_train", + "dataset_size": 23301, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + 
"dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166208261, + "splits": { + "test": { + "name": "test", + "num_bytes": 19991, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2013, + "num_examples": 10, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1297, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/state.json new file mode 100644 index 000000000..1170ab22c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8774d8c6d5d3b4ee", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..15a350209 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/dataset_info.json new file mode 100644 index 000000000..cf52e606c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "global_facts", + "dataset_name": "mmlu_no_train", + "dataset_size": 23301, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166208261, + "splits": { + "test": { + "name": "test", + "num_bytes": 19991, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2013, + "num_examples": 10, + 
"dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1297, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/state.json new file mode 100644 index 000000000..ca66dac54 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "06fa75c0f85b61da", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..93a9ff9f4 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/dataset_info.json new file mode 100644 index 000000000..cf52e606c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "global_facts", + "dataset_name": "mmlu_no_train", + "dataset_size": 23301, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166208261, + "splits": { + "test": { + "name": "test", + "num_bytes": 19991, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2013, + "num_examples": 10, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1297, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/state.json new file 
mode 100644 index 000000000..eb7ba3cf8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/global_facts/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c716d4b511d690ff", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..31e3d0320 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/dataset_info.json new file mode 100644 index 000000000..5a5e9410c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_biology", + "dataset_name": "mmlu_no_train", + "dataset_size": 130372, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166315332, + "splits": { + "test": { + "name": "test", + "num_bytes": 116850, + "num_examples": 310, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 11746, + "num_examples": 32, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1776, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/state.json new file mode 100644 index 000000000..c31be6b00 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "0633c1d57fc000b7", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline 
at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..91b61bfb8 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/dataset_info.json new file mode 100644 index 000000000..5a5e9410c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_biology", + "dataset_name": "mmlu_no_train", + "dataset_size": 130372, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166315332, + "splits": { + "test": { + "name": "test", + "num_bytes": 116850, + "num_examples": 310, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 11746, + "num_examples": 32, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1776, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/state.json new file mode 100644 index 000000000..7caae05d6 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "cdb5912f765d19c2", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..00412c0a1 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/data-00000-of-00001.arrow 
differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/dataset_info.json new file mode 100644 index 000000000..5a5e9410c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_biology", + "dataset_name": "mmlu_no_train", + "dataset_size": 130372, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166315332, + "splits": { + "test": { + "name": "test", + "num_bytes": 116850, + "num_examples": 310, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 11746, + "num_examples": 32, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1776, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/state.json new file mode 100644 index 000000000..7bdd248c5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_biology/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ebfe4711fb7fabe7", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..3fba6e9f8 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/dataset_info.json new file mode 100644 index 000000000..d702c26e7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/dataset_info.json @@ -0,0 +1,70 
@@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_chemistry", + "dataset_name": "mmlu_no_train", + "dataset_size": 72490, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166257450, + "splits": { + "test": { + "name": "test", + "num_bytes": 63527, + "num_examples": 203, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7630, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1333, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/state.json new file mode 100644 index 000000000..cc327084c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "19d61b0b1c456d15", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..e36fdba24 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/dataset_info.json new file mode 100644 index 000000000..d702c26e7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + 
"config_name": "high_school_chemistry", + "dataset_name": "mmlu_no_train", + "dataset_size": 72490, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166257450, + "splits": { + "test": { + "name": "test", + "num_bytes": 63527, + "num_examples": 203, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7630, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1333, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/state.json new file mode 100644 index 000000000..3eea5952b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b41be291d0efe099", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..7087e1964 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/dataset_info.json new file mode 100644 index 000000000..d702c26e7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_chemistry", + "dataset_name": "mmlu_no_train", + "dataset_size": 72490, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { 
+ "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166257450, + "splits": { + "test": { + "name": "test", + "num_bytes": 63527, + "num_examples": 203, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7630, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1333, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/state.json new file mode 100644 index 000000000..43552e110 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_chemistry/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f62f91b339bf9346", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..e4ffcbf90 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/dataset_info.json new file mode 100644 index 000000000..2934531d7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_computer_science", + "dataset_name": "mmlu_no_train", + "dataset_size": 54349, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + 
"dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166239309, + "splits": { + "test": { + "name": "test", + "num_bytes": 47664, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3619, + "num_examples": 9, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 3066, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/state.json new file mode 100644 index 000000000..61775baaf --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "236843273a34aee0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..e19089ed8 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/dataset_info.json new file mode 100644 index 000000000..2934531d7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_computer_science", + "dataset_name": "mmlu_no_train", + "dataset_size": 54349, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166239309, + "splits": { + "test": { + "name": "test", + "num_bytes": 47664, 
+ "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3619, + "num_examples": 9, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 3066, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/state.json new file mode 100644 index 000000000..26a3ece30 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "aa1d78b34a77e5bb", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..9bc53a453 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/dataset_info.json new file mode 100644 index 000000000..2934531d7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_computer_science", + "dataset_name": "mmlu_no_train", + "dataset_size": 54349, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166239309, + "splits": { + "test": { + "name": "test", + "num_bytes": 47664, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3619, + "num_examples": 9, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 3066, + "num_examples": 5, + "dataset_name": 
"mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/state.json new file mode 100644 index 000000000..233a948df --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_computer_science/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b5b58ee6995bb47d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..5eca78e8a Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/dataset_info.json new file mode 100644 index 000000000..768a40237 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_european_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 317476, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166502436, + "splits": { + "test": { + "name": "test", + "num_bytes": 275568, + "num_examples": 165, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 30196, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 11712, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/state.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/state.json
new file mode 100644
index 000000000..85987039a
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "1d994bb11ac3254f",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..2b8014b91
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/dataset_info.json
new file mode 100644
index 000000000..768a40237
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_european_history",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 317476,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166502436,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 275568,
+      "num_examples": 165,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 30196,
+      "num_examples": 18,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 11712,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/state.json
new file mode 100644
index 000000000..c9475c0c2
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "480449749cb2b082",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..b323089b1
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/dataset_info.json
new file mode 100644
index 000000000..768a40237
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_european_history",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 317476,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166502436,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 275568,
+      "num_examples": 165,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 30196,
+      "num_examples": 18,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 11712,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/state.json
new file mode 100644
index 000000000..713aca008
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_european_history/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "e56fc4122593f632",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..7dc14abb0
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/dataset_info.json
new file mode 100644
index 000000000..c98f94b80
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_geography",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 53358,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166238318,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 46972,
+      "num_examples": 198,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 4870,
+      "num_examples": 22,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1516,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/state.json
new file mode 100644
index 000000000..a713b1068
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "20511542e3283886",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..7a6f55f13
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/dataset_info.json
new file mode 100644
index 000000000..c98f94b80
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_geography",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 53358,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166238318,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 46972,
+      "num_examples": 198,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 4870,
+      "num_examples": 22,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1516,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/state.json
new file mode 100644
index 000000000..7c6594208
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "d1be97d315bdf95d",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..c94be6db5
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/dataset_info.json
new file mode 100644
index 000000000..c98f94b80
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_geography",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 53358,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166238318,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 46972,
+      "num_examples": 198,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 4870,
+      "num_examples": 22,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1516,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/state.json
new file mode 100644
index 000000000..05b1ae467
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_geography/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "f77ad34af6fc51f3",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..5a01dbdc3
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/dataset_info.json
new file mode 100644
index 000000000..92f08e2a1
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_government_and_politics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 83421,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166268381,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 73589,
+      "num_examples": 193,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 7870,
+      "num_examples": 21,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1962,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/state.json
new file mode 100644
index 000000000..0a4b94f05
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "c1659513735417f2",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..8f9b38c11
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/dataset_info.json
new file mode 100644
index 000000000..92f08e2a1
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_government_and_politics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 83421,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166268381,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 73589,
+      "num_examples": 193,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 7870,
+      "num_examples": 21,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1962,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/state.json
new file mode 100644
index 000000000..9e18b30c5
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "abfcd2ea63940fc5",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..b4ac2379b
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/dataset_info.json
new file mode 100644
index 000000000..92f08e2a1
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_government_and_politics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 83421,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166268381,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 73589,
+      "num_examples": 193,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 7870,
+      "num_examples": 21,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1962,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/state.json
new file mode 100644
index 000000000..0eab5b412
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_government_and_politics/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "c09f4d859db6fa3e",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..e24d7f68e
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/dataset_info.json
new file mode 100644
index 000000000..350570fd7
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_macroeconomics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 145139,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166330099,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 129375,
+      "num_examples": 390,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 14298,
+      "num_examples": 43,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1466,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/state.json
new file mode 100644
index 000000000..667e0900b
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "a3be1fcf374ada6a",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..f61e820b3
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/dataset_info.json
new file mode 100644
index 000000000..350570fd7
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_macroeconomics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 145139,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166330099,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 129375,
+      "num_examples": 390,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 14298,
+      "num_examples": 43,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1466,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/state.json
new file mode 100644
index 000000000..560b0dac6
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "5171a01e77504546",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..92d0f89f1
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/dataset_info.json
new file mode 100644
index 000000000..350570fd7
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_macroeconomics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 145139,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166330099,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 129375,
+      "num_examples": 390,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 14298,
+      "num_examples": 43,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1466,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/state.json
new file mode 100644
index 000000000..78db3b711
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_macroeconomics/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "1fc263055941b33f",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..a1ab8e8e7
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/dataset_info.json
new file mode 100644
index 000000000..171328c5c
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_mathematics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 70088,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166255048,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 62132,
+      "num_examples": 270,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 6536,
+      "num_examples": 29,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1420,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/state.json
new file mode 100644
index 000000000..cfbbb0028
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "fdba997591f7572a",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..fb117459e
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/dataset_info.json
new file mode 100644
index 000000000..171328c5c
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_mathematics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 70088,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166255048,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 62132,
+      "num_examples": 270,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 6536,
+      "num_examples": 29,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1420,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/state.json
new file mode 100644
index 000000000..84fb747ad
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "f6d1438629210208",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..fa1784f62
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/dataset_info.json
new file mode 100644
index 000000000..171328c5c
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_mathematics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 70088,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166255048,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 62132,
+      "num_examples": 270,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 6536,
+      "num_examples": 29,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1420,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/state.json
new file mode 100644
index 000000000..37f38b85e
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_mathematics/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "e5186d0211426a05",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..cedefc667
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/dataset_info.json
new file mode 100644
index 000000000..215cbafcd
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_microeconomics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 92588,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166277548,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 82831,
+      "num_examples": 238,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 8321,
+      "num_examples": 26,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1436,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/state.json
new file mode 100644
index 000000000..5b1e443e4
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "51a244e59f751242",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..0e539903c
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/dataset_info.json
new file mode 100644
index 000000000..215cbafcd
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_microeconomics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 92588,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166277548,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 82831,
+      "num_examples": 238,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 8321,
+      "num_examples": 26,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1436,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/state.json
new file mode 100644
index 000000000..e1f3f56e0
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "4cd44cc8b1d07e02",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..90f758278
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/dataset_info.json
new file mode 100644
index 000000000..215cbafcd
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_microeconomics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 92588,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166277548,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 82831,
+      "num_examples": 238,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 8321,
+      "num_examples": 26,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1436,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/state.json
new file mode 100644
index 000000000..8906ef1d1
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_microeconomics/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "05bdabc98bdfc467",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..23610971c
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/dataset_info.json
new file mode 100644
index 000000000..ab002da50
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_physics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 71741,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166256701,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 62999,
+      "num_examples": 151,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 7150,
+      "num_examples": 17,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1592,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/state.json
new file mode 100644
index 000000000..7f258e68c
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "f6283c98f76a7943",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..9db9e55a3
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/dataset_info.json
new file mode 100644
index 000000000..ab002da50
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_physics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 71741,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166256701,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 62999,
+      "num_examples": 151,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 7150,
+      "num_examples": 17,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1592,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/state.json
new file mode 100644
index 000000000..07fd4336e
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/test/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "252b40f4b19b762a",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..3b6e71725
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/dataset_info.json
new file mode 100644
index 000000000..ab002da50
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_physics",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 71741,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166256701,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 62999,
+      "num_examples": 151,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 7150,
+      "num_examples": 17,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 1592,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/state.json
new file mode 100644
index 000000000..b4cb622b2
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_physics/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "98d630de2600c9cf",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/data-00000-of-00001.arrow
new file mode 100644
index 000000000..6567a6b23
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/dataset_info.json
new file mode 100644
index 000000000..d97309735
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_psychology",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 194405,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166379365,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 173565,
+      "num_examples": 545,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 18817,
+      "num_examples": 60,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 2023,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/state.json
new file mode 100644
index 000000000..1ce26eac9
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/dev/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "338f7d249a4397fb",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "dev"
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/data-00000-of-00001.arrow
new file mode 100644
index 000000000..ca6fd18b1
Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/dataset_info.json
new file mode 100644
index 000000000..d97309735
--- /dev/null
+++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/dataset_info.json
@@ -0,0 +1,70 @@
+{
+  "builder_name": "mmlu_no_train",
+  "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n",
+  "config_name": "high_school_psychology",
+  "dataset_name": "mmlu_no_train",
+  "dataset_size": 194405,
+  "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": {
+      "num_bytes": 166184960,
+      "checksum": null
+    }
+  },
+  "download_size": 166184960,
+  "features": {
+    "question": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "subject": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "choices": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "answer": {
+      "names": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ],
+      "_type": "ClassLabel"
+    }
+  },
+  "homepage": "https://github.com/hendrycks/test",
+  "license": "",
+  "size_in_bytes": 166379365,
+  "splits": {
+    "test": {
+      "name": "test",
+      "num_bytes": 173565,
+      "num_examples": 545,
+      "dataset_name": "mmlu_no_train"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 18817,
+      "num_examples": 60,
+      "dataset_name": "mmlu_no_train"
+    },
+    "dev": {
+      "name": "dev",
+      "num_bytes": 2023,
+      "num_examples": 5,
+      "dataset_name": "mmlu_no_train"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/state.json
new
file mode 100644 index 000000000..0a3f639c5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7161e40d2b654853", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..e1b1051e2 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/dataset_info.json new file mode 100644 index 000000000..d97309735 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_psychology", + "dataset_name": "mmlu_no_train", + "dataset_size": 194405, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166379365, + "splits": { + "test": { + "name": "test", + "num_bytes": 173565, + "num_examples": 545, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 18817, + "num_examples": 60, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2023, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/state.json new file mode 100644 index 000000000..2ac1ea399 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_psychology/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a4e5c8c8c774ae30", + "_format_columns": null, + "_format_kwargs": 
{}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..fab795535 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/dataset_info.json new file mode 100644 index 000000000..d9d70e92b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_statistics", + "dataset_name": "mmlu_no_train", + "dataset_size": 129535, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166314495, + "splits": { + "test": { + "name": "test", + "num_bytes": 116306, + "num_examples": 216, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10583, + "num_examples": 23, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2646, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/state.json new file mode 100644 index 000000000..dabb89741 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "cccb6ff5e82160fa", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..f79458047 Binary files 
/dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/dataset_info.json new file mode 100644 index 000000000..d9d70e92b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_statistics", + "dataset_name": "mmlu_no_train", + "dataset_size": 129535, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166314495, + "splits": { + "test": { + "name": "test", + "num_bytes": 116306, + "num_examples": 216, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10583, + "num_examples": 23, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2646, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/state.json new file mode 100644 index 000000000..39c1a45a2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a94dcfe93aa4586f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..5133f35c1 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/dataset_info.json new file mode 100644 index 
000000000..d9d70e92b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_statistics", + "dataset_name": "mmlu_no_train", + "dataset_size": 129535, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166314495, + "splits": { + "test": { + "name": "test", + "num_bytes": 116306, + "num_examples": 216, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10583, + "num_examples": 23, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2646, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/state.json new file mode 100644 index 000000000..fb16a1fdf --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_statistics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "9ac32f17f19685b8", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..612408167 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/dataset_info.json new file mode 100644 index 000000000..48971b97a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and 
Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_us_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 343274, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166528234, + "splits": { + "test": { + "name": "test", + "num_bytes": 302026, + "num_examples": 204, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 32266, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 8982, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/state.json new file mode 100644 index 000000000..79e238264 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e85379a01e115a76", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..4773eda8b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/dataset_info.json new file mode 100644 index 000000000..48971b97a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_us_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 343274, + "description": "This is a massive multitask test consisting of multiple-choice questions from 
various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166528234, + "splits": { + "test": { + "name": "test", + "num_bytes": 302026, + "num_examples": 204, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 32266, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 8982, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/state.json new file mode 100644 index 000000000..6d8f6cc32 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "43ddac66f666fa05", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..db0359634 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/dataset_info.json new file mode 100644 index 000000000..48971b97a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_us_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 343274, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + 
"question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166528234, + "splits": { + "test": { + "name": "test", + "num_bytes": 302026, + "num_examples": 204, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 32266, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 8982, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/state.json new file mode 100644 index 000000000..f56033390 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_us_history/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "59df1cbcbcd89698", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..9a85df20f Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/dataset_info.json new file mode 100644 index 000000000..0339f4bd8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_world_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 436736, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": 
"https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166621696, + "splits": { + "test": { + "name": "test", + "num_bytes": 385478, + "num_examples": 237, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 46243, + "num_examples": 26, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 5015, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/state.json new file mode 100644 index 000000000..c3dac9d39 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2a41b781a0f61fd9", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..a1288c0fa Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/dataset_info.json new file mode 100644 index 000000000..0339f4bd8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_world_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 436736, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166621696, + "splits": { + "test": { + "name": "test", + "num_bytes": 385478, + "num_examples": 237, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 46243, + "num_examples": 26, + "dataset_name": "mmlu_no_train" + }, + "dev": { + 
"name": "dev", + "num_bytes": 5015, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/state.json new file mode 100644 index 000000000..7da835b2c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "5aec070596710b81", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..52f9abf56 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/dataset_info.json new file mode 100644 index 000000000..0339f4bd8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "high_school_world_history", + "dataset_name": "mmlu_no_train", + "dataset_size": 436736, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166621696, + "splits": { + "test": { + "name": "test", + "num_bytes": 385478, + "num_examples": 237, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 46243, + "num_examples": 26, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 5015, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/state.json new file mode 100644 index 000000000..3558e96d8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/high_school_world_history/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e3c9582fbfb93d8e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..e9b338037 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/dataset_info.json new file mode 100644 index 000000000..857737c5d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "human_aging", + "dataset_name": "mmlu_no_train", + "dataset_size": 55542, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166240502, + "splits": { + "test": { + "name": "test", + "num_bytes": 49431, + "num_examples": 223, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5040, + "num_examples": 23, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1071, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/state.json new file mode 100644 index 000000000..508c9bed9 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ba7bcdadc5b28a0d", + 
"_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..20559f533 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/dataset_info.json new file mode 100644 index 000000000..857737c5d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "human_aging", + "dataset_name": "mmlu_no_train", + "dataset_size": 55542, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166240502, + "splits": { + "test": { + "name": "test", + "num_bytes": 49431, + "num_examples": 223, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5040, + "num_examples": 23, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1071, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/state.json new file mode 100644 index 000000000..8c59c5ca7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "324ed4c42296e339", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..466bca518 Binary files /dev/null and 
b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/dataset_info.json new file mode 100644 index 000000000..857737c5d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "human_aging", + "dataset_name": "mmlu_no_train", + "dataset_size": 55542, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166240502, + "splits": { + "test": { + "name": "test", + "num_bytes": 49431, + "num_examples": 223, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5040, + "num_examples": 23, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1071, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/state.json new file mode 100644 index 000000000..1cf186690 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_aging/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "80bccb979e65c0fa", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..32aa21f67 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/dataset_info.json new file mode 100644 index 000000000..43345a6f0 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/dataset_info.json @@ -0,0 +1,70 @@ 
+{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "human_sexuality", + "dataset_name": "mmlu_no_train", + "dataset_size": 38384, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166223344, + "splits": { + "test": { + "name": "test", + "num_bytes": 34587, + "num_examples": 131, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2637, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1160, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/state.json new file mode 100644 index 000000000..cafa4394f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "157b386122007fa0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..45df6c57d Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/dataset_info.json new file mode 100644 index 000000000..43345a6f0 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "human_sexuality", + "dataset_name": 
"mmlu_no_train", + "dataset_size": 38384, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166223344, + "splits": { + "test": { + "name": "test", + "num_bytes": 34587, + "num_examples": 131, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2637, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1160, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/state.json new file mode 100644 index 000000000..aad6787b8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2b3f4eb92b47e593", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..b7917fa34 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/dataset_info.json new file mode 100644 index 000000000..43345a6f0 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "human_sexuality", + "dataset_name": "mmlu_no_train", + "dataset_size": 38384, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + 
} + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166223344, + "splits": { + "test": { + "name": "test", + "num_bytes": 34587, + "num_examples": 131, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 2637, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1160, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/state.json new file mode 100644 index 000000000..189fcc2b1 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/human_sexuality/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a4c606a4dc176d44", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..3bcb9eb04 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/dataset_info.json new file mode 100644 index 000000000..24e82046f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "international_law", + "dataset_name": "mmlu_no_train", + "dataset_size": 65305, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", 
+ "license": "", + "size_in_bytes": 166250265, + "splits": { + "test": { + "name": "test", + "num_bytes": 56060, + "num_examples": 121, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 6734, + "num_examples": 13, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2511, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/state.json new file mode 100644 index 000000000..b0739923e --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c71c68ae6d322ecd", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..665fe0e7c Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/dataset_info.json new file mode 100644 index 000000000..24e82046f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "international_law", + "dataset_name": "mmlu_no_train", + "dataset_size": 65305, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166250265, + "splits": { + "test": { + "name": "test", + "num_bytes": 56060, + "num_examples": 121, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 6734, + "num_examples": 13, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2511, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + 
"version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/state.json new file mode 100644 index 000000000..e9ab58ec3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a2c2cba37acd51da", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..4936e4195 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/dataset_info.json new file mode 100644 index 000000000..24e82046f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "international_law", + "dataset_name": "mmlu_no_train", + "dataset_size": 65305, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166250265, + "splits": { + "test": { + "name": "test", + "num_bytes": 56060, + "num_examples": 121, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 6734, + "num_examples": 13, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2511, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/state.json new file mode 100644 index 000000000..eb8890df4 --- /dev/null +++ 
b/olmo_data/hf_datasets/hails/mmlu_no_train/international_law/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "be245d1755c42498", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..0ecc8da56 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/dataset_info.json new file mode 100644 index 000000000..77be19d1b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "jurisprudence", + "dataset_name": "mmlu_no_train", + "dataset_size": 41090, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166226050, + "splits": { + "test": { + "name": "test", + "num_bytes": 35810, + "num_examples": 108, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3904, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1376, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/state.json new file mode 100644 index 000000000..e3c0cacc4 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "be97b2b406b8981c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/data-00000-of-00001.arrow 
b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..5c5bc10f0 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/dataset_info.json new file mode 100644 index 000000000..77be19d1b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "jurisprudence", + "dataset_name": "mmlu_no_train", + "dataset_size": 41090, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166226050, + "splits": { + "test": { + "name": "test", + "num_bytes": 35810, + "num_examples": 108, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3904, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1376, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/state.json new file mode 100644 index 000000000..dd855ce57 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "71bc643a4a388f69", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..da204b09e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/dataset_info.json new 
file mode 100644 index 000000000..77be19d1b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "jurisprudence", + "dataset_name": "mmlu_no_train", + "dataset_size": 41090, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166226050, + "splits": { + "test": { + "name": "test", + "num_bytes": 35810, + "num_examples": 108, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3904, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1376, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/state.json new file mode 100644 index 000000000..02959ac80 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/jurisprudence/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "27b2d03775694505", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..d36f1afa5 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/dataset_info.json new file mode 100644 index 000000000..dd6768532 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob 
Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "logical_fallacies", + "dataset_name": "mmlu_no_train", + "dataset_size": 60663, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166245623, + "splits": { + "test": { + "name": "test", + "num_bytes": 53528, + "num_examples": 163, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5469, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1666, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/state.json new file mode 100644 index 000000000..578c7b456 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a5f621079a9e5be8", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..7565f7be8 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/dataset_info.json new file mode 100644 index 000000000..dd6768532 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "logical_fallacies", + "dataset_name": "mmlu_no_train", + "dataset_size": 60663, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, 
law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166245623, + "splits": { + "test": { + "name": "test", + "num_bytes": 53528, + "num_examples": 163, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5469, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1666, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/state.json new file mode 100644 index 000000000..4b1c96094 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "5ee5a18bf21e4bd3", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..e59f2333f Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/dataset_info.json new file mode 100644 index 000000000..dd6768532 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "logical_fallacies", + "dataset_name": "mmlu_no_train", + "dataset_size": 60663, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + 
"_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166245623, + "splits": { + "test": { + "name": "test", + "num_bytes": 53528, + "num_examples": 163, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5469, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1666, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/state.json new file mode 100644 index 000000000..e02855fb2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/logical_fallacies/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7c7cf856e53aac3c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..451dad5e5 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/dataset_info.json new file mode 100644 index 000000000..0226f8a42 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "machine_learning", + "dataset_name": "mmlu_no_train", + "dataset_size": 41959, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166226919, + "splits": { + "test": { + "name": "test", + "num_bytes": 36108, + "num_examples": 112, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + 
"num_bytes": 3440, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2411, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/state.json new file mode 100644 index 000000000..d41a9b386 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b196e16b4e56f533", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..06d9d2ccd Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/dataset_info.json new file mode 100644 index 000000000..0226f8a42 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "machine_learning", + "dataset_name": "mmlu_no_train", + "dataset_size": 41959, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166226919, + "splits": { + "test": { + "name": "test", + "num_bytes": 36108, + "num_examples": 112, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3440, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2411, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/state.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/state.json new file mode 100644 index 000000000..99eada0d1 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a6b3032f1a46a701", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..339b60d40 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/dataset_info.json new file mode 100644 index 000000000..0226f8a42 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "machine_learning", + "dataset_name": "mmlu_no_train", + "dataset_size": 41959, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166226919, + "splits": { + "test": { + "name": "test", + "num_bytes": 36108, + "num_examples": 112, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3440, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2411, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/state.json new file mode 100644 index 000000000..2a79d8335 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/machine_learning/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "caa69f7c960d1451", + "_format_columns": null, + 
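"_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file

Each `<config>/<split>` directory added in this diff follows the layout that `datasets.save_to_disk` produces: one or more `.arrow` shards plus `state.json` and `dataset_info.json`. As a minimal sketch of how such a packaged split can be read back (this is illustrative only, not the repo's actual loader, which is not shown in this diff; it assumes `olmo_data` is installed as regular files on disk):

```python
import importlib.resources

from datasets import load_from_disk

# Locate the packaged split inside olmo_data. load_from_disk reads the
# save_to_disk layout (arrow shard + state.json + dataset_info.json)
# directly, with no network access or HF hub lookup.
split_dir = importlib.resources.files("olmo_data").joinpath(
    "hf_datasets/hails/mmlu_no_train/machine_learning/validation"
)
ds = load_from_disk(str(split_dir))

print(len(ds))                      # 11 examples, per dataset_info.json above
print(ds.features["answer"].names)  # ['A', 'B', 'C', 'D'] (ClassLabel)
```
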
"_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..a788242fe Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/dataset_info.json new file mode 100644 index 000000000..8547a82bd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "management", + "dataset_name": "mmlu_no_train", + "dataset_size": 24350, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166209310, + "splits": { + "test": { + "name": "test", + "num_bytes": 21432, + "num_examples": 103, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 1962, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 956, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/state.json new file mode 100644 index 000000000..64f99ffbb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/management/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "0e43db3884e9aff1", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..09f42c26d Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/data-00000-of-00001.arrow differ diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/dataset_info.json new file mode 100644 index 000000000..8547a82bd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "management", + "dataset_name": "mmlu_no_train", + "dataset_size": 24350, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166209310, + "splits": { + "test": { + "name": "test", + "num_bytes": 21432, + "num_examples": 103, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 1962, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 956, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/state.json new file mode 100644 index 000000000..0d134791e --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/management/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "633a2ba199495783", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..7b08f88c4 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/dataset_info.json new file mode 100644 index 000000000..8547a82bd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n 
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "management", + "dataset_name": "mmlu_no_train", + "dataset_size": 24350, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166209310, + "splits": { + "test": { + "name": "test", + "num_bytes": 21432, + "num_examples": 103, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 1962, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 956, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/state.json new file mode 100644 index 000000000..1759a190d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/management/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f18577dd2efaa2d2", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..bc1293fa8 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/dataset_info.json new file mode 100644 index 000000000..154c7070c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "marketing", + "dataset_name": "mmlu_no_train", + "dataset_size": 75296, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including 
elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166260256, + "splits": { + "test": { + "name": "test", + "num_bytes": 66055, + "num_examples": 234, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7707, + "num_examples": 25, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1534, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/state.json new file mode 100644 index 000000000..75cfa6cf3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b2d8494f8ed11222", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..06e1affd6 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/dataset_info.json new file mode 100644 index 000000000..154c7070c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "marketing", + "dataset_name": "mmlu_no_train", + "dataset_size": 75296, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + 
"names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166260256, + "splits": { + "test": { + "name": "test", + "num_bytes": 66055, + "num_examples": 234, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7707, + "num_examples": 25, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1534, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/state.json new file mode 100644 index 000000000..0fa23559d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f00b001eb0cfd167", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..cc5ce153c Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/dataset_info.json new file mode 100644 index 000000000..154c7070c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "marketing", + "dataset_name": "mmlu_no_train", + "dataset_size": 75296, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166260256, + "splits": { + "test": { + "name": "test", + "num_bytes": 66055, + "num_examples": 234, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7707, + "num_examples": 25, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1534, 
+ "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/state.json new file mode 100644 index 000000000..e82964386 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/marketing/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "bde873ad8a86829e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..1c2fab4a7 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/dataset_info.json new file mode 100644 index 000000000..e6235110f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "medical_genetics", + "dataset_name": "mmlu_no_train", + "dataset_size": 27242, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166212202, + "splits": { + "test": { + "name": "test", + "num_bytes": 22852, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3213, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1177, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/state.json new file mode 100644 index 000000000..3c3acc041 --- /dev/null +++ 
b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d0d851bff1fdfe0b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..d9dbc5e37 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/dataset_info.json new file mode 100644 index 000000000..e6235110f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "medical_genetics", + "dataset_name": "mmlu_no_train", + "dataset_size": 27242, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166212202, + "splits": { + "test": { + "name": "test", + "num_bytes": 22852, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3213, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1177, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/state.json new file mode 100644 index 000000000..727f856b3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f2a4fa595b2dcd1f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..906ecad29 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/dataset_info.json new file mode 100644 index 000000000..e6235110f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "medical_genetics", + "dataset_name": "mmlu_no_train", + "dataset_size": 27242, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166212202, + "splits": { + "test": { + "name": "test", + "num_bytes": 22852, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3213, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1177, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/state.json new file mode 100644 index 000000000..e04887206 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/medical_genetics/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "af71610776e81294", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..f04e72720 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/data-00000-of-00001.arrow differ diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/dataset_info.json new file mode 100644 index 000000000..dedba9719 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "miscellaneous", + "dataset_name": "mmlu_no_train", + "dataset_size": 177555, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166362515, + "splits": { + "test": { + "name": "test", + "num_bytes": 161003, + "num_examples": 783, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 15780, + "num_examples": 86, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 772, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/state.json new file mode 100644 index 000000000..66a12c573 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1d5ccb1485700225", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..10b24cea7 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/dataset_info.json new file mode 100644 index 000000000..dedba9719 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n 
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "miscellaneous", + "dataset_name": "mmlu_no_train", + "dataset_size": 177555, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166362515, + "splits": { + "test": { + "name": "test", + "num_bytes": 161003, + "num_examples": 783, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 15780, + "num_examples": 86, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 772, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/state.json new file mode 100644 index 000000000..0a5fe8e4b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f9de05202963fc75", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..f0b81b687 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/dataset_info.json new file mode 100644 index 000000000..dedba9719 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "miscellaneous", + "dataset_name": "mmlu_no_train", + "dataset_size": 177555, + "description": "This is a massive multitask test consisting of multiple-choice questions from 
various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166362515, + "splits": { + "test": { + "name": "test", + "num_bytes": 161003, + "num_examples": 783, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 15780, + "num_examples": 86, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 772, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/state.json new file mode 100644 index 000000000..f0204fbad --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/miscellaneous/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "71a067f0b1b293b9", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..b385de016 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/dataset_info.json new file mode 100644 index 000000000..08270f123 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "moral_disputes", + "dataset_name": "mmlu_no_train", + "dataset_size": 128959, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": 
"Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166313919, + "splits": { + "test": { + "name": "test", + "num_bytes": 114034, + "num_examples": 346, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 13092, + "num_examples": 38, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1833, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/state.json new file mode 100644 index 000000000..94286b3bd --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "05f8d323418290df", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..642c29859 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/dataset_info.json new file mode 100644 index 000000000..08270f123 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "moral_disputes", + "dataset_name": "mmlu_no_train", + "dataset_size": 128959, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166313919, + "splits": { + "test": { + "name": "test", + "num_bytes": 114034, + "num_examples": 346, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": 
"validation", + "num_bytes": 13092, + "num_examples": 38, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1833, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/state.json new file mode 100644 index 000000000..e57275108 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "74135db3d6d37f16", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..e0593ec41 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/dataset_info.json new file mode 100644 index 000000000..08270f123 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "moral_disputes", + "dataset_name": "mmlu_no_train", + "dataset_size": 128959, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166313919, + "splits": { + "test": { + "name": "test", + "num_bytes": 114034, + "num_examples": 346, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 13092, + "num_examples": 38, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1833, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/state.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/state.json new file mode 100644 index 000000000..47013fcc2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_disputes/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b19652aa9e4aa075", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..011a26242 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/dataset_info.json new file mode 100644 index 000000000..89192f276 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "moral_scenarios", + "dataset_name": "mmlu_no_train", + "dataset_size": 437386, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166622346, + "splits": { + "test": { + "name": "test", + "num_bytes": 391019, + "num_examples": 895, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 44226, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2141, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/state.json new file mode 100644 index 000000000..3ed26a3a7 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ee4a7fb876009da2", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + 
"_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..13042e4cd Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/dataset_info.json new file mode 100644 index 000000000..89192f276 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "moral_scenarios", + "dataset_name": "mmlu_no_train", + "dataset_size": 437386, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166622346, + "splits": { + "test": { + "name": "test", + "num_bytes": 391019, + "num_examples": 895, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 44226, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2141, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/state.json new file mode 100644 index 000000000..fb32e8756 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e6c87ce7884335d3", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..4d95fea39 Binary files /dev/null and 
b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/dataset_info.json new file mode 100644 index 000000000..89192f276 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "moral_scenarios", + "dataset_name": "mmlu_no_train", + "dataset_size": 437386, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166622346, + "splits": { + "test": { + "name": "test", + "num_bytes": 391019, + "num_examples": 895, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 44226, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2141, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/state.json new file mode 100644 index 000000000..bf5baf654 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/moral_scenarios/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7248069621edcad0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..1781c5be7 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/dataset_info.json new file mode 100644 index 000000000..7a952ea6a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/dataset_info.json @@ -0,0 +1,70 @@ 
+{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "nutrition", + "dataset_name": "mmlu_no_train", + "dataset_size": 107367, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166292327, + "splits": { + "test": { + "name": "test", + "num_bytes": 96376, + "num_examples": 306, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 8853, + "num_examples": 33, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2138, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/state.json new file mode 100644 index 000000000..898344ead --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2a885d2d82ab7d56", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..8024e87ff Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/dataset_info.json new file mode 100644 index 000000000..7a952ea6a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "nutrition", + "dataset_name": "mmlu_no_train", + "dataset_size": 107367, + "description": "This is a massive 
multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166292327, + "splits": { + "test": { + "name": "test", + "num_bytes": 96376, + "num_examples": 306, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 8853, + "num_examples": 33, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2138, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/state.json new file mode 100644 index 000000000..953d91866 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e8cb6b6ae4881443", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..07fb20433 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/dataset_info.json new file mode 100644 index 000000000..7a952ea6a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "nutrition", + "dataset_name": "mmlu_no_train", + "dataset_size": 107367, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + 
"dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166292327, + "splits": { + "test": { + "name": "test", + "num_bytes": 96376, + "num_examples": 306, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 8853, + "num_examples": 33, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2138, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/state.json new file mode 100644 index 000000000..9fb755c16 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/nutrition/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1815496614ea9299", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..47fd518af Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/dataset_info.json new file mode 100644 index 000000000..f913fe885 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "philosophy", + "dataset_name": "mmlu_no_train", + "dataset_size": 95109, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166280069, + "splits": { + "test": { + "name": "test", + "num_bytes": 84415, + "num_examples": 311, + "dataset_name": "mmlu_no_train" + }, + "validation": { + 
"name": "validation", + "num_bytes": 9648, + "num_examples": 34, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1046, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/state.json new file mode 100644 index 000000000..92e00cf4b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "189278999a273b54", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..ad614a54b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/dataset_info.json new file mode 100644 index 000000000..f913fe885 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "philosophy", + "dataset_name": "mmlu_no_train", + "dataset_size": 95109, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166280069, + "splits": { + "test": { + "name": "test", + "num_bytes": 84415, + "num_examples": 311, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 9648, + "num_examples": 34, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1046, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/state.json new file mode 
100644 index 000000000..3c66583d3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7483bf4f3ab0737c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..69a29d539 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/dataset_info.json new file mode 100644 index 000000000..f913fe885 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "philosophy", + "dataset_name": "mmlu_no_train", + "dataset_size": 95109, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166280069, + "splits": { + "test": { + "name": "test", + "num_bytes": 84415, + "num_examples": 311, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 9648, + "num_examples": 34, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1046, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/state.json new file mode 100644 index 000000000..5d3cf0088 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/philosophy/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "108a3a0b6531be26", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git 
a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..a6118f7ab Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/dataset_info.json new file mode 100644 index 000000000..e0a293964 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "prehistory", + "dataset_name": "mmlu_no_train", + "dataset_size": 106817, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166291777, + "splits": { + "test": { + "name": "test", + "num_bytes": 94118, + "num_examples": 324, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10763, + "num_examples": 35, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1936, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/state.json new file mode 100644 index 000000000..3814972ad --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4b2d5e67642ce059", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..9019d8b53 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/dataset_info.json 
new file mode 100644 index 000000000..e0a293964 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "prehistory", + "dataset_name": "mmlu_no_train", + "dataset_size": 106817, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166291777, + "splits": { + "test": { + "name": "test", + "num_bytes": 94118, + "num_examples": 324, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10763, + "num_examples": 35, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1936, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/state.json new file mode 100644 index 000000000..c43863570 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b772f2a98fbdbb69", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..f6f82463f Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/dataset_info.json new file mode 100644 index 000000000..e0a293964 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the 
International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "prehistory", + "dataset_name": "mmlu_no_train", + "dataset_size": 106817, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166291777, + "splits": { + "test": { + "name": "test", + "num_bytes": 94118, + "num_examples": 324, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 10763, + "num_examples": 35, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1936, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/state.json new file mode 100644 index 000000000..00bba37c6 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/prehistory/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "81331e81e5b40812", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..529774d75 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/dataset_info.json new file mode 100644 index 000000000..a522ef00d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_accounting", + "dataset_name": "mmlu_no_train", + "dataset_size": 149620, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, 
and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166334580, + "splits": { + "test": { + "name": "test", + "num_bytes": 132152, + "num_examples": 282, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 15197, + "num_examples": 31, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2271, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/state.json new file mode 100644 index 000000000..772e95de3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7a37bb73c230072b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..fd46fae7a Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/dataset_info.json new file mode 100644 index 000000000..a522ef00d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_accounting", + "dataset_name": "mmlu_no_train", + "dataset_size": 149620, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + 
"dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166334580, + "splits": { + "test": { + "name": "test", + "num_bytes": 132152, + "num_examples": 282, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 15197, + "num_examples": 31, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2271, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/state.json new file mode 100644 index 000000000..a99dbd38a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "60bce76fbb51330e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..a16928a68 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/dataset_info.json new file mode 100644 index 000000000..a522ef00d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_accounting", + "dataset_name": "mmlu_no_train", + "dataset_size": 149620, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166334580, + "splits": { + "test": { + "name": "test", + "num_bytes": 132152, + 
"num_examples": 282, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 15197, + "num_examples": 31, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2271, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/state.json new file mode 100644 index 000000000..473c99862 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_accounting/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "49887e665a8c5ed4", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..dcb2da04e Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/dataset_info.json new file mode 100644 index 000000000..19542c4b3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_law", + "dataset_name": "mmlu_no_train", + "dataset_size": 2136035, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 168320995, + "splits": { + "test": { + "name": "test", + "num_bytes": 1922430, + "num_examples": 1534, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 206907, + "num_examples": 170, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 6698, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No 
newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/state.json new file mode 100644 index 000000000..bb8a97a21 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "71dee5b440d08bac", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..c07a89655 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/dataset_info.json new file mode 100644 index 000000000..19542c4b3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_law", + "dataset_name": "mmlu_no_train", + "dataset_size": 2136035, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 168320995, + "splits": { + "test": { + "name": "test", + "num_bytes": 1922430, + "num_examples": 1534, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 206907, + "num_examples": 170, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 6698, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/state.json new file mode 100644 index 000000000..7a0fb0c9a --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + 
"_fingerprint": "3258b12090812d7c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..18d4eccb4 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/dataset_info.json new file mode 100644 index 000000000..19542c4b3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_law", + "dataset_name": "mmlu_no_train", + "dataset_size": 2136035, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 168320995, + "splits": { + "test": { + "name": "test", + "num_bytes": 1922430, + "num_examples": 1534, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 206907, + "num_examples": 170, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 6698, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/state.json new file mode 100644 index 000000000..98df05f65 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_law/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "26cd1d1c01ea63c8", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/data-00000-of-00001.arrow 
b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..b1dc86239 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/dataset_info.json new file mode 100644 index 000000000..5414e0c55 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_medicine", + "dataset_name": "mmlu_no_train", + "dataset_size": 252879, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166437839, + "splits": { + "test": { + "name": "test", + "num_bytes": 224349, + "num_examples": 272, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 24610, + "num_examples": 31, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 3920, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/state.json new file mode 100644 index 000000000..ad0e06997 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "872fb834fa470cf8", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..992fd2e28 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/dataset_info.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/dataset_info.json new file mode 100644 index 000000000..5414e0c55 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_medicine", + "dataset_name": "mmlu_no_train", + "dataset_size": 252879, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166437839, + "splits": { + "test": { + "name": "test", + "num_bytes": 224349, + "num_examples": 272, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 24610, + "num_examples": 31, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 3920, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/state.json new file mode 100644 index 000000000..edcc94714 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d635efcb40b3db20", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..05d43c4a2 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/dataset_info.json new file mode 100644 index 000000000..5414e0c55 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n 
title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_medicine", + "dataset_name": "mmlu_no_train", + "dataset_size": 252879, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166437839, + "splits": { + "test": { + "name": "test", + "num_bytes": 224349, + "num_examples": 272, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 24610, + "num_examples": 31, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 3920, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/state.json new file mode 100644 index 000000000..62e8d7188 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_medicine/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "800738ede9c5344a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..d017b5abb Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/dataset_info.json new file mode 100644 index 000000000..5aa9b280d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_psychology", + 
"dataset_name": "mmlu_no_train", + "dataset_size": 275753, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166460713, + "splits": { + "test": { + "name": "test", + "num_bytes": 242411, + "num_examples": 612, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 30952, + "num_examples": 69, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2390, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/state.json new file mode 100644 index 000000000..9ed325f8f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7956992fec89d96a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..38f98a036 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/dataset_info.json new file mode 100644 index 000000000..5aa9b280d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_psychology", + "dataset_name": "mmlu_no_train", + "dataset_size": 275753, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + 
"https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166460713, + "splits": { + "test": { + "name": "test", + "num_bytes": 242411, + "num_examples": 612, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 30952, + "num_examples": 69, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2390, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/state.json new file mode 100644 index 000000000..2c0c4b0ce --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1fe2948ecfe8efbe", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..8f335d550 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/dataset_info.json new file mode 100644 index 000000000..5aa9b280d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "professional_psychology", + "dataset_name": "mmlu_no_train", + "dataset_size": 275753, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": 
"string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166460713, + "splits": { + "test": { + "name": "test", + "num_bytes": 242411, + "num_examples": 612, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 30952, + "num_examples": 69, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 2390, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/state.json new file mode 100644 index 000000000..f1799fba2 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/professional_psychology/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4c1c6203ffd8e681", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..b453b7da6 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/dataset_info.json new file mode 100644 index 000000000..52455b558 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "public_relations", + "dataset_name": "mmlu_no_train", + "dataset_size": 37326, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166222286, + "splits": { + "test": { + "name": "test", + "num_bytes": 30948, + "num_examples": 110, + "dataset_name": "mmlu_no_train" + }, + "validation": { + 
"name": "validation", + "num_bytes": 4794, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1584, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/state.json new file mode 100644 index 000000000..bceb74d58 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8ea2239c27f9682a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..3c76c47b1 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/dataset_info.json new file mode 100644 index 000000000..52455b558 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "public_relations", + "dataset_name": "mmlu_no_train", + "dataset_size": 37326, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166222286, + "splits": { + "test": { + "name": "test", + "num_bytes": 30948, + "num_examples": 110, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4794, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1584, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/state.json 
b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/state.json new file mode 100644 index 000000000..07f4ddb6d --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a4029ee86f9a4d47", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..f9e5cd739 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/dataset_info.json new file mode 100644 index 000000000..52455b558 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "public_relations", + "dataset_name": "mmlu_no_train", + "dataset_size": 37326, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166222286, + "splits": { + "test": { + "name": "test", + "num_bytes": 30948, + "num_examples": 110, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 4794, + "num_examples": 12, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1584, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/state.json new file mode 100644 index 000000000..9a465c79c --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/public_relations/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "04ecaee7c26d1413", + "_format_columns": null, + 
"_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..244a3773b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/dataset_info.json new file mode 100644 index 000000000..df7be81c1 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "security_studies", + "dataset_name": "mmlu_no_train", + "dataset_size": 238320, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166423280, + "splits": { + "test": { + "name": "test", + "num_bytes": 209732, + "num_examples": 245, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 23165, + "num_examples": 27, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 5423, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/state.json new file mode 100644 index 000000000..889fec986 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8b9eda9017d8114c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..037e87ea9 Binary files /dev/null and 
b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/dataset_info.json new file mode 100644 index 000000000..df7be81c1 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "security_studies", + "dataset_name": "mmlu_no_train", + "dataset_size": 238320, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166423280, + "splits": { + "test": { + "name": "test", + "num_bytes": 209732, + "num_examples": 245, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 23165, + "num_examples": 27, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 5423, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/state.json new file mode 100644 index 000000000..72c356331 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f1c9d865f880ba6f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..5c62c3ee9 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/dataset_info.json new file mode 100644 index 000000000..df7be81c1 --- /dev/null +++ 
b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "security_studies", + "dataset_name": "mmlu_no_train", + "dataset_size": 238320, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166423280, + "splits": { + "test": { + "name": "test", + "num_bytes": 209732, + "num_examples": 245, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 23165, + "num_examples": 27, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 5423, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/state.json new file mode 100644 index 000000000..1379ab938 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/security_studies/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8488581c295e3561", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..14218401d Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/dataset_info.json new file mode 100644 index 000000000..930c057eb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations 
(ICLR)},\n year={2021}\n }\n", + "config_name": "sociology", + "dataset_name": "mmlu_no_train", + "dataset_size": 77968, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166262928, + "splits": { + "test": { + "name": "test", + "num_bytes": 68844, + "num_examples": 201, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7458, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1666, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/state.json new file mode 100644 index 000000000..27d53645e --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b6dfd1c680db404d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..da014d7e4 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/dataset_info.json new file mode 100644 index 000000000..930c057eb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "sociology", + "dataset_name": "mmlu_no_train", + "dataset_size": 77968, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + 
"download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166262928, + "splits": { + "test": { + "name": "test", + "num_bytes": 68844, + "num_examples": 201, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7458, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1666, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/state.json new file mode 100644 index 000000000..13f4d48f3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "445e5f0b6dc402d5", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..c6126a332 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/dataset_info.json new file mode 100644 index 000000000..930c057eb --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "sociology", + "dataset_name": "mmlu_no_train", + "dataset_size": 77968, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166262928, + "splits": { + 
"test": { + "name": "test", + "num_bytes": 68844, + "num_examples": 201, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 7458, + "num_examples": 22, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1666, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/state.json new file mode 100644 index 000000000..9b3189b39 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/sociology/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "92b9ea67f8f9d7c1", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..0140f9cce Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/dataset_info.json new file mode 100644 index 000000000..200439bf8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "us_foreign_policy", + "dataset_name": "mmlu_no_train", + "dataset_size": 35718, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166220678, + "splits": { + "test": { + "name": "test", + "num_bytes": 30531, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3483, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1704, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } 
+} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/state.json new file mode 100644 index 000000000..88d7b978f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a0a530b6c3c1ce09", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..ed9a76267 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/dataset_info.json new file mode 100644 index 000000000..200439bf8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "us_foreign_policy", + "dataset_name": "mmlu_no_train", + "dataset_size": 35718, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166220678, + "splits": { + "test": { + "name": "test", + "num_bytes": 30531, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3483, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1704, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/state.json new file mode 100644 index 000000000..a3b2d3d14 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + 
], + "_fingerprint": "060dd16f3358895b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..f12d68c5b Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/dataset_info.json new file mode 100644 index 000000000..200439bf8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "us_foreign_policy", + "dataset_name": "mmlu_no_train", + "dataset_size": 35718, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166220678, + "splits": { + "test": { + "name": "test", + "num_bytes": 30531, + "num_examples": 100, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3483, + "num_examples": 11, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1704, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/state.json new file mode 100644 index 000000000..575b19a25 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/us_foreign_policy/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "090d4a120ad1b821", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/data-00000-of-00001.arrow 
b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..99ac7222a Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/dataset_info.json new file mode 100644 index 000000000..e0d73ab9b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "virology", + "dataset_name": "mmlu_no_train", + "dataset_size": 47550, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166232510, + "splits": { + "test": { + "name": "test", + "num_bytes": 40739, + "num_examples": 166, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5667, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1144, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/state.json new file mode 100644 index 000000000..c8f333c71 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "6a93076594896b3e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..767d2ace0 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/dataset_info.json new file mode 100644 index 000000000..e0d73ab9b --- /dev/null +++ 
b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "virology", + "dataset_name": "mmlu_no_train", + "dataset_size": 47550, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166232510, + "splits": { + "test": { + "name": "test", + "num_bytes": 40739, + "num_examples": 166, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5667, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1144, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/state.json new file mode 100644 index 000000000..c127fb1e3 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c09188a0a273216a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..5d01e8a26 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/dataset_info.json new file mode 100644 index 000000000..e0d73ab9b --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + 
"config_name": "virology", + "dataset_name": "mmlu_no_train", + "dataset_size": 47550, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166232510, + "splits": { + "test": { + "name": "test", + "num_bytes": 40739, + "num_examples": 166, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 5667, + "num_examples": 18, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 1144, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/state.json new file mode 100644 index 000000000..d962e50a0 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/virology/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "5fed6e3919aa025f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/data-00000-of-00001.arrow new file mode 100644 index 000000000..4fce96213 Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/dataset_info.json new file mode 100644 index 000000000..9871184f5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "world_religions", + "dataset_name": "mmlu_no_train", + "dataset_size": 32378, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": 
null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166217338, + "splits": { + "test": { + "name": "test", + "num_bytes": 28511, + "num_examples": 171, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3114, + "num_examples": 19, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 753, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/state.json new file mode 100644 index 000000000..0308a6aef --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/dev/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "86cf0e1681022422", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "dev" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/data-00000-of-00001.arrow new file mode 100644 index 000000000..4a6c13e0c Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/dataset_info.json new file mode 100644 index 000000000..9871184f5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "world_religions", + "dataset_name": "mmlu_no_train", + "dataset_size": 32378, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + 
"size_in_bytes": 166217338, + "splits": { + "test": { + "name": "test", + "num_bytes": 28511, + "num_examples": 171, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3114, + "num_examples": 19, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 753, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/state.json new file mode 100644 index 000000000..344a3ef0f --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "187c421635b19f5d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "test" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..4dbbe469a Binary files /dev/null and b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/dataset_info.json b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/dataset_info.json new file mode 100644 index 000000000..9871184f5 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/dataset_info.json @@ -0,0 +1,70 @@ +{ + "builder_name": "mmlu_no_train", + "citation": "@article{hendryckstest2021,\n title={Measuring Massive Multitask Language Understanding},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n }\n", + "config_name": "world_religions", + "dataset_name": "mmlu_no_train", + "dataset_size": 32378, + "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.\n", + "download_checksums": { + "https://huggingface.co/datasets/cais/mmlu/resolve/main/data.tar": { + "num_bytes": 166184960, + "checksum": null + } + }, + "download_size": 166184960, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "subject": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "names": [ + "A", + "B", + "C", + "D" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/hendrycks/test", + "license": "", + "size_in_bytes": 166217338, + "splits": { + "test": { + "name": "test", + "num_bytes": 28511, + "num_examples": 171, + "dataset_name": "mmlu_no_train" + }, + "validation": { + "name": "validation", + "num_bytes": 3114, + "num_examples": 19, + "dataset_name": "mmlu_no_train" + }, + "dev": { + "name": "dev", + "num_bytes": 753, + "num_examples": 5, + "dataset_name": "mmlu_no_train" + } + }, + "version": { + 
"version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/state.json b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/state.json new file mode 100644 index 000000000..e6a72d2b8 --- /dev/null +++ b/olmo_data/hf_datasets/hails/mmlu_no_train/world_religions/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1f94975f68796ebf", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hellaswag/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/hellaswag/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..882ec4aed Binary files /dev/null and b/olmo_data/hf_datasets/hellaswag/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/hellaswag/none/validation/dataset_info.json b/olmo_data/hf_datasets/hellaswag/none/validation/dataset_info.json new file mode 100644 index 000000000..486e88bac --- /dev/null +++ b/olmo_data/hf_datasets/hellaswag/none/validation/dataset_info.json @@ -0,0 +1,97 @@ +{ + "builder_name": "hellaswag", + "citation": "@inproceedings{zellers2019hellaswag,\n title={HellaSwag: Can a Machine Really Finish Your Sentence?},\n author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},\n booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},\n year={2019}\n}\n", + "config_name": "default", + "dataset_name": "hellaswag", + "dataset_size": 65200194, + "description": "\nHellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. 
A paper was published at ACL2019.\n", + "download_checksums": { + "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl": { + "num_bytes": 47496131, + "checksum": null + }, + "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl": { + "num_bytes": 11752147, + "checksum": null + }, + "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl": { + "num_bytes": 12246618, + "checksum": null + } + }, + "download_size": 71494896, + "features": { + "ind": { + "dtype": "int32", + "_type": "Value" + }, + "activity_label": { + "dtype": "string", + "_type": "Value" + }, + "ctx_a": { + "dtype": "string", + "_type": "Value" + }, + "ctx_b": { + "dtype": "string", + "_type": "Value" + }, + "ctx": { + "dtype": "string", + "_type": "Value" + }, + "endings": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "source_id": { + "dtype": "string", + "_type": "Value" + }, + "split": { + "dtype": "string", + "_type": "Value" + }, + "split_type": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://rowanzellers.com/hellaswag/", + "license": "", + "size_in_bytes": 136695090, + "splits": { + "train": { + "name": "train", + "num_bytes": 43232624, + "num_examples": 39905, + "dataset_name": "hellaswag" + }, + "test": { + "name": "test", + "num_bytes": 10791853, + "num_examples": 10003, + "dataset_name": "hellaswag" + }, + "validation": { + "name": "validation", + "num_bytes": 11175717, + "num_examples": 10042, + "dataset_name": "hellaswag" + } + }, + "version": { + "version_str": "0.1.0", + "major": 0, + "minor": 1, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/hellaswag/none/validation/state.json b/olmo_data/hf_datasets/hellaswag/none/validation/state.json new file mode 100644 index 000000000..7f14e4245 --- /dev/null +++ b/olmo_data/hf_datasets/hellaswag/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "9042f0834e9b2f00", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/nq_open/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/nq_open/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..d44073aa6 Binary files /dev/null and b/olmo_data/hf_datasets/nq_open/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/nq_open/none/validation/dataset_info.json b/olmo_data/hf_datasets/nq_open/none/validation/dataset_info.json new file mode 100644 index 000000000..668266537 --- /dev/null +++ b/olmo_data/hf_datasets/nq_open/none/validation/dataset_info.json @@ -0,0 +1,55 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "nq_open", + "dataset_name": "nq_open", + "dataset_size": 6965065, + "description": "", + "download_checksums": { + "hf://datasets/nq_open@5dd9790a83002ad084ddeb7c420dc716852c6f28/nq_open/train-00000-of-00001.parquet": { + "num_bytes": 4464642, + "checksum": null + }, + "hf://datasets/nq_open@5dd9790a83002ad084ddeb7c420dc716852c6f28/nq_open/validation-00000-of-00001.parquet": { + "num_bytes": 213603, + "checksum": null + } + }, + "download_size": 4678245, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + 
"feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 11643310, + "splits": { + "train": { + "name": "train", + "num_bytes": 6651236, + "num_examples": 87925, + "dataset_name": "nq_open" + }, + "validation": { + "name": "validation", + "num_bytes": 313829, + "num_examples": 3610, + "dataset_name": "nq_open" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/nq_open/none/validation/state.json b/olmo_data/hf_datasets/nq_open/none/validation/state.json new file mode 100644 index 000000000..674de2c40 --- /dev/null +++ b/olmo_data/hf_datasets/nq_open/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8f40661efe3d6e1c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/openbookqa/main/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/openbookqa/main/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..38bbf1b62 Binary files /dev/null and b/olmo_data/hf_datasets/openbookqa/main/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/openbookqa/main/validation/dataset_info.json b/olmo_data/hf_datasets/openbookqa/main/validation/dataset_info.json new file mode 100644 index 000000000..bc69850cc --- /dev/null +++ b/olmo_data/hf_datasets/openbookqa/main/validation/dataset_info.json @@ -0,0 +1,79 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "main", + "dataset_name": "openbookqa", + "dataset_size": 1082573, + "description": "", + "download_checksums": { + "hf://datasets/openbookqa@388097ea7776314e93a529163e0fea805b8a6454/main/train-00000-of-00001.parquet": { + "num_bytes": 495845, + "checksum": null + }, + "hf://datasets/openbookqa@388097ea7776314e93a529163e0fea805b8a6454/main/validation-00000-of-00001.parquet": { + "num_bytes": 58233, + "checksum": null + }, + "hf://datasets/openbookqa@388097ea7776314e93a529163e0fea805b8a6454/main/test-00000-of-00001.parquet": { + "num_bytes": 55535, + "checksum": null + } + }, + "download_size": 609613, + "features": { + "id": { + "dtype": "string", + "_type": "Value" + }, + "question_stem": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "text": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "Sequence" + }, + "answerKey": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1692186, + "splits": { + "train": { + "name": "train", + "num_bytes": 895386, + "num_examples": 4957, + "dataset_name": "openbookqa" + }, + "validation": { + "name": "validation", + "num_bytes": 95428, + "num_examples": 500, + "dataset_name": "openbookqa" + }, + "test": { + "name": "test", + "num_bytes": 91759, + "num_examples": 500, + "dataset_name": "openbookqa" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/openbookqa/main/validation/state.json b/olmo_data/hf_datasets/openbookqa/main/validation/state.json new file mode 100644 index 000000000..47a5cb28d --- /dev/null +++ 
b/olmo_data/hf_datasets/openbookqa/main/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f77d50ae1177c468", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/piqa/plain_text/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/piqa/plain_text/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..9a460181e Binary files /dev/null and b/olmo_data/hf_datasets/piqa/plain_text/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/piqa/plain_text/validation/dataset_info.json b/olmo_data/hf_datasets/piqa/plain_text/validation/dataset_info.json new file mode 100644 index 000000000..6d073d396 --- /dev/null +++ b/olmo_data/hf_datasets/piqa/plain_text/validation/dataset_info.json @@ -0,0 +1,69 @@ +{ + "builder_name": "piqa", + "citation": "@inproceedings{Bisk2020,\n author = {Yonatan Bisk and Rowan Zellers and\n Ronan Le Bras and Jianfeng Gao\n and Yejin Choi},\n title = {PIQA: Reasoning about Physical Commonsense in\n Natural Language},\n booktitle = {Thirty-Fourth AAAI Conference on\n Artificial Intelligence},\n year = {2020},\n}\n", + "config_name": "plain_text", + "dataset_name": "piqa", + "dataset_size": 5329820, + "description": "To apply eyeshadow without a brush, should I use a cotton swab or a toothpick?\nQuestions requiring this kind of physical commonsense pose a challenge to state-of-the-art\nnatural language understanding systems. The PIQA dataset introduces the task of physical commonsense reasoning\nand a corresponding benchmark dataset Physical Interaction: Question Answering or PIQA.\n\nPhysical commonsense knowledge is a major challenge on the road to true AI-completeness,\nincluding robots that interact with the world and understand natural language.\n\nPIQA focuses on everyday situations with a preference for atypical solutions.\nThe dataset is inspired by instructables.com, which provides users with instructions on how to build, craft,\nbake, or manipulate objects using everyday materials.\n\nThe underlying task is formualted as multiple choice question answering:\ngiven a question `q` and two possible solutions `s1`, `s2`, a model or\na human must choose the most appropriate solution, of which exactly one is correct.\nThe dataset is further cleaned of basic artifacts using the AFLite algorithm which is an improvement of\nadversarial filtering. 
The dataset contains 16,000 examples for training, 2,000 for development and 3,000 for testing.\n", + "download_checksums": { + "https://storage.googleapis.com/ai2-mosaic/public/physicaliqa/physicaliqa-train-dev.zip": { + "num_bytes": 1824009, + "checksum": null + }, + "https://yonatanbisk.com/piqa/data/tests.jsonl": { + "num_bytes": 814616, + "checksum": null + } + }, + "download_size": 2638625, + "features": { + "goal": { + "dtype": "string", + "_type": "Value" + }, + "sol1": { + "dtype": "string", + "_type": "Value" + }, + "sol2": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "names": [ + "0", + "1" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://yonatanbisk.com/piqa/", + "license": "", + "size_in_bytes": 7968445, + "splits": { + "train": { + "name": "train", + "num_bytes": 4104002, + "num_examples": 16113, + "dataset_name": "piqa" + }, + "test": { + "name": "test", + "num_bytes": 761509, + "num_examples": 3084, + "dataset_name": "piqa" + }, + "validation": { + "name": "validation", + "num_bytes": 464309, + "num_examples": 1838, + "dataset_name": "piqa" + } + }, + "version": { + "version_str": "1.1.0", + "major": 1, + "minor": 1, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/piqa/plain_text/validation/state.json b/olmo_data/hf_datasets/piqa/plain_text/validation/state.json new file mode 100644 index 000000000..c53a36e69 --- /dev/null +++ b/olmo_data/hf_datasets/piqa/plain_text/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c305b46a7e84e936", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/sciq/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/sciq/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..b890b7685 Binary files /dev/null and b/olmo_data/hf_datasets/sciq/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/sciq/none/validation/dataset_info.json b/olmo_data/hf_datasets/sciq/none/validation/dataset_info.json new file mode 100644 index 000000000..66bf12f20 --- /dev/null +++ b/olmo_data/hf_datasets/sciq/none/validation/dataset_info.json @@ -0,0 +1,78 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "sciq", + "dataset_size": 7664230, + "description": "", + "download_checksums": { + "hf://datasets/sciq@2c94ad3e1aafab77146f384e23536f97a4849815/data/train-00000-of-00001.parquet": { + "num_bytes": 3993099, + "checksum": null + }, + "hf://datasets/sciq@2c94ad3e1aafab77146f384e23536f97a4849815/data/validation-00000-of-00001.parquet": { + "num_bytes": 338503, + "checksum": null + }, + "hf://datasets/sciq@2c94ad3e1aafab77146f384e23536f97a4849815/data/test-00000-of-00001.parquet": { + "num_bytes": 342808, + "checksum": null + } + }, + "download_size": 4674410, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "distractor3": { + "dtype": "string", + "_type": "Value" + }, + "distractor1": { + "dtype": "string", + "_type": "Value" + }, + "distractor2": { + "dtype": "string", + "_type": "Value" + }, + "correct_answer": { + "dtype": "string", + "_type": "Value" + }, + "support": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 12338640, + "splits": { + "train": { + "name": "train", + 
"num_bytes": 6546183, + "num_examples": 11679, + "dataset_name": "sciq" + }, + "validation": { + "name": "validation", + "num_bytes": 554120, + "num_examples": 1000, + "dataset_name": "sciq" + }, + "test": { + "name": "test", + "num_bytes": 563927, + "num_examples": 1000, + "dataset_name": "sciq" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/sciq/none/validation/state.json b/olmo_data/hf_datasets/sciq/none/validation/state.json new file mode 100644 index 000000000..d277bcb2a --- /dev/null +++ b/olmo_data/hf_datasets/sciq/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f4b2e70569612ae4", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/social_i_qa/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/social_i_qa/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..035a5f4ae Binary files /dev/null and b/olmo_data/hf_datasets/social_i_qa/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/social_i_qa/none/validation/dataset_info.json b/olmo_data/hf_datasets/social_i_qa/none/validation/dataset_info.json new file mode 100644 index 000000000..8809ca0ec --- /dev/null +++ b/olmo_data/hf_datasets/social_i_qa/none/validation/dataset_info.json @@ -0,0 +1,64 @@ +{ + "builder_name": "social_i_qa", + "citation": "\n", + "config_name": "default", + "dataset_name": "social_i_qa", + "dataset_size": 6700024, + "description": "We introduce Social IQa: Social Interaction QA, a new question-answering benchmark for testing social commonsense intelligence. Contrary to many prior benchmarks that focus on physical or taxonomic knowledge, Social IQa focuses on reasoning about people\u2019s actions and their social implications. For example, given an action like \"Jesse saw a concert\" and a question like \"Why did Jesse do this?\", humans can easily infer that Jesse wanted \"to see their favorite performer\" or \"to enjoy the music\", and not \"to see what's happening inside\" or \"to see if it works\". The actions in Social IQa span a wide variety of social situations, and answer candidates contain both human-curated answers and adversarially-filtered machine-generated candidates. Social IQa contains over 37,000 QA pairs for evaluating models\u2019 abilities to reason about the social implications of everyday events and situations. 
(Less)\n", + "download_checksums": { + "https://storage.googleapis.com/ai2-mosaic/public/socialiqa/socialiqa-train-dev.zip": { + "num_bytes": 2198056, + "checksum": null + } + }, + "download_size": 2198056, + "features": { + "context": { + "dtype": "string", + "_type": "Value" + }, + "question": { + "dtype": "string", + "_type": "Value" + }, + "answerA": { + "dtype": "string", + "_type": "Value" + }, + "answerB": { + "dtype": "string", + "_type": "Value" + }, + "answerC": { + "dtype": "string", + "_type": "Value" + }, + "label": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://leaderboard.allenai.org/socialiqa/submissions/get-started", + "license": "", + "size_in_bytes": 8898080, + "splits": { + "train": { + "name": "train", + "num_bytes": 6327209, + "num_examples": 33410, + "dataset_name": "social_i_qa" + }, + "validation": { + "name": "validation", + "num_bytes": 372815, + "num_examples": 1954, + "dataset_name": "social_i_qa" + } + }, + "version": { + "version_str": "0.1.0", + "major": 0, + "minor": 1, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/social_i_qa/none/validation/state.json b/olmo_data/hf_datasets/social_i_qa/none/validation/state.json new file mode 100644 index 000000000..e90459ae5 --- /dev/null +++ b/olmo_data/hf_datasets/social_i_qa/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a74b93e67cb9ff1e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/super_glue/cb/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/super_glue/cb/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..27a60b06e Binary files /dev/null and b/olmo_data/hf_datasets/super_glue/cb/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/super_glue/cb/validation/dataset_info.json b/olmo_data/hf_datasets/super_glue/cb/validation/dataset_info.json new file mode 100644 index 000000000..a7657eb24 --- /dev/null +++ b/olmo_data/hf_datasets/super_glue/cb/validation/dataset_info.json @@ -0,0 +1,66 @@ +{ + "builder_name": "super_glue", + "citation": "@article{de marneff_simons_tonhauser_2019,\n title={The CommitmentBank: Investigating projection in naturally occurring discourse},\n journal={proceedings of Sinn und Bedeutung 23},\n author={De Marneff, Marie-Catherine and Simons, Mandy and Tonhauser, Judith},\n year={2019}\n}\n@article{wang2019superglue,\n title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n journal={arXiv preprint arXiv:1905.00537},\n year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. Please see the source to\nget the correct citation for each contained dataset.\n", + "config_name": "cb", + "dataset_name": "super_glue", + "dataset_size": 202393, + "description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe CommitmentBank (De Marneffe et al., 2019) is a corpus of short texts in which at least\none sentence contains an embedded clause. 
Each of these embedded clauses is annotated with the\ndegree to which we expect that the person who wrote the text is committed to the truth of the clause.\nThe resulting task framed as three-class textual entailment on examples that are drawn from the Wall\nStreet Journal, fiction from the British National Corpus, and Switchboard. Each example consists\nof a premise containing an embedded clause and the corresponding hypothesis is the extraction of\nthat clause. We use a subset of the data that had inter-annotator agreement above 0.85. The data is\nimbalanced (relatively fewer neutral examples), so we evaluate using accuracy and F1, where for\nmulti-class F1 we compute the unweighted average of the F1 per class.", + "download_checksums": { + "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip": { + "num_bytes": 75482, + "checksum": null + } + }, + "download_size": 75482, + "features": { + "premise": { + "dtype": "string", + "_type": "Value" + }, + "hypothesis": { + "dtype": "string", + "_type": "Value" + }, + "idx": { + "dtype": "int32", + "_type": "Value" + }, + "label": { + "names": [ + "entailment", + "contradiction", + "neutral" + ], + "_type": "ClassLabel" + } + }, + "homepage": "https://github.com/mcdm/CommitmentBank", + "license": "", + "size_in_bytes": 277875, + "splits": { + "train": { + "name": "train", + "num_bytes": 87050, + "num_examples": 250, + "dataset_name": "super_glue" + }, + "validation": { + "name": "validation", + "num_bytes": 21851, + "num_examples": 56, + "dataset_name": "super_glue" + }, + "test": { + "name": "test", + "num_bytes": 93492, + "num_examples": 250, + "dataset_name": "super_glue" + } + }, + "version": { + "version_str": "1.0.3", + "major": 1, + "minor": 0, + "patch": 3 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/super_glue/cb/validation/state.json b/olmo_data/hf_datasets/super_glue/cb/validation/state.json new file mode 100644 index 000000000..77ab9028a --- /dev/null +++ b/olmo_data/hf_datasets/super_glue/cb/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "21634d8a438233eb", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/super_glue/copa/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/super_glue/copa/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..0bd85038f Binary files /dev/null and b/olmo_data/hf_datasets/super_glue/copa/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/super_glue/copa/validation/dataset_info.json b/olmo_data/hf_datasets/super_glue/copa/validation/dataset_info.json new file mode 100644 index 000000000..af5a62539 --- /dev/null +++ b/olmo_data/hf_datasets/super_glue/copa/validation/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "super_glue", + "citation": "@inproceedings{roemmele2011choice,\n title={Choice of plausible alternatives: An evaluation of commonsense causal reasoning},\n author={Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},\n booktitle={2011 AAAI Spring Symposium Series},\n year={2011}\n}\n@article{wang2019superglue,\n title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel 
R},\n journal={arXiv preprint arXiv:1905.00537},\n year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. Please see the source to\nget the correct citation for each contained dataset.\n", + "config_name": "copa", + "dataset_name": "super_glue", + "dataset_size": 121558, + "description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Choice Of Plausible Alternatives (COPA, Roemmele et al., 2011) dataset is a causal\nreasoning task in which a system is given a premise sentence and two possible alternatives. The\nsystem must choose the alternative which has the more plausible causal relationship with the premise.\nThe method used for the construction of the alternatives ensures that the task requires causal reasoning\nto solve. Examples either deal with alternative possible causes or alternative possible effects of the\npremise sentence, accompanied by a simple question disambiguating between the two instance\ntypes for the model. All examples are handcrafted and focus on topics from online blogs and a\nphotography-related encyclopedia. Following the recommendation of the authors, we evaluate using\naccuracy.", + "download_checksums": { + "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip": { + "num_bytes": 43986, + "checksum": null + } + }, + "download_size": 43986, + "features": { + "premise": { + "dtype": "string", + "_type": "Value" + }, + "choice1": { + "dtype": "string", + "_type": "Value" + }, + "choice2": { + "dtype": "string", + "_type": "Value" + }, + "question": { + "dtype": "string", + "_type": "Value" + }, + "idx": { + "dtype": "int32", + "_type": "Value" + }, + "label": { + "names": [ + "choice1", + "choice2" + ], + "_type": "ClassLabel" + } + }, + "homepage": "http://people.ict.usc.edu/~gordon/copa.html", + "license": "", + "size_in_bytes": 165544, + "splits": { + "train": { + "name": "train", + "num_bytes": 49233, + "num_examples": 400, + "dataset_name": "super_glue" + }, + "validation": { + "name": "validation", + "num_bytes": 12479, + "num_examples": 100, + "dataset_name": "super_glue" + }, + "test": { + "name": "test", + "num_bytes": 59846, + "num_examples": 500, + "dataset_name": "super_glue" + } + }, + "version": { + "version_str": "1.0.3", + "major": 1, + "minor": 0, + "patch": 3 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/super_glue/copa/validation/state.json b/olmo_data/hf_datasets/super_glue/copa/validation/state.json new file mode 100644 index 000000000..4524fcf7f --- /dev/null +++ b/olmo_data/hf_datasets/super_glue/copa/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c785b24eeb544f2a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..5d4cf5846 Binary files /dev/null and b/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/dataset_info.json b/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/dataset_info.json new file mode 100644 
index 000000000..f4275f04b --- /dev/null +++ b/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/dataset_info.json @@ -0,0 +1,83 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "commonsense_qa", + "dataset_size": 2739484, + "description": "", + "download_checksums": { + "hf://datasets/tau/commonsense_qa@94630fe30dad47192a8546eb75f094926d47e155/data/train-00000-of-00001.parquet": { + "num_bytes": 1247103, + "checksum": null + }, + "hf://datasets/tau/commonsense_qa@94630fe30dad47192a8546eb75f094926d47e155/data/validation-00000-of-00001.parquet": { + "num_bytes": 160240, + "checksum": null + }, + "hf://datasets/tau/commonsense_qa@94630fe30dad47192a8546eb75f094926d47e155/data/test-00000-of-00001.parquet": { + "num_bytes": 151227, + "checksum": null + } + }, + "download_size": 1558570, + "features": { + "id": { + "dtype": "string", + "_type": "Value" + }, + "question": { + "dtype": "string", + "_type": "Value" + }, + "question_concept": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "label": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "Sequence" + }, + "answerKey": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 4298054, + "splits": { + "train": { + "name": "train", + "num_bytes": 2207794, + "num_examples": 9741, + "dataset_name": "commonsense_qa" + }, + "validation": { + "name": "validation", + "num_bytes": 273848, + "num_examples": 1221, + "dataset_name": "commonsense_qa" + }, + "test": { + "name": "test", + "num_bytes": 257842, + "num_examples": 1140, + "dataset_name": "commonsense_qa" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/state.json b/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/state.json new file mode 100644 index 000000000..2dbb62d13 --- /dev/null +++ b/olmo_data/hf_datasets/tau/commonsense_qa/none/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "fbff160a071447de", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..edf5cccb4 Binary files /dev/null and b/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/dataset_info.json b/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/dataset_info.json new file mode 100644 index 000000000..a1a5ebdd9 --- /dev/null +++ b/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/dataset_info.json @@ -0,0 +1,152 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "rc.wikipedia.nocontext", + "dataset_name": "trivia_qa", + "dataset_size": 56368074, + "description": "", + "download_checksums": { + "hf://datasets/trivia_qa@0f7faf33a3908546c6fd5b73a660e0f8ff173c2f/rc.wikipedia.nocontext/train-00000-of-00001.parquet": { + "num_bytes": 24953647, + "checksum": null + }, + 
"hf://datasets/trivia_qa@0f7faf33a3908546c6fd5b73a660e0f8ff173c2f/rc.wikipedia.nocontext/validation-00000-of-00001.parquet": { + "num_bytes": 3308451, + "checksum": null + }, + "hf://datasets/trivia_qa@0f7faf33a3908546c6fd5b73a660e0f8ff173c2f/rc.wikipedia.nocontext/test-00000-of-00001.parquet": { + "num_bytes": 541852, + "checksum": null + } + }, + "download_size": 28803950, + "features": { + "question": { + "dtype": "string", + "_type": "Value" + }, + "question_id": { + "dtype": "string", + "_type": "Value" + }, + "question_source": { + "dtype": "string", + "_type": "Value" + }, + "entity_pages": { + "feature": { + "doc_source": { + "dtype": "string", + "_type": "Value" + }, + "filename": { + "dtype": "string", + "_type": "Value" + }, + "title": { + "dtype": "string", + "_type": "Value" + }, + "wiki_context": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "Sequence" + }, + "search_results": { + "feature": { + "description": { + "dtype": "string", + "_type": "Value" + }, + "filename": { + "dtype": "string", + "_type": "Value" + }, + "rank": { + "dtype": "int32", + "_type": "Value" + }, + "title": { + "dtype": "string", + "_type": "Value" + }, + "url": { + "dtype": "string", + "_type": "Value" + }, + "search_context": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "Sequence" + }, + "answer": { + "aliases": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "normalized_aliases": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "matched_wiki_entity_name": { + "dtype": "string", + "_type": "Value" + }, + "normalized_matched_wiki_entity_name": { + "dtype": "string", + "_type": "Value" + }, + "normalized_value": { + "dtype": "string", + "_type": "Value" + }, + "type": { + "dtype": "string", + "_type": "Value" + }, + "value": { + "dtype": "string", + "_type": "Value" + } + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 85172024, + "splits": { + "train": { + "name": "train", + "num_bytes": 48359645, + "num_examples": 61888, + "dataset_name": "trivia_qa" + }, + "validation": { + "name": "validation", + "num_bytes": 6365273, + "num_examples": 7993, + "dataset_name": "trivia_qa" + }, + "test": { + "name": "test", + "num_bytes": 1643156, + "num_examples": 7701, + "dataset_name": "trivia_qa" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/state.json b/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/state.json new file mode 100644 index 000000000..64e10fe35 --- /dev/null +++ b/olmo_data/hf_datasets/trivia_qa/rc.wikipedia.nocontext/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "103745879378fe4b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} \ No newline at end of file diff --git a/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/data-00000-of-00001.arrow new file mode 100644 index 000000000..74abc3b7d Binary files /dev/null and b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/data-00000-of-00001.arrow differ diff --git a/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/dataset_info.json 
diff --git a/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/data-00000-of-00001.arrow b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/data-00000-of-00001.arrow
new file mode 100644
index 000000000..74abc3b7d
Binary files /dev/null and b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/data-00000-of-00001.arrow differ
diff --git a/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/dataset_info.json b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/dataset_info.json
new file mode 100644
index 000000000..021713ca7
--- /dev/null
+++ b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/dataset_info.json
@@ -0,0 +1,63 @@
+{
+  "builder_name": "winogrande",
+  "citation": "@InProceedings{ai2:winogrande,\ntitle = {WinoGrande: An Adversarial Winograd Schema Challenge at Scale},\nauthors={Keisuke, Sakaguchi and Ronan, Le Bras and Chandra, Bhagavatula and Yejin, Choi\n},\nyear={2019}\n}\n",
+  "config_name": "winogrande_xl",
+  "dataset_name": "winogrande",
+  "dataset_size": 5577568,
+  "description": "WinoGrande is a new collection of 44k problems, inspired by Winograd Schema Challenge (Levesque, Davis, and Morgenstern\n 2011), but adjusted to improve the scale and robustness against the dataset-specific bias. Formulated as a\nfill-in-a-blank task with binary options, the goal is to choose the right option for a given sentence which requires\ncommonsense reasoning.\n",
+  "download_checksums": {
+    "https://storage.googleapis.com/ai2-mosaic/public/winogrande/winogrande_1.1.zip": {
+      "num_bytes": 3395492,
+      "checksum": null
+    }
+  },
+  "download_size": 3395492,
+  "features": {
+    "sentence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "option1": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "option2": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "answer": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://leaderboard.allenai.org/winogrande/submissions/get-started",
+  "license": "",
+  "size_in_bytes": 8973060,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 5185752,
+      "num_examples": 40398,
+      "dataset_name": "winogrande"
+    },
+    "test": {
+      "name": "test",
+      "num_bytes": 227633,
+      "num_examples": 1767,
+      "dataset_name": "winogrande"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 164183,
+      "num_examples": 1267,
+      "dataset_name": "winogrande"
+    }
+  },
+  "version": {
+    "version_str": "1.1.0",
+    "description": "",
+    "major": 1,
+    "minor": 1,
+    "patch": 0
+  }
+}
\ No newline at end of file
diff --git a/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/state.json b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/state.json
new file mode 100644
index 000000000..f8d619900
--- /dev/null
+++ b/olmo_data/hf_datasets/winogrande/winogrande_xl/validation/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "1b3333f502a889bb",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
\ No newline at end of file
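The snapshots above look like plain `load_dataset(...).save_to_disk(...)` output. The script that actually produced them is not part of this diff, so the following regeneration sketch is an assumption about the process, not the real tooling:

```python
from typing import Optional

import datasets


def snapshot_validation_split(path: str, name: Optional[str] = None, out_root: str = "olmo_data/hf_datasets") -> None:
    # Downloads the split once, then writes the arrow shard plus
    # dataset_info.json and state.json in exactly the layout seen in this diff.
    ds = datasets.load_dataset(path, name, split="validation")
    ds.save_to_disk(f"{out_root}/{path}/{name or 'none'}/validation")


# snapshot_validation_split("winogrande", "winogrande_xl")
# snapshot_validation_split("tau/commonsense_qa")
```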
(dir / ".foo").touch() assert not util.dir_is_empty(dir) - - -def _create_and_store_test_hf_dataset(data: List[Any], dataset_path: Path): - dataset_path.mkdir(parents=True, exist_ok=True) - test_file_path = dataset_path / "test.json" - with test_file_path.open("w") as f: - json.dump(data, f) - - -def test_load_hf_dataset_gets_correct_data(tmp_path: Path): - dataset_path = tmp_path / "test_dataset" - cache_path = tmp_path / "cache" - - data = [{"foo": i} for i in range(10)] - _create_and_store_test_hf_dataset(data, dataset_path) - - dataset = util.load_hf_dataset(str(dataset_path), name=None, split="test", datasets_cache_dir=str(cache_path)) - assert isinstance(dataset, (Dataset, DatasetDict)) - for i in range(10): - assert dataset[i]["foo"] == i - - -def test_load_hf_dataset_caches_dataset(tmp_path: Path): - dataset_path = tmp_path / "test_dataset" - cache_path = tmp_path / "cache" - - data = [{"foo": i} for i in range(10)] - _create_and_store_test_hf_dataset(data, dataset_path) - - dataset = util.load_hf_dataset(str(dataset_path), name=None, split="test", datasets_cache_dir=str(cache_path)) - assert isinstance(dataset, (Dataset, DatasetDict)) - assert dataset[0]["foo"] == 0 - - # Overwrite dataset data and check that old data is loaded - data = [{"bar": i} for i in range(10)] - _create_and_store_test_hf_dataset(data, dataset_path) - - dataset = util.load_hf_dataset(str(dataset_path), name=None, split="test", datasets_cache_dir=str(cache_path)) - assert isinstance(dataset, (Dataset, DatasetDict)) - assert dataset[0]["foo"] == 0