Add FMS datasets (#1)

Co-authored-by: Linsong Chu <[email protected]>
YerevaNN · May 31, 2024 · 2e733d4 · 2e733d4
1 parent 6a8455e
commit 2e733d4
Show file tree

Hide file tree

Showing 15 changed files with 1,350 additions and 11 deletions.
diff --git a/torchtitan/checkpoint.py b/torchtitan/checkpoint.py
@@ -120,7 +120,6 @@ def __init__(
                 "model": ModelWrapper(model),
                 "optimizer": OptimizerWrapper(model, optimizer),
                 "lr_scheduler": lr_scheduler,
-                "dataloader": dataloader,
             }
         )
 

diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
@@ -329,6 +329,49 @@ def __init__(self):
             help="Python garbage control scheduling interval, in steps",
         )
 
+        # experimental dataloader flags
+        self.parser.add_argument(
+            "--dataset.use_experimental_dataloader",
+            action="store_true",
+            help="Whether to use the experimental dataloader instead of default HF",
+        )
+        self.parser.add_argument(
+            "--dataset.data_logical_shards",
+            type=int,
+            default=768,
+            help="Dataloader logical shards. All divisors are possible world sizes.",
+        )
+        self.parser.add_argument(
+            "--dataset.bos_token",
+            type=int,
+            default=-1,
+            help="BOS token index value. If not using, leave as -1.",
+        )
+        self.parser.add_argument(
+            "--dataset.eos_token",
+            type=int,
+            default=0,
+            help="EOS or SEP token index value.",
+        )
+        self.parser.add_argument(
+            "--dataset.drop_tokens",
+            type=str,
+            default="",
+            help="Dummy token values to drop from begin/end of sequences (comma-separated ints)",
+        )
+        self.parser.add_argument(
+            "--dataset.datasets",
+            type=str,
+            default="c4_mini",
+            help="Datasets to use for training, comma-separated",
+        )
+        self.parser.add_argument(
+            "--dataset.dataset_weights",
+            type=str,
+            default="1",
+            help="Sampling ratios for sub-datasets, comma-separated. Do not need to sum to 1.",
+        )
+
         # checkpointing configs
         self.parser.add_argument(
             "--checkpoint.enable_checkpoint",

diff --git a/torchtitan/datasets/__init__.py b/torchtitan/datasets/__init__.py
@@ -4,10 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from torchtitan.datasets.experimental_datasets import build_experimental_data_loader
 from torchtitan.datasets.hf_datasets import build_hf_data_loader
 from torchtitan.datasets.tokenizer import create_tokenizer
 
 __all__ = [
     "build_hf_data_loader",
+    "build_experimental_data_loader",
     "create_tokenizer",
 ]
diff --git a/torchtitan/datasets/experimental/llama2/c4_mini/c4_mini.arrow b/torchtitan/datasets/experimental/llama2/c4_mini/c4_mini.arrow
diff --git a/torchtitan/datasets/experimental/llama2/meta/c4_llama2_counts.csv b/torchtitan/datasets/experimental/llama2/meta/c4_llama2_counts.csv
@@ -0,0 +1,2 @@
+dataset/filename,documents,tokens
+/c4_mini/c4_mini.arrow,45000,24174478
diff --git a/torchtitan/datasets/experimental/llama3/c4_mini/c4_mini.arrow b/torchtitan/datasets/experimental/llama3/c4_mini/c4_mini.arrow
diff --git a/torchtitan/datasets/experimental/llama3/meta/c4_llama3_counts.csv b/torchtitan/datasets/experimental/llama3/meta/c4_llama3_counts.csv
@@ -0,0 +1,2 @@
+dataset/filename,documents,tokens
+/c4_mini/c4_mini.arrow,45000,20505558