Skip to content

Commit

Permalink
Add FMS datasets (#1)
Browse files: browse the repository at this point in the history
Co-authored-by: Linsong Chu <[email protected]>
  • Loading branch information
daviswer and lchu-ibm authored May 31, 2024
1 parent 6a8455e commit 2e733d4
Show file tree
Hide file tree
Showing 15 changed files with 1,350 additions and 11 deletions.
1 change: 0 additions & 1 deletion torchtitan/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def __init__(
"model": ModelWrapper(model),
"optimizer": OptimizerWrapper(model, optimizer),
"lr_scheduler": lr_scheduler,
"dataloader": dataloader,
}
)

Expand Down
43 changes: 43 additions & 0 deletions torchtitan/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,49 @@ def __init__(self):
help="Python garbage control scheduling interval, in steps",
)

# experimental dataloader flags
self.parser.add_argument(
"--dataset.use_experimental_dataloader",
action="store_true",
help="Whether to use the experimental dataloader instead of default HF",
)
self.parser.add_argument(
"--dataset.data_logical_shards",
type=int,
default=768,
help="Dataloader logical shards. All divisors are possible world sizes.",
)
self.parser.add_argument(
"--dataset.bos_token",
type=int,
default=-1,
help="BOS token index value. If not using, leave as -1.",
)
self.parser.add_argument(
"--dataset.eos_token",
type=int,
default=0,
help="EOS or SEP token index value.",
)
self.parser.add_argument(
"--dataset.drop_tokens",
type=str,
default="",
help="Dummy token values to drop from begin/end of sequences (comma-separated ints)",
)
self.parser.add_argument(
"--dataset.datasets",
type=str,
default="c4_mini",
help="Datasets to use for training, comma-separated",
)
self.parser.add_argument(
"--dataset.dataset_weights",
type=str,
default="1",
help="Sampling ratios for sub-datasets, comma-separated. Do not need to sum to 1.",
)

# checkpointing configs
self.parser.add_argument(
"--checkpoint.enable_checkpoint",
Expand Down
2 changes: 2 additions & 0 deletions torchtitan/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from torchtitan.datasets.experimental_datasets import build_experimental_data_loader
from torchtitan.datasets.hf_datasets import build_hf_data_loader
from torchtitan.datasets.tokenizer import create_tokenizer

__all__ = [
"build_hf_data_loader",
"build_experimental_data_loader",
"create_tokenizer",
]
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dataset/filename,documents,tokens
/c4_mini/c4_mini.arrow,45000,24174478
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dataset/filename,documents,tokens
/c4_mini/c4_mini.arrow,45000,20505558
Loading

0 comments on commit 2e733d4

Please sign in to comment.