Skip to content

Commit

Permalink
warn supported dataset checks instead of throw (#260)
Browse files (browse the repository at this point in the history)
  • Loading branch information
wanchaol authored Apr 24, 2024
1 parent e1c116a commit be432e1
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions torchtitan/datasets/hf_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,6 @@ def __init__(
rank: int = 0,
infinite: bool = False,
) -> None:
if dataset_name not in _supported_datasets:
raise ValueError(
f"Dataset {dataset_name} is not supported. "
f"Supported datasets are: {_supported_datasets.keys()}."
)
# special case to auto-load c4_mini (and any future datasets) from local dir
if dataset_name == "c4_mini":
dataset_path = f"torchtitan/datasets/{dataset_name}"
Expand All @@ -84,6 +79,13 @@ def __init__(
logger.info(f"Preparing {dataset_name} dataset from HuggingFace")
# Setting `streaming=True` works for large datasets, but is slightly
# slower and unstable.
if dataset_name not in _supported_datasets:
import warnings

warnings.warn(
f"Dataset {dataset_name} is not tested/verfied. "
f"Recommended datasets are: {_supported_datasets.keys()}."
)
if dataset_name == "c4":
# c4 is huge, and requires both streaming and language selection
# (we default to en).
Expand Down

0 comments on commit be432e1

Please sign in to comment.