ray-project · clarkzinzow · Oct 26, 2022 · Sep 27, 2022 · Oct 4, 2022 · Oct 5, 2022
@@ -602,6 +602,47 @@ def drop_columns(
             lambda batch: batch.drop(columns=cols), compute=compute, **ray_remote_args
         )
 
+    def select_columns(
+        self,
+        columns: List[str],
+        *,
+        compute: Union[str, ComputeStrategy] = None,
+        **ray_remote_args,
+    ) -> "Dataset[T]":
+        """Select one or more columns from the dataset.
+
+        Duplicate names in ``columns`` are dropped before selection, since
+        ArrowBlock and PandasBlock ``select`` do not explicitly handle
+        duplicated columns. The first occurrence of each name is kept, so the
+        de-duplicated list passed to ``select`` preserves the requested order.
+
+        Examples:
+            >>> import ray
+            >>> # Create a dataset with 3 columns
+            >>> ds = ray.data.from_items([{"col1": i, "col2": i+1, "col3": i+2}
+            ...      for i in range(10)])
+            >>> # Select only "col1" and "col2" columns.
+            >>> ds = ds.select_columns(["col1", "col2"])
+            >>> ds
+            Dataset(num_blocks=10, num_rows=10, schema={col1: int64, col2: int64})
+
+        Time complexity: O(dataset size / parallelism)
+
+        Args:
+            columns: Names of the columns to select. Columns not included in this
+                will be filtered out.
+            compute: The compute strategy, either "tasks" (default) to use Ray
+                tasks, or ActorPoolStrategy(min, max) to use an autoscaling actor pool.
+            ray_remote_args: Additional resource requirements to request from
+                ray (e.g., num_gpus=1 to request GPUs for the map tasks).
+
+        Returns:
+            The dataset produced by ``map_batches`` after applying the column
+            selection to every batch.
+        """
+        # De-duplicate while keeping first-seen order. ``set`` would also drop
+        # duplicates, but its iteration order is arbitrary (and varies with
+        # string hash randomization between runs), which would make the order
+        # of the selected columns unpredictable.
+        unique_columns = list(dict.fromkeys(columns))
+        return self.map_batches(
+            lambda batch: BlockAccessor.for_block(batch).select(columns=unique_columns),
+            compute=compute,
+            **ray_remote_args,
+        )
+
+
     def flat_map(
         self,
         fn: RowUDF[T, U],

@@ -2224,6 +2224,28 @@ def test_drop_columns(ray_start_regular_shared, tmp_path):
             ds.drop_columns(["dummy_col", "col1", "col2"])
 
 
+def test_select_columns(ray_start_regular_shared):
+    """Check ``select_columns`` on pandas-format and arrow-format datasets."""
+    source_df = pd.DataFrame(
+        {"col1": [1, 2, 3], "col2": [2, 3, 4], "col3": [3, 4, 5]}
+    )
+
+    # Pandas-backed dataset: every column, a subset, no columns at all, and
+    # a request that repeats a column name.
+    ds = ray.data.from_pandas(source_df)
+    assert ds._dataset_format() == "pandas"
+
+    first_row = ds.select_columns(columns=["col1", "col2", "col3"]).take(1)
+    assert first_row == [{"col1": 1, "col2": 2, "col3": 3}]
+
+    first_row = ds.select_columns(columns=["col1", "col2"]).take(1)
+    assert first_row == [{"col1": 1, "col2": 2}]
+
+    first_row = ds.select_columns(columns=[]).take(1)
+    assert first_row == [{}]
+
+    # The repeated "col2" must not change the selected row.
+    first_row = ds.select_columns(columns=["col1", "col2", "col2"]).take(1)
+    assert first_row == [{"col1": 1, "col2": 2}]
+
+    # Arrow-backed result: request pyarrow batches for the selection.
+    arrow_ds = ds.select_columns(columns=["col1", "col2"], batch_format="pyarrow")
+    assert arrow_ds._dataset_format() == "arrow"
+    assert arrow_ds.take(1) == [{"col1": 1, "col2": 2}]
+
+
+
 def test_map_batches_basic(ray_start_regular_shared, tmp_path):
     # Test input validation
     ds = ray.data.range(5)