From 43ed1d7220c681a2c9287a4bd47e0f5debbb8a71 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Thu, 30 Nov 2023 13:29:59 -0800 Subject: [PATCH] Rename LegacyTable -> Table --- benchmarking/parquet/conftest.py | 2 +- daft/table/__init__.py | 8 +++----- .../io/parquet/test_reads_public_data.py | 6 +++--- tests/table/test_partitioning.py | 16 ++++++++-------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/benchmarking/parquet/conftest.py b/benchmarking/parquet/conftest.py index bea8b2a99d..978d0cf8a9 100644 --- a/benchmarking/parquet/conftest.py +++ b/benchmarking/parquet/conftest.py @@ -76,7 +76,7 @@ def fn(files: list[str]) -> list[pa.Table]: def daft_bulk_read(paths: list[str], columns: list[str] | None = None) -> list[pa.Table]: - tables = daft.table.LegacyTable.read_parquet_bulk(paths, columns=columns) + tables = daft.table.Table.read_parquet_bulk(paths, columns=columns) return [t.to_arrow() for t in tables] diff --git a/daft/table/__init__.py b/daft/table/__init__.py index 73a557aba9..5b66e7b616 100644 --- a/daft/table/__init__.py +++ b/daft/table/__init__.py @@ -2,19 +2,17 @@ import os -from .table import Table as _Table -from .table import read_parquet_into_pyarrow, read_parquet_into_pyarrow_bulk +from .table import Table, read_parquet_into_pyarrow, read_parquet_into_pyarrow_bulk # Need to import after `.table` due to circular dep issue otherwise from .micropartition import MicroPartition as _MicroPartition # isort:skip -LegacyTable = _Table MicroPartition = _MicroPartition # Use $DAFT_MICROPARTITIONS envvar as a feature flag to turn off MicroPartitions if os.getenv("DAFT_MICROPARTITIONS", "1") != "1": - MicroPartition = LegacyTable # type: ignore + MicroPartition = Table # type: ignore -__all__ = ["MicroPartition", "LegacyTable", "read_parquet_into_pyarrow", "read_parquet_into_pyarrow_bulk"] +__all__ = ["MicroPartition", "Table", "read_parquet_into_pyarrow", "read_parquet_into_pyarrow_bulk"] diff --git a/tests/integration/io/parquet/test_reads_public_data.py b/tests/integration/io/parquet/test_reads_public_data.py index 0b1dc21071..d7c155857b 100644 --- a/tests/integration/io/parquet/test_reads_public_data.py +++ b/tests/integration/io/parquet/test_reads_public_data.py @@ -8,7 +8,7 @@ import daft from daft.filesystem import get_filesystem, get_protocol_from_path -from daft.table import LegacyTable, MicroPartition +from daft.table import MicroPartition, Table def get_filesystem_from_path(path: str, **kwargs) -> fsspec.AbstractFileSystem: @@ -253,7 +253,7 @@ def test_parquet_read_table_bulk(parquet_file, public_storage_io_config, multith pa_read = MicroPartition.from_arrow(read_parquet_with_pyarrow(url)) # Legacy Table returns a list[Table] - if MicroPartition == LegacyTable: + if MicroPartition == Table: for daft_native_read in daft_native_reads: assert daft_native_read.schema() == pa_read.schema() pd.testing.assert_frame_equal(daft_native_read.to_pandas(), pa_read.to_pandas()) @@ -337,7 +337,7 @@ def test_row_groups_selection_bulk(public_storage_io_config, multithreaded_io): url = ["s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet"] * 11 row_groups = [list(range(10))] + [[i] for i in range(10)] - if MicroPartition == LegacyTable: + if MicroPartition == Table: first, *rest = MicroPartition.read_parquet_bulk( url, io_config=public_storage_io_config, multithreaded_io=multithreaded_io, row_groups_per_path=row_groups ) diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 96a7e93c76..1ca60634cc 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -9,7 +9,7 @@ from daft.datatype import DataType from daft.expressions import col from daft.logical.schema import Schema -from daft.table import LegacyTable, MicroPartition +from daft.table import MicroPartition, Table daft_int_types = [ DataType.int8(), @@ -67,7 +67,7 @@ def test_partitioning_micropartitions_hash(mp) -> None: ], ) def test_partitioning_micropartitions_range_empty(mp) -> None: - boundaries = LegacyTable.from_pydict({"a": np.linspace(0, 10, 3)[1:]}).eval_expression_list( + boundaries = Table.from_pydict({"a": np.linspace(0, 10, 3)[1:]}).eval_expression_list( [col("a").cast(DataType.int64())] ) split_tables = mp.partition_by_range([col("a")], boundaries, [True]) @@ -85,7 +85,7 @@ def test_partitioning_micropartitions_range_empty(mp) -> None: ], ) def test_partitioning_micropartitions_range_boundaries_empty(mp) -> None: - boundaries = LegacyTable.from_pydict({"a": [], "b": []}).eval_expression_list([col("a").cast(DataType.int64())]) + boundaries = Table.from_pydict({"a": [], "b": []}).eval_expression_list([col("a").cast(DataType.int64())]) split_tables = mp.partition_by_range([col("a"), col("b")], boundaries, [False, False]) assert len(split_tables) == 1 assert split_tables[0].to_pydict() == {"a": [], "b": []} @@ -105,7 +105,7 @@ def test_partitioning_micropartitions_range_boundaries_empty(mp) -> None: ], ) def test_partitioning_micropartitions_range(mp) -> None: - boundaries = LegacyTable.from_pydict({"a": np.linspace(0, 5, 3)[1:]}).eval_expression_list( + boundaries = Table.from_pydict({"a": np.linspace(0, 5, 3)[1:]}).eval_expression_list( [col("a").cast(DataType.int64())] ) split_tables = mp.partition_by_range([col("a")], boundaries, [True]) @@ -251,7 +251,7 @@ def test_table_partition_by_range_single_column(size, k, desc) -> None: if desc: input_boundaries = input_boundaries[::-1] - boundaries = LegacyTable.from_pydict({"x": input_boundaries}).eval_expression_list( + boundaries = Table.from_pydict({"x": input_boundaries}).eval_expression_list( [col("x").cast(table.get_column("x").datatype())] ) @@ -286,7 +286,7 @@ def test_table_partition_by_range_multi_column(size, k, desc) -> None: if desc: input_boundaries = input_boundaries[::-1] - boundaries = LegacyTable.from_pydict({"x": np.ones(k - 1), "y": input_boundaries}).eval_expression_list( + boundaries = Table.from_pydict({"x": np.ones(k - 1), "y": input_boundaries}).eval_expression_list( [col("x").cast(table.get_column("x").datatype()), col("y").cast(table.get_column("y").datatype())] ) @@ -310,7 +310,7 @@ def test_table_partition_by_range_multi_column(size, k, desc) -> None: def test_table_partition_by_range_multi_column_string() -> None: table = MicroPartition.from_pydict({"x": ["a", "c", "a", "c"], "y": ["1", "2", "3", "4"]}) - boundaries = LegacyTable.from_pydict({"x": ["b"], "y": ["1"]}) + boundaries = Table.from_pydict({"x": ["b"], "y": ["1"]}) split_tables = table.partition_by_range([col("x"), col("y")], boundaries, [False, False]) assert len(split_tables) == 2 @@ -326,7 +326,7 @@ def test_table_partition_by_range_multi_column_string() -> None: def test_table_partition_by_range_input() -> None: data = {"x": [1, 2, 3], "b": [0, 1, 2]} table_cls = MicroPartition.from_pydict(data) - boundaries = LegacyTable.from_pydict(data) + boundaries = Table.from_pydict(data) with pytest.raises(ValueError, match="Schema Mismatch"): table_cls.partition_by_range([col("x")], boundaries, [False])