From e748ae607554cb86732f74cee4f640f2191b7b51 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 15 Oct 2024 16:18:46 -0700 Subject: [PATCH 1/2] [BUG] Fix write_deltalake add action file path prefix --- daft/table/table_io.py | 2 +- src/daft-parquet/src/lib.rs | 2 +- tests/io/delta_lake/test_table_write.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/daft/table/table_io.py b/daft/table/table_io.py index 0f892534d9..f846584208 100644 --- a/daft/table/table_io.py +++ b/daft/table/table_io.py @@ -674,7 +674,7 @@ def __call__(self, written_file): self.parent.add_actions.append( AddAction( - written_file.path, + written_file.path.removeprefix("/"), size, self.partition_values, int(datetime.now().timestamp() * 1000), diff --git a/src/daft-parquet/src/lib.rs b/src/daft-parquet/src/lib.rs index 039124e4a2..76fe734ee0 100644 --- a/src/daft-parquet/src/lib.rs +++ b/src/daft-parquet/src/lib.rs @@ -27,7 +27,7 @@ pub enum Error { #[snafu(display("Parquet reader timed out while trying to read: {path} with a time budget of {duration_ms} ms"))] FileReadTimeout { path: String, duration_ms: i64 }, - #[snafu(display("Internal IO Error when Opening: {path}:\nDetails:\n{source}"))] + #[snafu(display("Internal IO Error when opening: {path}:\nDetails:\n{source}"))] InternalIOError { path: String, source: std::io::Error, diff --git a/tests/io/delta_lake/test_table_write.py b/tests/io/delta_lake/test_table_write.py index 7a65d835cb..03d84571b1 100644 --- a/tests/io/delta_lake/test_table_write.py +++ b/tests/io/delta_lake/test_table_write.py @@ -300,3 +300,13 @@ def test_deltalake_write_partitioned_existing_table(tmp_path): assert result["rows"] == [1, 1] check_equal_both_daft_and_delta_rs(df1.concat(df2), path, [("int", "ascending"), ("string", "ascending")]) + + +def test_deltalake_write_roundtrip(tmp_path): + path = tmp_path / "some_table" + df = daft.from_pydict({"a": [1, 2, 3, 4]}) + df.write_deltalake(str(path)) + + read_df = daft.read_deltalake(str(path)) + assert df.schema() == read_df.schema() + assert df.to_arrow() == read_df.to_arrow() From f786e0eb14402463afe53451fb83a82a6c5d7ff0 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 15 Oct 2024 16:55:07 -0700 Subject: [PATCH 2/2] make compatible with python 3.8 --- daft/table/table_io.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/daft/table/table_io.py b/daft/table/table_io.py index f846584208..f540d7ced6 100644 --- a/daft/table/table_io.py +++ b/daft/table/table_io.py @@ -672,9 +672,12 @@ def __call__(self, written_file): else: size = 0 + # remove leading slash + path = written_file.path[1:] if written_file.path.startswith("/") else written_file.path + self.parent.add_actions.append( AddAction( - written_file.path.removeprefix("/"), + path, size, self.partition_values, int(datetime.now().timestamp() * 1000),