From c563062d3e0e66ef894b9c6eb7add4b778020387 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 27 Jan 2023 09:42:29 +1300 Subject: [PATCH 01/10] Create new Apache.Arrow.Compression project --- csharp/Apache.Arrow.sln | 12 ++++ .../Apache.Arrow.Compression.csproj | 17 +++++ .../CompressionCodecFactory.cs | 7 +- .../Lz4CompressionCodec.cs | 2 +- .../ZstdCompressionCodec.cs | 2 +- .../Apache.Arrow.Compression.Tests.csproj | 32 +++++++++ .../ArrowFileReaderTests.cs | 64 ++++++++++++++++++ .../ArrowStreamReaderTests.cs | 53 +++++++++++++++ .../Resources/ipc_lz4_compression.arrow | Bin .../ipc_lz4_compression.arrow_stream | Bin .../Resources/ipc_zstd_compression.arrow | Bin .../ipc_zstd_compression.arrow_stream | Bin .../generate_resources.py | 0 .../Apache.Arrow.Tests.csproj | 14 ---- .../ArrowFileReaderTests.cs | 37 ---------- .../ArrowStreamReaderTests.cs | 26 ------- 16 files changed, 185 insertions(+), 81 deletions(-) create mode 100644 csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj rename csharp/{test/Apache.Arrow.Tests/Compression => src/Apache.Arrow.Compression}/CompressionCodecFactory.cs (84%) rename csharp/{test/Apache.Arrow.Tests/Compression => src/Apache.Arrow.Compression}/Lz4CompressionCodec.cs (97%) rename csharp/{test/Apache.Arrow.Tests/Compression => src/Apache.Arrow.Compression}/ZstdCompressionCodec.cs (97%) create mode 100644 csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj create mode 100644 csharp/test/Apache.Arrow.Compression.Tests/ArrowFileReaderTests.cs create mode 100644 csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs rename csharp/test/{Apache.Arrow.Tests => Apache.Arrow.Compression.Tests}/Resources/ipc_lz4_compression.arrow (100%) rename csharp/test/{Apache.Arrow.Tests => Apache.Arrow.Compression.Tests}/Resources/ipc_lz4_compression.arrow_stream (100%) rename csharp/test/{Apache.Arrow.Tests => Apache.Arrow.Compression.Tests}/Resources/ipc_zstd_compression.arrow (100%) rename csharp/test/{Apache.Arrow.Tests => Apache.Arrow.Compression.Tests}/Resources/ipc_zstd_compression.arrow_stream (100%) rename csharp/test/{Apache.Arrow.Tests => Apache.Arrow.Compression.Tests}/generate_resources.py (100%) diff --git a/csharp/Apache.Arrow.sln b/csharp/Apache.Arrow.sln index 873a7f5f17b53..baf4bc6129598 100644 --- a/csharp/Apache.Arrow.sln +++ b/csharp/Apache.Arrow.sln @@ -19,6 +19,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.Flight.AspNetC EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.IntegrationTest", "test\Apache.Arrow.IntegrationTest\Apache.Arrow.IntegrationTest.csproj", "{E8264B7F-B680-4A55-939B-85DB628164BB}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.Compression", "src\Apache.Arrow.Compression\Apache.Arrow.Compression.csproj", "{B62E77D2-D0B0-4C0C-BA78-1C117DE4C299}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.Compression.Tests", "test\Apache.Arrow.Compression.Tests\Apache.Arrow.Compression.Tests.csproj", "{5D7FF380-B7DF-4752-B415-7C08C70C9F06}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -57,6 +61,14 @@ Global {E8264B7F-B680-4A55-939B-85DB628164BB}.Debug|Any CPU.Build.0 = Debug|Any CPU {E8264B7F-B680-4A55-939B-85DB628164BB}.Release|Any CPU.ActiveCfg = Release|Any CPU {E8264B7F-B680-4A55-939B-85DB628164BB}.Release|Any CPU.Build.0 = Release|Any CPU + {B62E77D2-D0B0-4C0C-BA78-1C117DE4C299}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B62E77D2-D0B0-4C0C-BA78-1C117DE4C299}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B62E77D2-D0B0-4C0C-BA78-1C117DE4C299}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B62E77D2-D0B0-4C0C-BA78-1C117DE4C299}.Release|Any CPU.Build.0 = Release|Any CPU + {5D7FF380-B7DF-4752-B415-7C08C70C9F06}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {5D7FF380-B7DF-4752-B415-7C08C70C9F06}.Debug|Any CPU.Build.0 = Debug|Any CPU + {5D7FF380-B7DF-4752-B415-7C08C70C9F06}.Release|Any CPU.ActiveCfg = Release|Any CPU + {5D7FF380-B7DF-4752-B415-7C08C70C9F06}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj new file mode 100644 index 0000000000000..669cf3fef6d06 --- /dev/null +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -0,0 +1,17 @@ + + + + netstandard2.0;netstandard2.1 + + + + + + + + + + + + + diff --git a/csharp/test/Apache.Arrow.Tests/Compression/CompressionCodecFactory.cs b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs similarity index 84% rename from csharp/test/Apache.Arrow.Tests/Compression/CompressionCodecFactory.cs rename to csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs index 8b2e8e4b7177d..f9ba45afa96f5 100644 --- a/csharp/test/Apache.Arrow.Tests/Compression/CompressionCodecFactory.cs +++ b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs @@ -16,9 +16,12 @@ using System; using Apache.Arrow.Ipc; -namespace Apache.Arrow.Tests.Compression +namespace Apache.Arrow.Compression { - internal sealed class CompressionCodecFactory : ICompressionCodecFactory + /// + /// Creates compression codec implementations for decompressing Arrow IPC data + /// + public sealed class CompressionCodecFactory : ICompressionCodecFactory { public ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType) { diff --git a/csharp/test/Apache.Arrow.Tests/Compression/Lz4CompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs similarity index 97% rename from csharp/test/Apache.Arrow.Tests/Compression/Lz4CompressionCodec.cs rename to csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs index c1249b362907d..9c0a2194fa211 100644 --- a/csharp/test/Apache.Arrow.Tests/Compression/Lz4CompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs @@ -18,7 +18,7 @@ using CommunityToolkit.HighPerformance; using K4os.Compression.LZ4.Streams; -namespace Apache.Arrow.Tests.Compression +namespace Apache.Arrow.Compression { internal sealed class Lz4CompressionCodec : ICompressionCodec { diff --git a/csharp/test/Apache.Arrow.Tests/Compression/ZstdCompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs similarity index 97% rename from csharp/test/Apache.Arrow.Tests/Compression/ZstdCompressionCodec.cs rename to csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs index 0993bd489b0ec..92c2e65371612 100644 --- a/csharp/test/Apache.Arrow.Tests/Compression/ZstdCompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs @@ -17,7 +17,7 @@ using Apache.Arrow.Ipc; using ZstdSharp; -namespace Apache.Arrow.Tests.Compression +namespace Apache.Arrow.Compression { internal sealed class ZstdCompressionCodec : ICompressionCodec { diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj new file mode 100644 index 0000000000000..f7e36be4e4189 --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -0,0 +1,32 @@ + + + + net6.0 + + false + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileReaderTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileReaderTests.cs new file mode 100644 index 0000000000000..667f25f802d3e --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileReaderTests.cs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Ipc; +using System; +using System.Reflection; +using Xunit; + +namespace Apache.Arrow.Compression.Tests +{ + public class ArrowFileReaderTests + { + [Theory] + [InlineData("ipc_lz4_compression.arrow")] + [InlineData("ipc_zstd_compression.arrow")] + public void CanReadCompressedIpcFile(string fileName) + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream($"Apache.Arrow.Compression.Tests.Resources.{fileName}"); + Assert.NotNull(stream); + var codecFactory = new CompressionCodecFactory(); + using var reader = new ArrowFileReader(stream, codecFactory); + + var batch = reader.ReadNextRecordBatch(); + + var intArray = (Int32Array) batch.Column("integers"); + var floatArray = (FloatArray) batch.Column("floats"); + + const int numRows = 100; + Assert.Equal(numRows, intArray.Length); + Assert.Equal(numRows, floatArray.Length); + + for (var i = 0; i < numRows; ++i) + { + Assert.Equal(i, intArray.GetValue(i)); + Assert.True(Math.Abs(floatArray.GetValue(i).Value - 0.1f * i) < 1.0e-6); + } + } + + [Fact] + public void ErrorReadingCompressedFileWithoutCodecFactory() + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream("Apache.Arrow.Compression.Tests.Resources.ipc_lz4_compression.arrow"); + Assert.NotNull(stream); + using var reader = new ArrowFileReader(stream); + + var exception = Assert.Throws(() => reader.ReadNextRecordBatch()); + Assert.Contains("no ICompressionCodecFactory has been configured", exception.Message); + } + } +} diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs new file mode 100644 index 0000000000000..1b22f0e7adf83 --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Ipc; +using System; +using System.Reflection; +using Xunit; + +namespace Apache.Arrow.Compression.Tests +{ + public class ArrowStreamReaderTests + { + [Theory] + [InlineData("ipc_lz4_compression.arrow_stream")] + [InlineData("ipc_zstd_compression.arrow_stream")] + public void CanReadCompressedIpcStream(string fileName) + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream($"Apache.Arrow.Compression.Tests.Resources.{fileName}"); + Assert.NotNull(stream); + var codecFactory = new CompressionCodecFactory(); + using var reader = new ArrowStreamReader(stream, codecFactory); + + var batch = reader.ReadNextRecordBatch(); + + var intArray = (Int32Array) batch.Column("integers"); + var floatArray = (FloatArray) batch.Column("floats"); + + const int numRows = 100; + Assert.Equal(numRows, intArray.Length); + Assert.Equal(numRows, floatArray.Length); + + for (var i = 0; i < numRows; ++i) + { + Assert.Equal(i, intArray.GetValue(i)); + Assert.True(Math.Abs(floatArray.GetValue(i).Value - 0.1f * i) < 1.0e-6); + } + } + } +} + diff --git a/csharp/test/Apache.Arrow.Tests/Resources/ipc_lz4_compression.arrow b/csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_lz4_compression.arrow similarity index 100% rename from csharp/test/Apache.Arrow.Tests/Resources/ipc_lz4_compression.arrow rename to csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_lz4_compression.arrow diff --git a/csharp/test/Apache.Arrow.Tests/Resources/ipc_lz4_compression.arrow_stream b/csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_lz4_compression.arrow_stream similarity index 100% rename from csharp/test/Apache.Arrow.Tests/Resources/ipc_lz4_compression.arrow_stream rename to csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_lz4_compression.arrow_stream diff --git a/csharp/test/Apache.Arrow.Tests/Resources/ipc_zstd_compression.arrow b/csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_zstd_compression.arrow similarity index 100% rename from csharp/test/Apache.Arrow.Tests/Resources/ipc_zstd_compression.arrow rename to csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_zstd_compression.arrow diff --git a/csharp/test/Apache.Arrow.Tests/Resources/ipc_zstd_compression.arrow_stream b/csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_zstd_compression.arrow_stream similarity index 100% rename from csharp/test/Apache.Arrow.Tests/Resources/ipc_zstd_compression.arrow_stream rename to csharp/test/Apache.Arrow.Compression.Tests/Resources/ipc_zstd_compression.arrow_stream diff --git a/csharp/test/Apache.Arrow.Tests/generate_resources.py b/csharp/test/Apache.Arrow.Compression.Tests/generate_resources.py similarity index 100% rename from csharp/test/Apache.Arrow.Tests/generate_resources.py rename to csharp/test/Apache.Arrow.Compression.Tests/generate_resources.py diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index f41261767c0ed..b0de6df148ba6 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -13,24 +13,10 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - - - - - - - \ No newline at end of file diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileReaderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowFileReaderTests.cs index 55f6d9904f2b1..2f2229ded4c46 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowFileReaderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowFileReaderTests.cs @@ -168,42 +168,5 @@ public void TestRecordBatchBasics() recordBatch.Dispose(); } - - [Theory] - [InlineData("ipc_lz4_compression.arrow")] - [InlineData("ipc_zstd_compression.arrow")] - public void CanReadCompressedIpcFile(string fileName) - { - var assembly = Assembly.GetExecutingAssembly(); - using var stream = assembly.GetManifestResourceStream($"Apache.Arrow.Tests.Resources.{fileName}"); - var codecFactory = new Compression.CompressionCodecFactory(); - using var reader = new ArrowFileReader(stream, codecFactory); - - var batch = reader.ReadNextRecordBatch(); - - var intArray = (Int32Array) batch.Column("integers"); - var floatArray = (FloatArray) batch.Column("floats"); - - const int numRows = 100; - Assert.Equal(numRows, intArray.Length); - Assert.Equal(numRows, floatArray.Length); - - for (var i = 0; i < numRows; ++i) - { - Assert.Equal(i, intArray.GetValue(i)); - Assert.True(Math.Abs(floatArray.GetValue(i).Value - 0.1f * i) < 1.0e-6); - } - } - - [Fact] - public void ErrorReadingCompressedFileWithoutCodecFactory() - { - var assembly = Assembly.GetExecutingAssembly(); - using var stream = assembly.GetManifestResourceStream("Apache.Arrow.Tests.Resources.ipc_lz4_compression.arrow"); - using var reader = new ArrowFileReader(stream); - - var exception = Assert.Throws(() => reader.ReadNextRecordBatch()); - Assert.Contains("no ICompressionCodecFactory has been configured", exception.Message); - } } } diff --git a/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs index 1eadcd00fa80e..0e8c9d6687a02 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs @@ -195,32 +195,6 @@ public async Task ReadRecordBatchAsync_PartialReadStream(bool createDictionaryAr await TestReaderFromPartialReadStream(ArrowReaderVerifier.VerifyReaderAsync, createDictionaryArray); } - [Theory] - [InlineData("ipc_lz4_compression.arrow_stream")] - [InlineData("ipc_zstd_compression.arrow_stream")] - public void CanReadCompressedIpcStream(string fileName) - { - var assembly = Assembly.GetExecutingAssembly(); - using var stream = assembly.GetManifestResourceStream($"Apache.Arrow.Tests.Resources.{fileName}"); - var codecFactory = new Compression.CompressionCodecFactory(); - using var reader = new ArrowStreamReader(stream, codecFactory); - - var batch = reader.ReadNextRecordBatch(); - - var intArray = (Int32Array) batch.Column("integers"); - var floatArray = (FloatArray) batch.Column("floats"); - - const int numRows = 100; - Assert.Equal(numRows, intArray.Length); - Assert.Equal(numRows, floatArray.Length); - - for (var i = 0; i < numRows; ++i) - { - Assert.Equal(i, intArray.GetValue(i)); - Assert.True(Math.Abs(floatArray.GetValue(i).Value - 0.1f * i) < 1.0e-6); - } - } - /// /// Verifies that the stream reader reads multiple times when a stream /// only returns a subset of the data from each Read. From 0621d6a1338f97473c22b2838228e2611edac2f7 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 27 Jan 2023 10:12:32 +1300 Subject: [PATCH 02/10] Configure release scripts for new Apache.Arrow.Compression package --- dev/release/post-06-csharp.sh | 1 + dev/release/rat_exclude_files.txt | 2 ++ dev/tasks/tasks.yml | 2 ++ 3 files changed, 5 insertions(+) diff --git a/dev/release/post-06-csharp.sh b/dev/release/post-06-csharp.sh index d2968a5d524de..8c86b36774887 100755 --- a/dev/release/post-06-csharp.sh +++ b/dev/release/post-06-csharp.sh @@ -39,6 +39,7 @@ base_names=() base_names+=(Apache.Arrow.${version}) base_names+=(Apache.Arrow.Flight.${version}) base_names+=(Apache.Arrow.Flight.AspNetCore.${version}) +base_names+=(Apache.Arrow.Compression.${version}) for base_name in ${base_names[@]}; do for extension in nupkg snupkg; do path=${base_name}.${extension} diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 5b7428ef8b82a..b8594e3543bc7 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -181,11 +181,13 @@ csharp/src/Apache.Arrow/Properties/Resources.Designer.cs csharp/src/Apache.Arrow/Properties/Resources.resx csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj +csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj csharp/test/Apache.Arrow.Tests/app.config *.html *.sgml diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 8459fa381f293..18a88ea53e6ba 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -976,6 +976,8 @@ tasks: - Apache.Arrow.Flight.AspNetCore.{no_rc_version}.snupkg - Apache.Arrow.Flight.{no_rc_version}.nupkg - Apache.Arrow.Flight.{no_rc_version}.snupkg + - Apache.Arrow.Compression.{no_rc_version}.nupkg + - Apache.Arrow.Compression.{no_rc_version}.snupkg - Apache.Arrow.{no_rc_version}.nupkg - Apache.Arrow.{no_rc_version}.snupkg From 72f1c16f0e28a012ba2bb7cc57842a7ae39077d5 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 27 Jan 2023 10:24:40 +1300 Subject: [PATCH 03/10] Documentation update --- csharp/README.md | 10 ++++++++++ docs/source/status.rst | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/csharp/README.md b/csharp/README.md index 3d0681279a324..708c7535c9cdb 100644 --- a/csharp/README.md +++ b/csharp/README.md @@ -107,6 +107,16 @@ for currently available features. - File - Stream +## IPC Format + +### Compression + +- Buffer compression is not supported when writing IPC files or streams +- Buffer decompression is supported, but requires installing the `Apache.Arrow.Compression` package, + and passing an `Apache.Arrow.Compression.CompressionCodecFactory` instance to the + `ArrowFileReader` or `ArrowStreamReader` constructor. + Alternatively, a custom implementation of `ICompressionCodecFactory` can be used. + ## Not Implemented - Serialization diff --git a/docs/source/status.rst b/docs/source/status.rst index fc63787225596..7e77e8e79fbeb 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -128,7 +128,7 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Sparse tensors | ✓ | | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ -| Buffer compression | ✓ | ✓ (3) | ✓ | | | ✓ | ✓ | +| Buffer compression | ✓ | ✓ (3) | ✓ | | ✓ (4) | ✓ | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Endianness conversion | ✓ (2) | | ✓ (2) | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ @@ -143,6 +143,8 @@ Notes: * \(3) LZ4 Codec currently is quite inefficient. ARROW-11901 tracks improving performance. +* \(4) Compression when writing is not supported, only decompression when reading. + .. seealso:: The :ref:`format-ipc` specification. From 6b5145e38f32868437b1cbb71479d9138c4aa672 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 27 Jan 2023 11:18:05 +1300 Subject: [PATCH 04/10] Add package description --- .../src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 669cf3fef6d06..69b50e8e13970 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -2,6 +2,7 @@ netstandard2.0;netstandard2.1 + Provides decompression support for the Arrow IPC format From 16e944f56d0f1ad726b83df2e92964a7e388a0a6 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 30 Jan 2023 13:49:06 +1300 Subject: [PATCH 05/10] Add test for reading compressed stream without a compression codec factory --- .../ArrowStreamReaderTests.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs index 1b22f0e7adf83..1be429a63b6a6 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamReaderTests.cs @@ -48,6 +48,18 @@ public void CanReadCompressedIpcStream(string fileName) Assert.True(Math.Abs(floatArray.GetValue(i).Value - 0.1f * i) < 1.0e-6); } } + + [Fact] + public void ErrorReadingCompressedStreamWithoutCodecFactory() + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream("Apache.Arrow.Compression.Tests.Resources.ipc_lz4_compression.arrow_stream"); + Assert.NotNull(stream); + using var reader = new ArrowStreamReader(stream); + + var exception = Assert.Throws(() => reader.ReadNextRecordBatch()); + Assert.Contains("no ICompressionCodecFactory has been configured", exception.Message); + } } } From 177dc43217ec136aa165821b05fd5d0e52cbe5a4 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 27 Feb 2023 09:55:47 +1300 Subject: [PATCH 06/10] Target netstandard2.0 only Co-authored-by: Eric Erhardt --- .../Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 69b50e8e13970..d5a268e6070b7 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -1,7 +1,7 @@ - netstandard2.0;netstandard2.1 + netstandard2.0 Provides decompression support for the Arrow IPC format From 77f5feb8746a3855f330d46662d7f96e102e4e9f Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 27 Feb 2023 10:32:43 +1300 Subject: [PATCH 07/10] Use singletons for NoOpBufferCreator and LZ4CompressionCodec --- .../src/Apache.Arrow.Compression/CompressionCodecFactory.cs | 2 +- csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs | 5 +++++ csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs | 2 +- csharp/src/Apache.Arrow/Ipc/NoOpBufferCreator.cs | 5 +++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs index f9ba45afa96f5..3e0a537a89a8f 100644 --- a/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs +++ b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs @@ -27,7 +27,7 @@ public ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType) { return compressionCodecType switch { - CompressionCodecType.Lz4Frame => new Lz4CompressionCodec(), + CompressionCodecType.Lz4Frame => Lz4CompressionCodec.Instance, CompressionCodecType.Zstd => new ZstdCompressionCodec(), _ => throw new NotImplementedException($"Compression type {compressionCodecType} is not supported") }; diff --git a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs index 9c0a2194fa211..3c634ca7d45c5 100644 --- a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs @@ -22,6 +22,11 @@ namespace Apache.Arrow.Compression { internal sealed class Lz4CompressionCodec : ICompressionCodec { + /// + /// Singleton instance, used as this class doesn't need to be disposed and has no state + /// + public static readonly Lz4CompressionCodec Instance = new Lz4CompressionCodec(); + public int Decompress(ReadOnlyMemory source, Memory destination) { using var sourceStream = source.AsStream(); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index 4f87d88f612f1..a1c1430124013 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -200,7 +200,7 @@ private IBufferCreator GetBufferCreator(BodyCompression? compression) { if (!compression.HasValue) { - return new NoOpBufferCreator(); + return NoOpBufferCreator.Instance; } var method = compression.Value.Method; diff --git a/csharp/src/Apache.Arrow/Ipc/NoOpBufferCreator.cs b/csharp/src/Apache.Arrow/Ipc/NoOpBufferCreator.cs index 8681aea4c9cf4..13c22c7fdbcba 100644 --- a/csharp/src/Apache.Arrow/Ipc/NoOpBufferCreator.cs +++ b/csharp/src/Apache.Arrow/Ipc/NoOpBufferCreator.cs @@ -22,6 +22,11 @@ namespace Apache.Arrow.Ipc /// internal sealed class NoOpBufferCreator : IBufferCreator { + /// + /// Singleton instance, used as this class doesn't need to be disposed and has no state + /// + public static readonly NoOpBufferCreator Instance = new NoOpBufferCreator(); + public ArrowBuffer CreateBuffer(ReadOnlyMemory source) { return new ArrowBuffer(source); From a80c8558276fb7fe9665871d762bf28ffe6de9eb Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 27 Feb 2023 15:52:47 +1300 Subject: [PATCH 08/10] Use LZ4Frame API to avoid wrapping memory as streams --- .../Apache.Arrow.Compression.csproj | 1 - .../src/Apache.Arrow.Compression/Lz4CompressionCodec.cs | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index d5a268e6070b7..7795d24778985 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -6,7 +6,6 @@ - diff --git a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs index 3c634ca7d45c5..ebbcfbc3e095f 100644 --- a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs @@ -15,7 +15,6 @@ using System; using Apache.Arrow.Ipc; -using CommunityToolkit.HighPerformance; using K4os.Compression.LZ4.Streams; namespace Apache.Arrow.Compression @@ -29,11 +28,8 @@ internal sealed class Lz4CompressionCodec : ICompressionCodec public int Decompress(ReadOnlyMemory source, Memory destination) { - using var sourceStream = source.AsStream(); - using var destStream = destination.AsStream(); - using var decompressedStream = LZ4Stream.Decode(sourceStream); - decompressedStream.CopyTo(destStream); - return (int) destStream.Length; + using var decoder = LZ4Frame.Decode(source); + return decoder.ReadManyBytes(destination.Span); } public void Dispose() From fccfbe0630db8a7b10d11c3d22b60b0c25f2d110 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 28 Feb 2023 08:39:48 +1300 Subject: [PATCH 09/10] Remove unused coverlet.collector package reference Co-authored-by: Eric Erhardt --- .../Apache.Arrow.Compression.Tests.csproj | 1 - 1 file changed, 1 deletion(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 92cbb86a3c9d3..9ff4f235b83a5 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -10,7 +10,6 @@ - From 67987216b576e3442b34e991327129b37428f2c4 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 28 Feb 2023 08:42:54 +1300 Subject: [PATCH 10/10] Remove unnecessary "None Remove" items from csproj Co-authored-by: Eric Erhardt --- .../Apache.Arrow.Compression.Tests.csproj | 4 ---- 1 file changed, 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 9ff4f235b83a5..7c1f3a77a4c18 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -18,13 +18,9 @@ - - - -