diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index fc8d0bad58e9f..5451cbe064b59 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -108,7 +108,8 @@ jobs: restore-keys: ${{ matrix.image }}- - name: Setup Python run: | - sudo apt install -y --no-install-recommends python3 python3-pip + sudo apt update + sudo apt install -y --no-install-recommends python3 python3-dev python3-pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 3c695891b48d6..a0dfb9fea1673 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -79,7 +79,8 @@ jobs: submodules: recursive - name: Setup Python run: | - sudo apt install -y --no-install-recommends python3 python3-pip + sudo apt update + sudo apt install -y --no-install-recommends python3 python3-dev python3-pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build @@ -232,7 +233,7 @@ jobs: name: AMD64 Windows 2019 Go ${{ matrix.go }} runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 + timeout-minutes: 25 strategy: fail-fast: false matrix: diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index 4440d36d18f73..41843d663051a 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -73,12 +73,6 @@ jobs: fi echo $PREFIX archery crossbow download-artifacts -f java-jars -t binaries $PREFIX - - name: Cache Repo - uses: actions/cache@v3 - with: - path: repo - key: java-nightly-${{ github.run_id }} - restore-keys: java-nightly - name: Sync from Remote uses: ./arrow/.github/actions/sync-nightlies with: diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index bd08937ae81be..af1bfe9b780f4 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme==0.8 +pydata-sphinx-theme sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index a306790b5cb6d..074021677d6fd 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -29,9 +29,11 @@ ARG go=1.19.13 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ +# Pin Python until pythonnet is made compatible with 3.12 +# (https://github.com/pythonnet/pythonnet/pull/2249) RUN mamba install -q -y \ --file arrow/ci/conda_env_archery.txt \ - "python>=3.7" \ + "python < 3.12" \ numpy \ compilers \ maven=${maven} \ diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index 3c8cc0f4ee2e2..2a38901337c56 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -41,3 +41,22 @@ pushd ${source_dir}/parquet go install -v ./... popd + +if [[ -n "${ARROW_GO_INTEGRATION}" ]]; then + pushd ${source_dir}/arrow/internal/cdata_integration + + case "$(uname)" in + Linux) + go_lib="arrow_go_integration.so" + ;; + Darwin) + go_lib="arrow_go_integration.so" + ;; + MINGW*) + go_lib="arrow_go_integration.dll" + ;; + esac + go build -tags cdata_integration,assert -buildmode=c-shared -o ${go_lib} . 
+ + popd +fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index a165f8027bf8f..289d376a4db9b 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -23,6 +23,11 @@ arrow_dir=${1} gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration pip install -e $arrow_dir/dev/archery[integration] +# For C# C Data Interface testing +pip install pythonnet + +# Get more detailed context on crashes +export PYTHONFAULTHANDLER=1 # Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1 time archery integration \ diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index c97733257a721..d61f74f0b7ca1 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -32,12 +32,14 @@ yarn lint:ci yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then - if [ "$(git config --get remote.origin.url)" == "https://github.com/apache/arrow.git" ]; then - yarn doc - elif [ "$(git config --get remote.upstream.url)" == "https://github.com/apache/arrow.git" ]; then - yarn doc --gitRemote upstream - elif [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + # If apache or upstream are defined use those as remote. + # Otherwise use origin which could be a fork on PRs. + if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache + elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + yarn doc --gitRemote upstream + elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." exit 0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f2906b960eba6..f0acab0389b19 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -152,6 +152,7 @@ set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(ARROW_LLVM_VERSIONS + "17.0" "16.0" "15.0" "14.0" diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake index e980f53fd3407..69f680824b082 100644 --- a/cpp/cmake_modules/FindLLVMAlt.cmake +++ b/cpp/cmake_modules/FindLLVMAlt.cmake @@ -86,16 +86,20 @@ if(LLVM_FOUND) target_link_libraries(LLVM::LLVM_LIBS INTERFACE LLVM) else() # Find the libraries that correspond to the LLVM components - llvm_map_components_to_libnames(LLVM_LIBS - core - mcjit - native - ipo - bitreader - target - linker - analysis - debuginfodwarf) + set(LLVM_TARGET_COMPONENTS + analysis + bitreader + core + debuginfodwarf + ipo + linker + mcjit + native + target) + if(LLVM_VERSION_MAJOR GREATER_EQUAL 14) + list(APPEND LLVM_TARGET_COMPONENTS passes) + endif() + llvm_map_components_to_libnames(LLVM_LIBS ${LLVM_TARGET_COMPONENTS}) target_link_libraries(LLVM::LLVM_LIBS INTERFACE ${LLVM_LIBS}) if(TARGET LLVMSupport AND NOT ARROW_ZSTD_USE_SHARED) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index df60074c78470..d8308c824953a 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -627,6 +627,22 @@ std::shared_ptr StructArray::GetFieldByName(const std::string& name) cons return i == -1 ? 
nullptr : field(i); } +Status StructArray::CanReferenceFieldByName(const std::string& name) const { + if (GetFieldByName(name) == nullptr) { + return Status::Invalid("Field named '", name, + "' not found or not unique in the struct."); + } + return Status::OK(); +} + +Status StructArray::CanReferenceFieldsByNames( + const std::vector<std::string>& names) const { + for (const auto& name : names) { + ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); + } + return Status::OK(); +} + Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const { ArrayVector flattened; flattened.resize(data_->child_data.size()); diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 47c1db039ccc9..8d5cc95fec00d 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -404,6 +404,12 @@ class ARROW_EXPORT StructArray : public Array { /// Returns null if name not found std::shared_ptr<Array> GetFieldByName(const std::string& name) const; + /// Indicate if field named `name` can be found unambiguously in the struct. + Status CanReferenceFieldByName(const std::string& name) const; + + /// Indicate if fields named `names` can be found unambiguously in the struct. + Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const; + /// \brief Flatten this array as a vector of arrays, one for each field /// /// \param[in] pool The pool to allocate null bitmaps from, if necessary diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index 318c83860e009..73d53a7efa59b 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -303,6 +303,58 @@ TEST(StructArray, FlattenOfSlice) { ASSERT_OK(arr->ValidateFull()); } +TEST(StructArray, CanReferenceFieldByName) { + auto a = ArrayFromJSON(int8(), "[4, 5]"); + auto b = ArrayFromJSON(int16(), "[6, 7]"); + auto c = ArrayFromJSON(int32(), "[8, 9]"); + auto d = ArrayFromJSON(int64(), "[10, 11]"); + auto children = std::vector<std::shared_ptr<Array>>{a, b, c, d}; + + auto f0 = field("f0", int8()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", int32()); + auto f3 = field("f1", int64()); + auto type = struct_({f0, f1, f2, f3}); + + auto arr = std::make_shared<StructArray>(type, 2, children); + + ASSERT_OK(arr->CanReferenceFieldByName("f0")); + ASSERT_OK(arr->CanReferenceFieldByName("f2")); + // Not found + ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("nope")); + + // Duplicates + ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("f1")); +} + +TEST(StructArray, CanReferenceFieldsByNames) { + auto a = ArrayFromJSON(int8(), "[4, 5]"); + auto b = ArrayFromJSON(int16(), "[6, 7]"); + auto c = ArrayFromJSON(int32(), "[8, 9]"); + auto d = ArrayFromJSON(int64(), "[10, 11]"); + auto children = std::vector<std::shared_ptr<Array>>{a, b, c, d}; + + auto f0 = field("f0", int8()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", int32()); + auto f3 = field("f1", int64()); + auto type = struct_({f0, f1, f2, f3}); + + auto arr = std::make_shared<StructArray>(type, 2, children); + + ASSERT_OK(arr->CanReferenceFieldsByNames({"f0", "f2"})); + ASSERT_OK(arr->CanReferenceFieldsByNames({"f2", "f0"})); + + // Not found + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"nope"})); + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "nope"})); + // Duplicates + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f1"})); + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1"})); + // Both + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1", "nope"})); +} + //
---------------------------------------------------------------------------------- // Struct test class TestStructBuilder : public ::testing::Test { diff --git a/cpp/src/arrow/arrow-config.cmake b/cpp/src/arrow/arrow-config.cmake index 8c9173c1710cb..c18c9eff37279 100644 --- a/cpp/src/arrow/arrow-config.cmake +++ b/cpp/src/arrow/arrow-config.cmake @@ -19,8 +19,7 @@ message(WARNING "find_package(arrow) is deprecated. Use find_package(Arrow) inst find_package(Arrow CONFIG) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(arrow - REQUIRED_VARS - ARROW_INCLUDE_DIR - VERSION_VAR - ARROW_VERSION) +find_package_handle_standard_args( + arrow + REQUIRED_VARS ARROW_INCLUDE_DIR + VERSION_VAR ARROW_VERSION) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 8e2669bd3dfb9..00a833742f957 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -251,6 +251,7 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) { bool saw_date32 = false; bool saw_date64 = false; bool saw_duration = false; + bool saw_time_since_midnight = false; const TypeHolder* end = begin + count; for (auto it = begin; it != end; it++) { auto id = it->type->id(); @@ -271,6 +272,18 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) { finest_unit = std::max(finest_unit, ty.unit()); continue; } + case Type::TIME32: { + const auto& type = checked_cast(*it->type); + finest_unit = std::max(finest_unit, type.unit()); + saw_time_since_midnight = true; + continue; + } + case Type::TIME64: { + const auto& type = checked_cast(*it->type); + finest_unit = std::max(finest_unit, type.unit()); + saw_time_since_midnight = true; + continue; + } case Type::DURATION: { const auto& ty = checked_cast(*it->type); finest_unit = std::max(finest_unit, ty.unit()); @@ -282,15 +295,33 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) { } } - if (timezone) { - // At least one timestamp seen - return timestamp(finest_unit, *timezone); - } else if (saw_date64) { - return date64(); - } else if (saw_date32) { - return date32(); - } else if (saw_duration) { - return duration(finest_unit); + bool saw_timestamp_or_date = timezone || saw_date64 || saw_date32 || saw_duration; + + if (saw_time_since_midnight && saw_timestamp_or_date) { + // Cannot find common type + return TypeHolder(nullptr); + } + if (saw_timestamp_or_date) { + if (timezone) { + // At least one timestamp seen + return timestamp(finest_unit, *timezone); + } else if (saw_date64) { + return date64(); + } else if (saw_date32) { + return date32(); + } else if (saw_duration) { + return duration(finest_unit); + } + } + if (saw_time_since_midnight) { + switch (finest_unit) { + case TimeUnit::SECOND: + case TimeUnit::MILLI: + return time32(finest_unit); + case TimeUnit::MICRO: + case TimeUnit::NANO: + return time64(finest_unit); + } } return TypeHolder(nullptr); } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc index af024fb8d6e08..6bb5568d2ff38 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc @@ -159,6 +159,18 @@ TEST(TestDispatchBest, CommonTemporal) { args = {timestamp(TimeUnit::SECOND, "America/Phoenix"), timestamp(TimeUnit::SECOND, "UTC")}; ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr); + + args = {time32(TimeUnit::SECOND), 
time32(TimeUnit::MILLI)}; + AssertTypeEqual(*time32(TimeUnit::MILLI), *CommonTemporal(args.data(), args.size())); + + args = {time32(TimeUnit::SECOND), time64(TimeUnit::NANO)}; + AssertTypeEqual(*time64(TimeUnit::NANO), *CommonTemporal(args.data(), args.size())); + + args = {date32(), time32(TimeUnit::SECOND)}; + ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr); + + args = {timestamp(TimeUnit::SECOND), time32(TimeUnit::SECOND)}; + ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr); } TEST(TestDispatchBest, CommonTemporalResolution) { diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 2fcd57d2f3622..6a97b51cf2815 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -81,6 +81,20 @@ Result<std::shared_ptr<io::RandomAccessFile>> FileSource::Open() const { return custom_open_(); } +Future<std::shared_ptr<io::RandomAccessFile>> FileSource::OpenAsync() const { + if (filesystem_) { + return filesystem_->OpenInputFileAsync(file_info_); + } + + if (buffer_) { + return Future<std::shared_ptr<io::RandomAccessFile>>::MakeFinished( + std::make_shared<io::BufferReader>(buffer_)); + } + + // TODO(GH-37962): custom_open_ should not block + return Future<std::shared_ptr<io::RandomAccessFile>>::MakeFinished(custom_open_()); +} + int64_t FileSource::Size() const { if (filesystem_) { return file_info_.size(); diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index d33d88e9966fe..46fc8ebc40db0 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -115,6 +115,7 @@ class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> { /// \brief Get a RandomAccessFile which views this file source Result<std::shared_ptr<io::RandomAccessFile>> Open() const; + Future<std::shared_ptr<io::RandomAccessFile>> OpenAsync() const; /// \brief Get the size (in bytes) of the file or buffer /// If the file is compressed this should be the compressed (on-disk) size.
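
The OpenAsync() addition above mirrors the synchronous Open() but yields a Future, letting a scan start I/O without tying up a thread. A minimal usage sketch (editor's illustration, not part of the patch; the `fs` filesystem handle and the callback body are assumptions):

    // Assumes an existing std::shared_ptr<arrow::fs::FileSystem> fs that can
    // resolve "data.parquet". The continuation runs only once the handle is
    // ready; nothing blocks while waiting.
    arrow::dataset::FileSource source("data.parquet", fs);
    arrow::Future<int64_t> size_fut = source.OpenAsync().Then(
        [](const std::shared_ptr<arrow::io::RandomAccessFile>& file)
            -> arrow::Result<int64_t> { return file->GetSize(); });
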
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 751937e93b937..3cad1ddd8321f 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -479,29 +479,35 @@ Future> ParquetFileFormat::GetReader default_fragment_scan_options)); auto properties = MakeReaderProperties(*this, parquet_scan_options.get(), options->pool); - ARROW_ASSIGN_OR_RAISE(auto input, source.Open()); - // TODO(ARROW-12259): workaround since we have Future<(move-only type)> - auto reader_fut = parquet::ParquetFileReader::OpenAsync( - std::move(input), std::move(properties), metadata); - auto path = source.path(); + auto self = checked_pointer_cast(shared_from_this()); - return reader_fut.Then( - [=](const std::unique_ptr&) mutable - -> Result> { - ARROW_ASSIGN_OR_RAISE(std::unique_ptr reader, - reader_fut.MoveResult()); - std::shared_ptr metadata = reader->metadata(); - auto arrow_properties = - MakeArrowReaderProperties(*this, *metadata, *options, *parquet_scan_options); - std::unique_ptr arrow_reader; - RETURN_NOT_OK(parquet::arrow::FileReader::Make(options->pool, std::move(reader), - std::move(arrow_properties), - &arrow_reader)); - return std::move(arrow_reader); - }, - [path]( - const Status& status) -> Result> { - return WrapSourceError(status, path); + + return source.OpenAsync().Then( + [=](const std::shared_ptr& input) mutable { + return parquet::ParquetFileReader::OpenAsync(input, std::move(properties), + metadata) + .Then( + [=](const std::unique_ptr& reader) mutable + -> Result> { + auto arrow_properties = MakeArrowReaderProperties( + *self, *reader->metadata(), *options, *parquet_scan_options); + + std::unique_ptr arrow_reader; + RETURN_NOT_OK(parquet::arrow::FileReader::Make( + options->pool, + // TODO(ARROW-12259): workaround since we have Future<(move-only + // type)> It *wouldn't* be safe to const_cast reader except that + // here we know there are no other waiters on the reader. + std::move(const_cast&>( + reader)), + std::move(arrow_properties), &arrow_reader)); + + return std::move(arrow_reader); + }, + [path = source.path()](const Status& status) + -> Result> { + return WrapSourceError(status, path); + }); }); } diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 177ca824179a8..dc9e085df3c4c 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -730,6 +730,31 @@ TEST_P(TestParquetFileFormatScan, PredicatePushdownRowGroupFragmentsUsingDuratio CountRowGroupsInFragment(fragment, {0}, expr); } +TEST_P(TestParquetFileFormatScan, + PredicatePushdownRowGroupFragmentsUsingTimestampColumn) { + // GH-37799: Parquet arrow will change TimeUnit::SECOND to TimeUnit::MILLI + // because parquet LogicalType doesn't support SECOND. 
+ for (auto time_unit : {TimeUnit::MILLI, TimeUnit::SECOND}) { + auto table = TableFromJSON(schema({field("t", time32(time_unit))}), + { + R"([{"t": 1}])", + R"([{"t": 2}, {"t": 3}])", + }); + TableBatchReader table_reader(*table); + ARROW_SCOPED_TRACE("time_unit=", time_unit); + ASSERT_OK_AND_ASSIGN( + auto source, + ParquetFormatHelper::Write( + &table_reader, ArrowWriterProperties::Builder().store_schema()->build()) + .As()); + SetSchema({field("t", time32(time_unit))}); + ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(source)); + + auto expr = equal(field_ref("t"), literal(::arrow::Time32Scalar(1, time_unit))); + CountRowGroupsInFragment(fragment, {0}, expr); + } +} + // Tests projection with nested/indexed FieldRefs. // https://github.com/apache/arrow/issues/35579 TEST_P(TestParquetFileFormatScan, ProjectWithNonNamedFieldRefs) { diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index b32036a2c973f..5feb310fc14a2 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -191,33 +191,33 @@ TEST(FlightTypes, FlightEndpoint) { Timestamp expiration_time( std::chrono::duration_cast(expiration_time_duration)); std::vector values = { - {{""}, {}, std::nullopt}, - {{"foo"}, {}, std::nullopt}, - {{"bar"}, {}, std::nullopt}, - {{"foo"}, {}, expiration_time}, - {{"foo"}, {location1}, std::nullopt}, - {{"bar"}, {location1}, std::nullopt}, - {{"foo"}, {location2}, std::nullopt}, - {{"foo"}, {location1, location2}, std::nullopt}, + {{""}, {}, std::nullopt, {}}, + {{"foo"}, {}, std::nullopt, {}}, + {{"bar"}, {}, std::nullopt, {"\xDE\xAD\xBE\xEF"}}, + {{"foo"}, {}, expiration_time, {}}, + {{"foo"}, {location1}, std::nullopt, {}}, + {{"bar"}, {location1}, std::nullopt, {}}, + {{"foo"}, {location2}, std::nullopt, {}}, + {{"foo"}, {location1, location2}, std::nullopt, {"\xba\xdd\xca\xfe"}}, }; std::vector reprs = { " locations=[] " - "expiration_time=null>", + "expiration_time=null app_metadata=''>", " locations=[] " - "expiration_time=null>", + "expiration_time=null app_metadata=''>", " locations=[] " - "expiration_time=null>", + "expiration_time=null app_metadata='DEADBEEF'>", " locations=[] " - "expiration_time=2023-06-19 03:14:06.004339000>", + "expiration_time=2023-06-19 03:14:06.004339000 app_metadata=''>", " locations=" - "[grpc+tcp://localhost:1024] expiration_time=null>", + "[grpc+tcp://localhost:1024] expiration_time=null app_metadata=''>", " locations=" - "[grpc+tcp://localhost:1024] expiration_time=null>", + "[grpc+tcp://localhost:1024] expiration_time=null app_metadata=''>", " locations=" - "[grpc+tls://localhost:1024] expiration_time=null>", + "[grpc+tls://localhost:1024] expiration_time=null app_metadata=''>", " locations=" "[grpc+tcp://localhost:1024, grpc+tls://localhost:1024] " - "expiration_time=null>", + "expiration_time=null app_metadata='BADDCAFE'>", }; ASSERT_NO_FATAL_FAILURE(TestRoundtrip(values, reprs)); @@ -229,30 +229,35 @@ TEST(FlightTypes, FlightInfo) { Schema schema2({}); auto desc1 = FlightDescriptor::Command("foo"); auto desc2 = FlightDescriptor::Command("bar"); - auto endpoint1 = FlightEndpoint{Ticket{"foo"}, {}, std::nullopt}; - auto endpoint2 = FlightEndpoint{Ticket{"foo"}, {location}, std::nullopt}; + auto endpoint1 = FlightEndpoint{Ticket{"foo"}, {}, std::nullopt, ""}; + auto endpoint2 = + FlightEndpoint{Ticket{"foo"}, {location}, std::nullopt, "\xCA\xFE\xD0\x0D"}; std::vector values = { - MakeFlightInfo(schema1, desc1, {}, -1, -1, false), - 
MakeFlightInfo(schema1, desc2, {}, -1, -1, true), - MakeFlightInfo(schema2, desc1, {}, -1, -1, false), - MakeFlightInfo(schema1, desc1, {endpoint1}, -1, 42, true), - MakeFlightInfo(schema1, desc2, {endpoint1, endpoint2}, 64, -1, false), + MakeFlightInfo(schema1, desc1, {}, -1, -1, false, ""), + MakeFlightInfo(schema1, desc2, {}, -1, -1, true, ""), + MakeFlightInfo(schema2, desc1, {}, -1, -1, false, ""), + MakeFlightInfo(schema1, desc1, {endpoint1}, -1, 42, true, ""), + MakeFlightInfo(schema1, desc2, {endpoint1, endpoint2}, 64, -1, false, + "\xDE\xAD\xC0\xDE"), }; std::vector reprs = { " " - "endpoints=[] total_records=-1 total_bytes=-1 ordered=false>", + "endpoints=[] total_records=-1 total_bytes=-1 ordered=false app_metadata=''>", " " - "endpoints=[] total_records=-1 total_bytes=-1 ordered=true>", + "endpoints=[] total_records=-1 total_bytes=-1 ordered=true app_metadata=''>", " " - "endpoints=[] total_records=-1 total_bytes=-1 ordered=false>", + "endpoints=[] total_records=-1 total_bytes=-1 ordered=false app_metadata=''>", " " "endpoints=[ locations=[] " - "expiration_time=null>] total_records=-1 total_bytes=42 ordered=true>", + "expiration_time=null app_metadata=''>] total_records=-1 total_bytes=42 " + "ordered=true app_metadata=''>", " " "endpoints=[ locations=[] " - "expiration_time=null>, " - "locations=[grpc+tcp://localhost:1234] expiration_time=null>] " - "total_records=64 total_bytes=-1 ordered=false>", + "expiration_time=null app_metadata=''>, " + "locations=[grpc+tcp://localhost:1234] expiration_time=null " + "app_metadata='CAFED00D'>] " + "total_records=64 total_bytes=-1 ordered=false app_metadata='DEADC0DE'>", }; ASSERT_NO_FATAL_FAILURE(TestRoundtrip(values, reprs)); @@ -262,8 +267,8 @@ TEST(FlightTypes, PollInfo) { ASSERT_OK_AND_ASSIGN(auto location, Location::ForGrpcTcp("localhost", 1234)); Schema schema({field("ints", int64())}); auto desc = FlightDescriptor::Command("foo"); - auto endpoint = FlightEndpoint{Ticket{"foo"}, {}, std::nullopt}; - auto info = MakeFlightInfo(schema, desc, {endpoint}, -1, 42, true); + auto endpoint = FlightEndpoint{Ticket{"foo"}, {}, std::nullopt, ""}; + auto info = MakeFlightInfo(schema, desc, {endpoint}, -1, 42, true, ""); // 2023-06-19 03:14:06.004330100 // We must use microsecond resolution here for portability. 
// std::chrono::system_clock::time_point may not provide nanosecond diff --git a/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc b/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc index 4a49dea31b99d..67c7ee85f59d3 100644 --- a/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc +++ b/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc @@ -73,6 +73,10 @@ TEST(FlightIntegration, ExpirationTimeRenewFlightEndpoint) { TEST(FlightIntegration, PollFlightInfo) { ASSERT_OK(RunScenario("poll_flight_info")); } +TEST(FlightIntegration, AppMetadataFlightInfoEndpoint) { + ASSERT_OK(RunScenario("app_metadata_flight_info_endpoint")); +} + TEST(FlightIntegration, FlightSql) { ASSERT_OK(RunScenario("flight_sql")); } TEST(FlightIntegration, FlightSqlExtension) { diff --git a/cpp/src/arrow/flight/integration_tests/test_integration.cc b/cpp/src/arrow/flight/integration_tests/test_integration.cc index 03e352ffa6770..31bffd7704474 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration.cc @@ -815,6 +815,64 @@ class PollFlightInfoScenario : public Scenario { } }; +/// \brief The server used for testing app_metadata in FlightInfo and FlightEndpoint +class AppMetadataFlightInfoEndpointServer : public FlightServerBase { + public: + AppMetadataFlightInfoEndpointServer() : FlightServerBase() {} + + Status GetFlightInfo(const ServerCallContext& context, const FlightDescriptor& request, + std::unique_ptr<FlightInfo>* info) override { + if (request.type != FlightDescriptor::CMD) { + return Status::Invalid("request descriptor should be of type CMD"); + } + + auto schema = arrow::schema({arrow::field("number", arrow::uint32(), false)}); + std::vector<FlightEndpoint> endpoints = { + FlightEndpoint{{}, {}, std::nullopt, request.cmd}}; + ARROW_ASSIGN_OR_RAISE(auto result, FlightInfo::Make(*schema, request, endpoints, -1, + -1, false, request.cmd)); + *info = std::make_unique<FlightInfo>(std::move(result)); + return Status::OK(); + } +}; + +/// \brief The AppMetadataFlightInfoEndpoint scenario. +/// +/// This tests that the client can receive and use the `app_metadata` field in +/// the FlightInfo and FlightEndpoint messages. +/// +/// The server only implements GetFlightInfo and will return a FlightInfo with a non- +/// empty app_metadata value that should match the app_metadata field in the +/// included FlightEndpoint. The value should be the same as the cmd bytes passed +/// in the call to GetFlightInfo by the client.
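
Seen from the client, the round trip this scenario exercises looks roughly as follows (an illustrative sketch built on the accessors added in this patch, not code taken from the scenario itself; `client` is assumed to be an already-connected FlightClient):

    ARROW_ASSIGN_OR_RAISE(auto info,
                          client->GetFlightInfo(FlightDescriptor::Command("foobar")));
    // Opaque bytes chosen by the server and round-tripped verbatim.
    const std::string& info_metadata = info->app_metadata();
    for (const FlightEndpoint& endpoint : info->endpoints()) {
      // Per-endpoint metadata, e.g. usable for routing or prioritizing DoGet calls.
      const std::string& endpoint_metadata = endpoint.app_metadata;
    }
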
+class AppMetadataFlightInfoEndpointScenario : public Scenario { + Status MakeServer(std::unique_ptr* server, + FlightServerOptions* options) override { + *server = std::make_unique(); + return Status::OK(); + } + + Status MakeClient(FlightClientOptions* options) override { return Status::OK(); } + + Status RunClient(std::unique_ptr client) override { + ARROW_ASSIGN_OR_RAISE(auto info, + client->GetFlightInfo(FlightDescriptor::Command("foobar"))); + if (info->app_metadata() != "foobar") { + return Status::Invalid("app_metadata should have been 'foobar', got: ", + info->app_metadata()); + } + if (info->endpoints().size() != 1) { + return Status::Invalid("should have gotten exactly one FlightEndpoint back, got: ", + info->endpoints().size()); + } + if (info->endpoints()[0].app_metadata != "foobar") { + return Status::Invalid("FlightEndpoint app_metadata should be 'foobar', got: ", + info->endpoints()[0].app_metadata); + } + return Status::OK(); + } +}; + /// \brief Schema to be returned for mocking the statement/prepared statement results. /// /// Must be the same across all languages. @@ -1897,6 +1955,9 @@ Status GetScenario(const std::string& scenario_name, std::shared_ptr* } else if (scenario_name == "poll_flight_info") { *out = std::make_shared(); return Status::OK(); + } else if (scenario_name == "app_metadata_flight_info_endpoint") { + *out = std::make_shared(); + return Status::OK(); } else if (scenario_name == "flight_sql") { *out = std::make_shared(); return Status::OK(); diff --git a/cpp/src/arrow/flight/perf_server.cc b/cpp/src/arrow/flight/perf_server.cc index a768840c0295d..40f6cbcbf0d82 100644 --- a/cpp/src/arrow/flight/perf_server.cc +++ b/cpp/src/arrow/flight/perf_server.cc @@ -196,7 +196,7 @@ class FlightPerfServer : public FlightServerBase { perf_request.stream_count() * perf_request.records_per_stream(); *info = std::make_unique( - MakeFlightInfo(*perf_schema_, request, endpoints, total_records, -1, false)); + MakeFlightInfo(*perf_schema_, request, endpoints, total_records, -1, false, "")); return Status::OK(); } diff --git a/cpp/src/arrow/flight/serialization_internal.cc b/cpp/src/arrow/flight/serialization_internal.cc index f85b451dccec3..64a40564afd72 100644 --- a/cpp/src/arrow/flight/serialization_internal.cc +++ b/cpp/src/arrow/flight/serialization_internal.cc @@ -177,6 +177,7 @@ Status FromProto(const pb::FlightEndpoint& pb_endpoint, FlightEndpoint* endpoint RETURN_NOT_OK(FromProto(pb_endpoint.expiration_time(), &expiration_time)); endpoint->expiration_time = std::move(expiration_time); } + endpoint->app_metadata = pb_endpoint.app_metadata(); return Status::OK(); } @@ -190,6 +191,7 @@ Status ToProto(const FlightEndpoint& endpoint, pb::FlightEndpoint* pb_endpoint) RETURN_NOT_OK(ToProto(endpoint.expiration_time.value(), pb_endpoint->mutable_expiration_time())); } + pb_endpoint->set_app_metadata(endpoint.app_metadata); return Status::OK(); } @@ -255,6 +257,7 @@ arrow::Result FromProto(const pb::FlightInfo& pb_info) { info.total_records = pb_info.total_records(); info.total_bytes = pb_info.total_bytes(); info.ordered = pb_info.ordered(); + info.app_metadata = pb_info.app_metadata(); return FlightInfo(std::move(info)); } @@ -296,6 +299,7 @@ Status ToProto(const FlightInfo& info, pb::FlightInfo* pb_info) { pb_info->set_total_records(info.total_records()); pb_info->set_total_bytes(info.total_bytes()); pb_info->set_ordered(info.ordered()); + pb_info->set_app_metadata(info.app_metadata()); return Status::OK(); } diff --git a/cpp/src/arrow/flight/sql/server.cc 
b/cpp/src/arrow/flight/sql/server.cc index 7a1c394fb413b..a6d197d15b2c0 100644 --- a/cpp/src/arrow/flight/sql/server.cc +++ b/cpp/src/arrow/flight/sql/server.cc @@ -930,7 +930,7 @@ arrow::Result> FlightSqlServerBase::GetFlightInfoSql } std::vector endpoints{ - FlightEndpoint{{descriptor.cmd}, {}, std::nullopt}}; + FlightEndpoint{{descriptor.cmd}, {}, std::nullopt, {}}}; ARROW_ASSIGN_OR_RAISE( auto result, FlightInfo::Make(*SqlSchema::GetSqlInfoSchema(), descriptor, endpoints, -1, -1, false)) diff --git a/cpp/src/arrow/flight/test_util.cc b/cpp/src/arrow/flight/test_util.cc index fed2b59abf56d..bf2f4c2b4effc 100644 --- a/cpp/src/arrow/flight/test_util.cc +++ b/cpp/src/arrow/flight/test_util.cc @@ -531,9 +531,11 @@ std::unique_ptr ExampleTestServer() { FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descriptor, const std::vector& endpoints, - int64_t total_records, int64_t total_bytes, bool ordered) { - EXPECT_OK_AND_ASSIGN(auto info, FlightInfo::Make(schema, descriptor, endpoints, - total_records, total_bytes, ordered)); + int64_t total_records, int64_t total_bytes, bool ordered, + std::string app_metadata) { + EXPECT_OK_AND_ASSIGN(auto info, + FlightInfo::Make(schema, descriptor, endpoints, total_records, + total_bytes, ordered, std::move(app_metadata))); return info; } @@ -602,11 +604,11 @@ std::vector ExampleFlightInfo() { Location location4 = *Location::ForGrpcTcp("foo4.bar.com", 12345); Location location5 = *Location::ForGrpcTcp("foo5.bar.com", 12345); - FlightEndpoint endpoint1({{"ticket-ints-1"}, {location1}, std::nullopt}); - FlightEndpoint endpoint2({{"ticket-ints-2"}, {location2}, std::nullopt}); - FlightEndpoint endpoint3({{"ticket-cmd"}, {location3}, std::nullopt}); - FlightEndpoint endpoint4({{"ticket-dicts-1"}, {location4}, std::nullopt}); - FlightEndpoint endpoint5({{"ticket-floats-1"}, {location5}, std::nullopt}); + FlightEndpoint endpoint1({{"ticket-ints-1"}, {location1}, std::nullopt, {}}); + FlightEndpoint endpoint2({{"ticket-ints-2"}, {location2}, std::nullopt, {}}); + FlightEndpoint endpoint3({{"ticket-cmd"}, {location3}, std::nullopt, {}}); + FlightEndpoint endpoint4({{"ticket-dicts-1"}, {location4}, std::nullopt, {}}); + FlightEndpoint endpoint5({{"ticket-floats-1"}, {location5}, std::nullopt, {}}); FlightDescriptor descr1{FlightDescriptor::PATH, "", {"examples", "ints"}}; FlightDescriptor descr2{FlightDescriptor::CMD, "my_command", {}}; @@ -619,10 +621,10 @@ std::vector ExampleFlightInfo() { auto schema4 = ExampleFloatSchema(); return { - MakeFlightInfo(*schema1, descr1, {endpoint1, endpoint2}, 1000, 100000, false), - MakeFlightInfo(*schema2, descr2, {endpoint3}, 1000, 100000, false), - MakeFlightInfo(*schema3, descr3, {endpoint4}, -1, -1, false), - MakeFlightInfo(*schema4, descr4, {endpoint5}, 1000, 100000, false), + MakeFlightInfo(*schema1, descr1, {endpoint1, endpoint2}, 1000, 100000, false, ""), + MakeFlightInfo(*schema2, descr2, {endpoint3}, 1000, 100000, false, ""), + MakeFlightInfo(*schema3, descr3, {endpoint4}, -1, -1, false, ""), + MakeFlightInfo(*schema4, descr4, {endpoint5}, 1000, 100000, false, ""), }; } diff --git a/cpp/src/arrow/flight/test_util.h b/cpp/src/arrow/flight/test_util.h index f299d358c539e..c0b42d9b90c5a 100644 --- a/cpp/src/arrow/flight/test_util.h +++ b/cpp/src/arrow/flight/test_util.h @@ -192,7 +192,8 @@ std::vector ExampleActionTypes(); ARROW_FLIGHT_EXPORT FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descriptor, const std::vector& endpoints, - int64_t total_records, int64_t total_bytes, 
bool ordered); + int64_t total_records, int64_t total_bytes, bool ordered, + std::string app_metadata); // ---------------------------------------------------------------------- // A pair of authentication handlers that check for a predefined password diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index a24a9ef8758ab..9da83fa8a11f2 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -260,13 +260,14 @@ arrow::Result FlightInfo::Make(const Schema& schema, const FlightDescriptor& descriptor, const std::vector& endpoints, int64_t total_records, int64_t total_bytes, - bool ordered) { + bool ordered, std::string app_metadata) { FlightInfo::Data data; data.descriptor = descriptor; data.endpoints = endpoints; data.total_records = total_records; data.total_bytes = total_bytes; data.ordered = ordered; + data.app_metadata = std::move(app_metadata); RETURN_NOT_OK(internal::SchemaToString(schema, &data.schema)); return FlightInfo(data); } @@ -328,6 +329,7 @@ std::string FlightInfo::ToString() const { ss << "] total_records=" << data_.total_records; ss << " total_bytes=" << data_.total_bytes; ss << " ordered=" << (data_.ordered ? "true" : "false"); + ss << " app_metadata='" << HexEncode(data_.app_metadata) << "'"; ss << '>'; return ss.str(); } @@ -338,7 +340,8 @@ bool FlightInfo::Equals(const FlightInfo& other) const { data_.endpoints == other.data_.endpoints && data_.total_records == other.data_.total_records && data_.total_bytes == other.data_.total_bytes && - data_.ordered == other.data_.ordered; + data_.ordered == other.data_.ordered && + data_.app_metadata == other.data_.app_metadata; } arrow::Result PollInfo::SerializeToString() const { @@ -535,6 +538,7 @@ std::string FlightEndpoint::ToString() const { } else { ss << "null"; } + ss << " app_metadata='" << HexEncode(app_metadata) << "'"; ss << ">"; return ss.str(); } @@ -554,6 +558,9 @@ bool FlightEndpoint::Equals(const FlightEndpoint& other) const { return false; } } + if (app_metadata != other.app_metadata) { + return false; + } return true; } diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 70e7c90676b73..40a0787d14a7a 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -481,6 +481,9 @@ struct ARROW_FLIGHT_EXPORT FlightEndpoint { /// retrying DoGet requests. std::optional expiration_time; + /// Opaque Application-defined metadata + std::string app_metadata; + std::string ToString() const; bool Equals(const FlightEndpoint& other) const; @@ -583,6 +586,7 @@ class ARROW_FLIGHT_EXPORT FlightInfo { int64_t total_records = -1; int64_t total_bytes = -1; bool ordered = false; + std::string app_metadata; }; explicit FlightInfo(Data data) : data_(std::move(data)), reconstructed_schema_(false) {} @@ -592,7 +596,8 @@ class ARROW_FLIGHT_EXPORT FlightInfo { const FlightDescriptor& descriptor, const std::vector& endpoints, int64_t total_records, int64_t total_bytes, - bool ordered = false); + bool ordered = false, + std::string app_metadata = ""); /// \brief Deserialize the Arrow schema of the dataset. Populate any /// dictionary encoded fields into a DictionaryMemo for @@ -621,6 +626,9 @@ class ARROW_FLIGHT_EXPORT FlightInfo { /// Whether endpoints are in the same order as the data. bool ordered() const { return data_.ordered; } + /// Application-defined opaque metadata + const std::string& app_metadata() const { return data_.app_metadata; } + /// \brief Get the wire-format representation of this type. 
/// /// Useful when interoperating with non-Flight systems (e.g. REST diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 0def0e036e3c1..6e801e1f8adb7 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -205,7 +205,10 @@ class ArrayLoader { } } - Status LoadType(const DataType& type) { return VisitTypeInline(type, this); } + Status LoadType(const DataType& type) { + DCHECK_NE(out_, nullptr); + return VisitTypeInline(type, this); + } Status Load(const Field* field, ArrayData* out) { if (max_recursion_depth_ <= 0) { @@ -223,6 +226,9 @@ class ArrayLoader { skip_io_ = true; Status status = Load(field, &dummy); skip_io_ = false; + // GH-37851: Reset state. Load will set `out_` to `&dummy`, which would + // be a dangling pointer. + out_ = nullptr; return status; } @@ -258,6 +264,7 @@ class ArrayLoader { } Status LoadCommon(Type::type type_id) { + DCHECK_NE(out_, nullptr); // This only contains the length and null count, which we need to figure // out what to do with the buffers. For example, if null_count == 0, then // we can skip that buffer without reading from shared memory @@ -276,6 +283,7 @@ class ArrayLoader { template <typename T> Status LoadPrimitive(Type::type type_id) { + DCHECK_NE(out_, nullptr); out_->buffers.resize(2); RETURN_NOT_OK(LoadCommon(type_id)); @@ -290,6 +298,7 @@ class ArrayLoader { template <typename T> Status LoadBinary(Type::type type_id) { + DCHECK_NE(out_, nullptr); out_->buffers.resize(3); RETURN_NOT_OK(LoadCommon(type_id)); @@ -299,6 +308,7 @@ class ArrayLoader { template <typename TYPE> Status LoadList(const TYPE& type) { + DCHECK_NE(out_, nullptr); out_->buffers.resize(2); RETURN_NOT_OK(LoadCommon(type.id())); @@ -313,6 +323,7 @@ class ArrayLoader { } Status LoadChildren(const std::vector<std::shared_ptr<Field>>& child_fields) { + DCHECK_NE(out_, nullptr); ArrayData* parent = out_; parent->child_data.resize(child_fields.size()); @@ -2010,7 +2021,7 @@ class StreamDecoder::StreamDecoderImpl : public StreamDecoderInternal { }; StreamDecoder::StreamDecoder(std::shared_ptr<Listener> listener, IpcReadOptions options) { - impl_.reset(new StreamDecoderImpl(std::move(listener), options)); + impl_ = std::make_unique<StreamDecoderImpl>(std::move(listener), options); } StreamDecoder::~StreamDecoder() {} diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 3d294a3fa8642..47bf52660ffe9 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1847,14 +1847,18 @@ std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const { return result; } +Status Schema::CanReferenceFieldByName(const std::string& name) const { + if (GetFieldByName(name) == nullptr) { + return Status::Invalid("Field named '", name, + "' not found or not unique in the schema."); + } + return Status::OK(); +} + Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const { for (const auto& name : names) { - if (GetFieldByName(name) == nullptr) { - return Status::Invalid("Field named '", name, - "' not found or not unique in the schema."); - } + ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); } return Status::OK(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 718540d449226..19910979287cc 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2048,6 +2048,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, /// Return the indices of all fields having this name std::vector<int> GetAllFieldIndices(const std::string& name) const; + /// Indicate if field named `name` can be found unambiguously in the schema.
+ Status CanReferenceFieldByName(const std::string& name) const; + /// Indicate if fields named `names` can be found unambiguously in the schema. Status CanReferenceFieldsByNames(const std::vector& names) const; diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index c55b33b4151e4..3dbefdcf0c564 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -548,6 +548,24 @@ TEST_F(TestSchema, GetFieldDuplicates) { ASSERT_EQ(results.size(), 0); } +TEST_F(TestSchema, CanReferenceFieldByName) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + auto f3 = field("f1", list(int16())); + + auto schema = ::arrow::schema({f0, f1, f2, f3}); + + ASSERT_OK(schema->CanReferenceFieldByName("f0")); + ASSERT_OK(schema->CanReferenceFieldByName("f2")); + + // Not found + ASSERT_RAISES(Invalid, schema->CanReferenceFieldByName("nope")); + + // Duplicates + ASSERT_RAISES(Invalid, schema->CanReferenceFieldByName("f1")); +} + TEST_F(TestSchema, CanReferenceFieldsByNames) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index db260b5acc933..d2810c39f723c 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -31,8 +31,6 @@ if(ARROW_WITH_ZSTD AND "${zstd_SOURCE}" STREQUAL "SYSTEM") provide_find_module(zstdAlt "Gandiva") endif() -add_definitions(-DGANDIVA_LLVM_VERSION=${LLVM_VERSION_MAJOR}) - # Set the path where the bitcode file generated, see precompiled/CMakeLists.txt set(GANDIVA_PRECOMPILED_BC_PATH "${CMAKE_CURRENT_BINARY_DIR}/irhelpers.bc") set(GANDIVA_PRECOMPILED_CC_PATH "${CMAKE_CURRENT_BINARY_DIR}/precompiled_bitcode.cc") diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 7d75793a3e9e7..b6c78da89d575 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -53,18 +53,33 @@ #include #include #include +#if LLVM_VERSION_MAJOR >= 17 +#include +#else #include +#endif +#include #include #include +#include +#include #if LLVM_VERSION_MAJOR >= 14 +#include #include +#include +#include +#include +#include +#include +#include +#include #else #include +#include #endif #include #include #include -#include #include #include #include @@ -268,49 +283,104 @@ Status Engine::LoadPreCompiledIR() { // a pass for dead code elimination. 
Status Engine::RemoveUnusedFunctions() { // Setup an optimiser pipeline - std::unique_ptr pass_manager( - new llvm::legacy::PassManager()); + llvm::PassBuilder pass_builder; + llvm::ModuleAnalysisManager module_am; + + pass_builder.registerModuleAnalyses(module_am); + llvm::ModulePassManager module_pm; std::unordered_set used_functions; used_functions.insert(functions_to_compile_.begin(), functions_to_compile_.end()); - pass_manager->add( - llvm::createInternalizePass([&used_functions](const llvm::GlobalValue& func) { - return (used_functions.find(func.getName().str()) != used_functions.end()); + module_pm.addPass( + llvm::InternalizePass([&used_functions](const llvm::GlobalValue& variable) -> bool { + return used_functions.find(variable.getName().str()) != used_functions.end(); })); - pass_manager->add(llvm::createGlobalDCEPass()); - pass_manager->run(*module_); + module_pm.addPass(llvm::GlobalDCEPass()); + + module_pm.run(*module_, module_am); return Status::OK(); } +// several passes requiring LLVM 14+ that are not available in the legacy pass manager +#if LLVM_VERSION_MAJOR >= 14 +static void OptimizeModuleWithNewPassManager(llvm::Module& module, + llvm::TargetIRAnalysis target_analysis) { + // Setup an optimiser pipeline + llvm::PassBuilder pass_builder; + llvm::LoopAnalysisManager loop_am; + llvm::FunctionAnalysisManager function_am; + llvm::CGSCCAnalysisManager cgscc_am; + llvm::ModuleAnalysisManager module_am; + + function_am.registerPass([&] { return target_analysis; }); + + // Register required analysis managers + pass_builder.registerModuleAnalyses(module_am); + pass_builder.registerCGSCCAnalyses(cgscc_am); + pass_builder.registerFunctionAnalyses(function_am); + pass_builder.registerLoopAnalyses(loop_am); + pass_builder.crossRegisterProxies(loop_am, function_am, cgscc_am, module_am); + + pass_builder.registerPipelineStartEPCallback([&](llvm::ModulePassManager& module_pm, + llvm::OptimizationLevel Level) { + module_pm.addPass(llvm::ModuleInlinerPass()); + + llvm::FunctionPassManager function_pm; + function_pm.addPass(llvm::InstCombinePass()); + function_pm.addPass(llvm::PromotePass()); + function_pm.addPass(llvm::GVNPass()); + function_pm.addPass(llvm::NewGVNPass()); + function_pm.addPass(llvm::SimplifyCFGPass()); + function_pm.addPass(llvm::LoopVectorizePass()); + function_pm.addPass(llvm::SLPVectorizerPass()); + module_pm.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(function_pm))); + + module_pm.addPass(llvm::GlobalOptPass()); + }); + + pass_builder.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3) + .run(module, module_am); +} +#else +static void OptimizeModuleWithLegacyPassManager(llvm::Module& module, + llvm::TargetIRAnalysis target_analysis) { + std::unique_ptr pass_manager( + new llvm::legacy::PassManager()); + + pass_manager->add(llvm::createTargetTransformInfoWrapperPass(target_analysis)); + pass_manager->add(llvm::createFunctionInliningPass()); + pass_manager->add(llvm::createInstructionCombiningPass()); + pass_manager->add(llvm::createPromoteMemoryToRegisterPass()); + pass_manager->add(llvm::createGVNPass()); + pass_manager->add(llvm::createNewGVNPass()); + pass_manager->add(llvm::createCFGSimplificationPass()); + pass_manager->add(llvm::createLoopVectorizePass()); + pass_manager->add(llvm::createSLPVectorizerPass()); + pass_manager->add(llvm::createGlobalOptimizerPass()); + + // run the optimiser + llvm::PassManagerBuilder pass_builder; + pass_builder.OptLevel = 3; + pass_builder.populateModulePassManager(*pass_manager); + 
pass_manager->run(module); +} +#endif + // Optimise and compile the module. Status Engine::FinalizeModule() { if (!cached_) { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); if (optimize_) { - // misc passes to allow for inlining, vectorization, .. - std::unique_ptr pass_manager( - new llvm::legacy::PassManager()); - - llvm::TargetIRAnalysis target_analysis = - execution_engine_->getTargetMachine()->getTargetIRAnalysis(); - pass_manager->add(llvm::createTargetTransformInfoWrapperPass(target_analysis)); - pass_manager->add(llvm::createFunctionInliningPass()); - pass_manager->add(llvm::createInstructionCombiningPass()); - pass_manager->add(llvm::createPromoteMemoryToRegisterPass()); - pass_manager->add(llvm::createGVNPass()); - pass_manager->add(llvm::createNewGVNPass()); - pass_manager->add(llvm::createCFGSimplificationPass()); - pass_manager->add(llvm::createLoopVectorizePass()); - pass_manager->add(llvm::createSLPVectorizerPass()); - pass_manager->add(llvm::createGlobalOptimizerPass()); - - // run the optimiser - llvm::PassManagerBuilder pass_builder; - pass_builder.OptLevel = 3; - pass_builder.populateModulePassManager(*pass_manager); - pass_manager->run(*module_); + auto target_analysis = execution_engine_->getTargetMachine()->getTargetIRAnalysis(); + +// misc passes to allow for inlining, vectorization, .. +#if LLVM_VERSION_MAJOR >= 14 + OptimizeModuleWithNewPassManager(*module_, target_analysis); +#else + OptimizeModuleWithLegacyPassManager(*module_, target_analysis); +#endif } ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 6fe1ce9da60fe..fa013dd2ea583 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -363,10 +363,8 @@ void SerializedPageReader::UpdateDecryption(const std::shared_ptr& de int8_t module_type, std::string* page_aad) { ARROW_DCHECK(decryptor != nullptr); if (crypto_ctx_.start_decrypt_with_dictionary_page) { - std::string aad = encryption::CreateModuleAad( - decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal, - crypto_ctx_.column_ordinal, kNonPageOrdinal); - decryptor->UpdateAad(aad); + UpdateDecryptor(decryptor, crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, + module_type); } else { encryption::QuickUpdatePageAad(page_ordinal_, page_aad); decryptor->UpdateAad(*page_aad); @@ -449,7 +447,7 @@ std::shared_ptr SerializedPageReader::NextPage() { current_page_header_ = format::PageHeader(); deserializer.DeserializeMessage(reinterpret_cast(view.data()), &header_size, ¤t_page_header_, - crypto_ctx_.meta_decryptor); + crypto_ctx_.meta_decryptor.get()); break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index ae9216ba7c312..a0aedeee9e968 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -330,7 +330,7 @@ class SerializedPageWriter : public PageWriter { UpdateEncryption(encryption::kDictionaryPageHeader); } const int64_t header_size = - thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_.get()); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); @@ -422,7 +422,7 @@ class SerializedPageWriter : public PageWriter { UpdateEncryption(encryption::kDataPageHeader); } const int64_t header_size = - thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_.get()); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); /// Collect page index diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e3c8ab196f45e..931b9fd10729f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline; using arrow::internal::AddWithOverflow; using arrow::internal::checked_cast; using arrow::internal::MultiplyWithOverflow; +using arrow::internal::SafeSignedSubtract; using arrow::internal::SubtractWithOverflow; using arrow::util::SafeLoad; using arrow::util::SafeLoadAs; @@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder deltas_; + T first_value_{0}; + T current_value_{0}; + ArrowPoolVector deltas_; std::shared_ptr bits_buffer_; ::arrow::BufferBuilder sink_; ::arrow::bit_util::BitWriter bit_writer_; @@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder::Put(const T* src, int num_values) { total_value_count_ += num_values; while (idx < num_values) { - UT value = static_cast(src[idx]); + T value = src[idx]; // Calculate deltas. The possible overflow is handled by use of unsigned integers // making subtraction operations well-defined and correct even in case of overflow. // Encoded integers will wrap back around on decoding. // See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n - deltas_[values_current_block_] = value - current_value_; + deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_); current_value_ = value; idx++; values_current_block_++; @@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder::FlushBlock() { return; } - const UT min_delta = + // Calculate the frame of reference for this miniblock. This value will be subtracted + // from all deltas to guarantee all deltas are positive for encoding. + const T min_delta = *std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_); - bit_writer_.PutZigZagVlqInt(static_cast(min_delta)); + bit_writer_.PutZigZagVlqInt(min_delta); // Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write // bit widths of miniblocks as they become known during the encoding. @@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder::FlushBlock() { std::min(values_per_mini_block_, values_current_block_); const uint32_t start = i * values_per_mini_block_; - const UT max_delta = *std::max_element( + const T max_delta = *std::max_element( deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block); // The minimum number of bits required to write any of values in deltas_ vector. 
// See overflow comment above. - const auto bit_width = bit_width_data[i] = - bit_util::NumRequiredBits(max_delta - min_delta); + const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits( + static_cast<UT>(max_delta) - static_cast<UT>(min_delta)); for (uint32_t j = start; j < start + values_current_mini_block; j++) { - // See overflow comment above. - const UT value = deltas_[j] - min_delta; + // Convert delta to frame of reference. See overflow comment above. + const UT value = static_cast<UT>(deltas_[j]) - static_cast<UT>(min_delta); bit_writer_.PutValue(value, bit_width); } // If there are not enough values to fill the last mini block, we pad the mini block @@ -3300,7 +3303,11 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode void SetData(int num_values, const uint8_t* data, int len) override { num_values_ = num_values; - decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + if (decoder_) { + decoder_->Reset(data, len); + } else { + decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + } prefix_len_decoder_.SetDecoder(num_values, decoder_); // get the number of encoded prefix lengths @@ -3323,7 +3330,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode // TODO: read corrupted files written with bug(PARQUET-246). last_value_ should be set // to last_value_in_previous_page_ when decoding a new page(except the first page) - last_value_ = ""; + last_value_.clear(); int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, @@ -3342,6 +3349,43 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode } protected: + template <bool is_first_run> + static void BuildBufferInternal(const int32_t* prefix_len_ptr, int i, ByteArray* buffer, + std::string_view* prefix, uint8_t** data_ptr) { + if (ARROW_PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix->length())) { + throw ParquetException("prefix length too large in DELTA_BYTE_ARRAY"); + } + // For now, `buffer` points to string suffixes, and the suffix decoder + // ensures that the suffix data has sufficient lifetime. + if (prefix_len_ptr[i] == 0) { + // prefix is empty: buffer[i] already points to the suffix. + *prefix = std::string_view{buffer[i]}; + return; + } + DCHECK_EQ(is_first_run, i == 0); + if constexpr (!is_first_run) { + if (buffer[i].len == 0) { + // suffix is empty: buffer[i] can simply point to the prefix. + // This is not possible for the first run since the prefix + // would point to the mutable `last_value_`. + *prefix = prefix->substr(0, prefix_len_ptr[i]); + buffer[i] = ByteArray(*prefix); + return; + } + } + // Both prefix and suffix are non-empty, so we need to decode the string + // into `data_ptr`. + // 1. Copy the prefix + memcpy(*data_ptr, prefix->data(), prefix_len_ptr[i]); + // 2. Copy the suffix. + memcpy(*data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); + // 3. Point buffer[i] to the decoded string. + buffer[i].ptr = *data_ptr; + buffer[i].len += prefix_len_ptr[i]; + *data_ptr += buffer[i].len; + *prefix = std::string_view{buffer[i]}; + } + int GetInternal(ByteArray* buffer, int max_values) { // Decode up to `max_values` strings into an internal buffer // and reference them into `buffer`.
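
For context on the fast paths added in the next hunk: DELTA_BYTE_ARRAY stores each value as the length of the prefix it shares with the previous value plus a literal suffix, so materializing a value normally concatenates the two. A schematic reconstruction with invented sample data (editor's sketch, not the decoder's actual buffers):

    #include <cstdint>
    #include <string>
    #include <vector>

    std::vector<std::string> DecodeDeltaByteArray(const std::vector<int32_t>& prefix_len,
                                                  const std::vector<std::string>& suffix) {
      std::vector<std::string> out;
      std::string previous;
      for (size_t i = 0; i < prefix_len.size(); ++i) {
        // An empty suffix (for i > 0) is the case the new code short-circuits:
        // the value is a pure prefix of `previous`, so no byte copy is needed.
        std::string value = previous.substr(0, prefix_len[i]) + suffix[i];
        out.push_back(value);
        previous = std::move(value);
      }
      return out;
    }

    // DecodeDeltaByteArray({0, 5, 6}, {"apple", "sauce", ""})
    //   yields {"apple", "applesauce", "apples"}.
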
@@ -3362,9 +3406,19 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode reinterpret_cast<const int32_t*>(buffered_prefix_length_->data()) + prefix_len_offset_; for (int i = 0; i < max_values; ++i) { + if (prefix_len_ptr[i] == 0) { + // We don't need to copy the suffix if the prefix length is 0. + continue; + } if (ARROW_PREDICT_FALSE(prefix_len_ptr[i] < 0)) { throw ParquetException("negative prefix length in DELTA_BYTE_ARRAY"); } + if (buffer[i].len == 0 && i != 0) { + // We don't need to copy the prefix if the suffix length is 0 + // and this is not the first run (that is, the prefix doesn't point + // to the mutable `last_value_`). + continue; + } if (ARROW_PREDICT_FALSE(AddWithOverflow(data_size, prefix_len_ptr[i], &data_size) || AddWithOverflow(data_size, buffer[i].len, &data_size))) { throw ParquetException("excess expansion in DELTA_BYTE_ARRAY"); @@ -3374,18 +3428,15 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode string_view prefix{last_value_}; uint8_t* data_ptr = buffered_data_->mutable_data(); - for (int i = 0; i < max_values; ++i) { - if (ARROW_PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) { - throw ParquetException("prefix length too large in DELTA_BYTE_ARRAY"); - } - memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); - // buffer[i] currently points to the string suffix - memcpy(data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); - buffer[i].ptr = data_ptr; - buffer[i].len += prefix_len_ptr[i]; - data_ptr += buffer[i].len; - prefix = std::string_view{buffer[i]}; + if (max_values > 0) { + BuildBufferInternal<true>(prefix_len_ptr, 0, buffer, &prefix, + &data_ptr); + } + for (int i = 1; i < max_values; ++i) { + BuildBufferInternal<false>(prefix_len_ptr, i, buffer, &prefix, + &data_ptr); } + DCHECK_EQ(data_ptr - buffered_data_->mutable_data(), data_size); prefix_len_offset_ += max_values; this->num_values_ -= max_values; num_valid_values_ -= max_values; diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 6726810911fd5..717c716330563 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -737,6 +737,114 @@ static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) { BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); +struct DeltaByteArrayState { + int32_t min_size = 0; + int32_t max_size; + int32_t array_length; + int32_t total_data_size = 0; + double prefixed_probability; + std::vector<uint8_t> buf; + + explicit DeltaByteArrayState(const benchmark::State& state) + : max_size(static_cast<int32_t>(state.range(0))), + array_length(static_cast<int32_t>(state.range(1))), + prefixed_probability(state.range(2) / 100.0) {} + + std::vector<ByteArray> MakeRandomByteArray(uint32_t seed) { + std::default_random_engine gen(seed); + std::uniform_int_distribution<int32_t> dist_size(min_size, max_size); + std::uniform_int_distribution<int> dist_byte(0, 255); + std::bernoulli_distribution dist_has_prefix(prefixed_probability); + std::uniform_real_distribution<double> dist_prefix_length(0, 1); + + std::vector<ByteArray> out(array_length); + buf.resize(max_size * array_length); + auto buf_ptr = buf.data(); + total_data_size = 0; + + for (int32_t i = 0; i < array_length; ++i) { + int len = dist_size(gen); + out[i].len = len; + out[i].ptr = buf_ptr; + + bool do_prefix = i > 0 && dist_has_prefix(gen); + int prefix_len = 0; + if (do_prefix) { + int max_prefix_len = std::min(len, static_cast<int>(out[i -
1].len)); + prefix_len = + static_cast<int>(std::ceil(max_prefix_len * dist_prefix_length(gen))); + } + for (int j = 0; j < prefix_len; ++j) { + buf_ptr[j] = out[i - 1].ptr[j]; + } + for (int j = prefix_len; j < len; ++j) { + buf_ptr[j] = static_cast<uint8_t>(dist_byte(gen)); + } + buf_ptr += len; + total_data_size += len; + } + return out; + } +}; + +static void BM_DeltaEncodingByteArray(benchmark::State& state) { + DeltaByteArrayState delta_state(state); + std::vector<ByteArray> values = delta_state.MakeRandomByteArray(/*seed=*/42); + + auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY); + const int64_t plain_encoded_size = + delta_state.total_data_size + 4 * delta_state.array_length; + int64_t encoded_size = 0; + + for (auto _ : state) { + encoder->Put(values.data(), static_cast<int>(values.size())); + encoded_size = encoder->FlushValues()->size(); + } + state.SetItemsProcessed(state.iterations() * delta_state.array_length); + state.SetBytesProcessed(state.iterations() * delta_state.total_data_size); + state.counters["compression_ratio"] = + static_cast<double>(plain_encoded_size) / encoded_size; +} + +static void BM_DeltaDecodingByteArray(benchmark::State& state) { + DeltaByteArrayState delta_state(state); + std::vector<ByteArray> values = delta_state.MakeRandomByteArray(/*seed=*/42); + + auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY); + encoder->Put(values.data(), static_cast<int>(values.size())); + std::shared_ptr<Buffer> buf = encoder->FlushValues(); + + const int64_t plain_encoded_size = + delta_state.total_data_size + 4 * delta_state.array_length; + const int64_t encoded_size = buf->size(); + + auto decoder = MakeTypedDecoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY); + for (auto _ : state) { + decoder->SetData(delta_state.array_length, buf->data(), + static_cast<int>(buf->size())); + decoder->Decode(values.data(), static_cast<int>(values.size())); + ::benchmark::DoNotOptimize(values); + } + state.SetItemsProcessed(state.iterations() * delta_state.array_length); + state.SetBytesProcessed(state.iterations() * delta_state.total_data_size); + state.counters["compression_ratio"] = + static_cast<double>(plain_encoded_size) / encoded_size; +} + +static void ByteArrayDeltaCustomArguments(benchmark::internal::Benchmark* b) { + for (int max_string_length : {8, 64, 1024}) { + for (int batch_size : {512, 2048}) { + for (int prefixed_percent : {10, 90, 99}) { + b->Args({max_string_length, batch_size, prefixed_percent}); + } + } + } + b->ArgNames({"max-string-length", "batch-size", "prefixed-percent"}); +} + +BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayDeltaCustomArguments); +BENCHMARK(BM_DeltaDecodingByteArray)->Apply(ByteArrayDeltaCustomArguments); + static void BM_RleEncodingBoolean(benchmark::State& state) { std::vector<bool> values(state.range(0), true); auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::RLE); diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 71dc40d33ac47..9861c317c80d9 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) { } } +// Test that the DELTA_BINARY_PACKED encoding works properly in the presence of values +// that will cause integer overflow (see GH-37939). +TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) { + using T = typename TypeParam::c_type; + + // Values that should wrap when converted to deltas, and then when converted to the + // frame of reference.
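+ // For instance, with T = int32_t the first delta is
+ // max - min = 2147483647 - (-2147483648), which wraps to -1 in
+ // two's-complement arithmetic; decoding must wrap the same way to recover
+ // the original values.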
+ std::vector<T> int_values = {std::numeric_limits<T>::min(), + std::numeric_limits<T>::max(), + std::numeric_limits<T>::min(), + std::numeric_limits<T>::max(), + 0, + -1, + 0, + 1, + -1, + 1}; + const int num_values = static_cast<int>(int_values.size()); + + auto const encoder = MakeTypedEncoder<TypeParam>( + Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get()); + encoder->Put(int_values, num_values); + auto const encoded = encoder->FlushValues(); + + auto const decoder = + MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, this->descr_.get()); + + std::vector<T> decoded(num_values); + decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size())); + + const int values_decoded = decoder->Decode(decoded.data(), num_values); + + ASSERT_EQ(num_values, values_decoded); + ASSERT_NO_FATAL_FAILURE( + VerifyResults(decoded.data(), int_values.data(), num_values)); +} + +// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than +// necessary (see GH-37939). +TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) { + using T = typename TypeParam::c_type; + constexpr int num_values = 128; + // 128 values should be <= 1 block of values encoded with 2 bits + // delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes) + // mini-block header should be 0x01 0x02020202 (5 bytes) + constexpr int encoded_size = 2 * num_values / 8 + 6 + 5; + + // Create a run of {1, 0, -1, 0, 1, 0, ...}. + // min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits + // to encode. + std::vector<T> int_values(num_values); + std::iota(int_values.begin(), int_values.end(), 0); + std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) { + return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1; + }); + + auto const encoder = MakeTypedEncoder<TypeParam>( + Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get()); + encoder->Put(int_values, num_values); + auto const encoded = encoder->FlushValues(); + + ASSERT_EQ(encoded->size(), encoded_size); +} + // ---------------------------------------------------------------------- // Rle for Boolean encode/decode tests. diff --git a/cpp/src/parquet/encryption/encryption_internal.h b/cpp/src/parquet/encryption/encryption_internal.h index 4ed5b5cf61243..77921d8731d25 100644 --- a/cpp/src/parquet/encryption/encryption_internal.h +++ b/cpp/src/parquet/encryption/encryption_internal.h @@ -40,6 +40,8 @@ constexpr int8_t kDataPageHeader = 4; constexpr int8_t kDictionaryPageHeader = 5; constexpr int8_t kColumnIndex = 6; constexpr int8_t kOffsetIndex = 7; +constexpr int8_t kBloomFilterHeader = 8; +constexpr int8_t kBloomFilterBitset = 9; /// Performs AES encryption operations with GCM or CTR ciphers. class AesEncryptor { diff --git a/cpp/src/parquet/encryption/internal_file_decryptor.cc b/cpp/src/parquet/encryption/internal_file_decryptor.cc index 87bfc2bd12047..19e4845c8732d 100644 --- a/cpp/src/parquet/encryption/internal_file_decryptor.cc +++ b/cpp/src/parquet/encryption/internal_file_decryptor.cc @@ -16,8 +16,10 @@ // under the License.
#include "parquet/encryption/internal_file_decryptor.h" +#include "arrow/util/logging.h" #include "parquet/encryption/encryption.h" #include "parquet/encryption/encryption_internal.h" +#include "parquet/metadata.h" namespace parquet { @@ -215,4 +217,57 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( return column_data_map_[column_path]; } +namespace { + +std::shared_ptr GetColumnDecryptor( + const ColumnCryptoMetaData* crypto_metadata, InternalFileDecryptor* file_decryptor, + const std::function( + InternalFileDecryptor* file_decryptor, const std::string& column_path, + const std::string& column_key_metadata, const std::string& aad)>& func, + bool metadata) { + if (crypto_metadata == nullptr) { + return nullptr; + } + + if (file_decryptor == nullptr) { + throw ParquetException("RowGroup is noted as encrypted but no file decryptor"); + } + + if (crypto_metadata->encrypted_with_footer_key()) { + return metadata ? file_decryptor->GetFooterDecryptorForColumnMeta() + : file_decryptor->GetFooterDecryptorForColumnData(); + } + + // The column is encrypted with its own key + const std::string& column_key_metadata = crypto_metadata->key_metadata(); + const std::string column_path = crypto_metadata->path_in_schema()->ToDotString(); + return func(file_decryptor, column_path, column_key_metadata, /*aad=*/""); +} + +} // namespace + +std::shared_ptr GetColumnMetaDecryptor( + const ColumnCryptoMetaData* crypto_metadata, InternalFileDecryptor* file_decryptor) { + return GetColumnDecryptor(crypto_metadata, file_decryptor, + &InternalFileDecryptor::GetColumnMetaDecryptor, + /*metadata=*/true); +} + +std::shared_ptr GetColumnDataDecryptor( + const ColumnCryptoMetaData* crypto_metadata, InternalFileDecryptor* file_decryptor) { + return GetColumnDecryptor(crypto_metadata, file_decryptor, + &InternalFileDecryptor::GetColumnDataDecryptor, + /*metadata=*/false); +} + +void UpdateDecryptor(const std::shared_ptr& decryptor, + int16_t row_group_ordinal, int16_t column_ordinal, + int8_t module_type) { + ARROW_DCHECK(!decryptor->file_aad().empty()); + const std::string aad = + encryption::CreateModuleAad(decryptor->file_aad(), module_type, row_group_ordinal, + column_ordinal, kNonPageOrdinal); + decryptor->UpdateAad(aad); +} + } // namespace parquet diff --git a/cpp/src/parquet/encryption/internal_file_decryptor.h b/cpp/src/parquet/encryption/internal_file_decryptor.h index 2f9c3952aff2d..0b27effda8822 100644 --- a/cpp/src/parquet/encryption/internal_file_decryptor.h +++ b/cpp/src/parquet/encryption/internal_file_decryptor.h @@ -31,6 +31,7 @@ class AesDecryptor; class AesEncryptor; } // namespace encryption +class ColumnCryptoMetaData; class FileDecryptionProperties; class PARQUET_EXPORT Decryptor { @@ -110,4 +111,16 @@ class InternalFileDecryptor { bool metadata = false); }; +/// Utility to get column meta decryptor of an encrypted column. +std::shared_ptr GetColumnMetaDecryptor( + const ColumnCryptoMetaData* crypto_metadata, InternalFileDecryptor* file_decryptor); + +/// Utility to get column data decryptor of an encrypted column. 
+std::shared_ptr<Decryptor> GetColumnDataDecryptor( + const ColumnCryptoMetaData* crypto_metadata, InternalFileDecryptor* file_decryptor); + +void UpdateDecryptor(const std::shared_ptr<Decryptor>& decryptor, + int16_t row_group_ordinal, int16_t column_ordinal, + int8_t module_type); + } // namespace parquet diff --git a/cpp/src/parquet/encryption/read_configurations_test.cc b/cpp/src/parquet/encryption/read_configurations_test.cc index 10de7198ac5ff..695696db293fb 100644 --- a/cpp/src/parquet/encryption/read_configurations_test.cc +++ b/cpp/src/parquet/encryption/read_configurations_test.cc @@ -36,7 +36,7 @@ * The unit-test is called multiple times, each time to decrypt parquet files using * different decryption configuration as described below. * In each call two encrypted files are read: one temporary file that was generated using - * encryption-write-configurations-test.cc test and will be deleted upon + * write_configurations_test.cc test and will be deleted upon * reading it, while the second resides in * parquet-testing/data repository. Those two encrypted files were encrypted using the * same encryption configuration. @@ -59,8 +59,8 @@ * read the footer + all non-encrypted columns. * (pairs with encryption configuration 3) * - * The encrypted parquet files that is read was encrypted using one of the configurations - * below: + * The encrypted parquet files that are read were encrypted using one of the + * configurations below: * * - Encryption configuration 1: Encrypt all columns and the footer with the same key. * (uniform encryption) @@ -166,7 +166,11 @@ class TestDecryptionConfiguration vector_of_decryption_configurations_.push_back(NULL); } - void DecryptFile(std::string file, int decryption_config_num) { + void DecryptFileInternal( + const std::string& file, int decryption_config_num, + std::function<void(const std::string&, + const std::shared_ptr<FileDecryptionProperties>&)> + decrypt_func) { std::string exception_msg; std::shared_ptr<FileDecryptionProperties> file_decryption_properties; // if we get decryption_config_num = x then it means the actual number is x+1 @@ -176,18 +180,40 @@ class TestDecryptionConfiguration vector_of_decryption_configurations_[decryption_config_num]->DeepClone(); } - decryptor_.DecryptFile(file, file_decryption_properties); + decrypt_func(std::move(file), std::move(file_decryption_properties)); + } + + void DecryptFile(const std::string& file, int decryption_config_num) { + DecryptFileInternal( + file, decryption_config_num, + [&](const std::string& file, + const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties) { + decryptor_.DecryptFile(file, file_decryption_properties); + }); + } + + void DecryptPageIndex(const std::string& file, int decryption_config_num) { + DecryptFileInternal( + file, decryption_config_num, + [&](const std::string& file, + const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties) { + decryptor_.DecryptPageIndex(file, file_decryption_properties); + }); } // Check that the decryption result is as expected. - void CheckResults(const std::string file_name, unsigned decryption_config_num, - unsigned encryption_config_num) { + void CheckResults(const std::string& file_name, unsigned decryption_config_num, + unsigned encryption_config_num, bool file_has_page_index) { // Encryption_configuration number five contains aad_prefix and // disable_aad_prefix_storage. // An exception is expected to be thrown if the file is not decrypted with aad_prefix.
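// (Configuration 5 wrote the file with an explicit aad_prefix and
// disable_aad_prefix_storage, so the prefix is not stored in the file; a
// reader that does not supply it cannot reconstruct the module AADs.)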
if (encryption_config_num == 5) { if (decryption_config_num == 1 || decryption_config_num == 3) { EXPECT_THROW(DecryptFile(file_name, decryption_config_num - 1), ParquetException); + if (file_has_page_index) { + EXPECT_THROW(DecryptPageIndex(file_name, decryption_config_num - 1), + ParquetException); + } return; } } @@ -196,6 +222,10 @@ class TestDecryptionConfiguration if (decryption_config_num == 2) { if (encryption_config_num != 5 && encryption_config_num != 4) { EXPECT_THROW(DecryptFile(file_name, decryption_config_num - 1), ParquetException); + if (file_has_page_index) { + EXPECT_THROW(DecryptPageIndex(file_name, decryption_config_num - 1), + ParquetException); + } return; } } @@ -205,6 +235,9 @@ class TestDecryptionConfiguration return; } EXPECT_NO_THROW(DecryptFile(file_name, decryption_config_num - 1)); + if (file_has_page_index) { + EXPECT_NO_THROW(DecryptPageIndex(file_name, decryption_config_num - 1)); + } } // Returns true if file exists. Otherwise returns false. @@ -217,14 +250,13 @@ class TestDecryptionConfiguration // Read encrypted parquet file. // The test reads two parquet files that were encrypted using the same encryption // configuration: -// one was generated in encryption-write-configurations-test.cc tests and is deleted +// one was generated in write_configurations_test.cc tests and is deleted // once the file is read and the second exists in parquet-testing/data folder. // The name of the files are passed as parameters to the unit-test. TEST_P(TestDecryptionConfiguration, TestDecryption) { int encryption_config_num = std::get<0>(GetParam()); const char* param_file_name = std::get<1>(GetParam()); - // Decrypt parquet file that was generated in encryption-write-configurations-test.cc - // test. + // Decrypt parquet file that was generated in write_configurations_test.cc test. std::string tmp_file_name = "tmp_" + std::string(param_file_name); std::string file_name = temp_dir->path().ToString() + tmp_file_name; if (!fexists(file_name)) { @@ -237,7 +269,8 @@ TEST_P(TestDecryptionConfiguration, TestDecryption) { // parquet file. for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { unsigned decryption_config_num = index + 1; - CheckResults(file_name, decryption_config_num, encryption_config_num); + CheckResults(file_name, decryption_config_num, encryption_config_num, + /*file_has_page_index=*/true); } // Delete temporary test file. ASSERT_EQ(std::remove(file_name.c_str()), 0); @@ -255,7 +288,8 @@ TEST_P(TestDecryptionConfiguration, TestDecryption) { // parquet file. for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { unsigned decryption_config_num = index + 1; - CheckResults(file_name, decryption_config_num, encryption_config_num); + CheckResults(file_name, decryption_config_num, encryption_config_num, + /*file_has_page_index=*/false); } } diff --git a/cpp/src/parquet/encryption/test_encryption_util.cc b/cpp/src/parquet/encryption/test_encryption_util.cc index 694ed3cf42d9e..4fa215312f265 100644 --- a/cpp/src/parquet/encryption/test_encryption_util.cc +++ b/cpp/src/parquet/encryption/test_encryption_util.cc @@ -19,14 +19,17 @@ // Parquet column chunk within a row group. It could be extended in the future // to iterate through all data pages in all chunks in a file. 
+#include #include -#include - +#include "arrow/io/file.h" #include "arrow/testing/future_util.h" +#include "arrow/util/unreachable.h" + #include "parquet/encryption/test_encryption_util.h" #include "parquet/file_reader.h" #include "parquet/file_writer.h" +#include "parquet/page_index.h" #include "parquet/test_util.h" using ::arrow::io::FileOutputStream; @@ -206,6 +209,7 @@ void FileEncryptor::EncryptFile( WriterProperties::Builder prop_builder; prop_builder.compression(parquet::Compression::UNCOMPRESSED); prop_builder.encryption(encryption_configurations); + prop_builder.enable_write_page_index(); std::shared_ptr writer_properties = prop_builder.build(); PARQUET_ASSIGN_OR_THROW(auto out_file, FileOutputStream::Open(file)); @@ -340,8 +344,8 @@ void ReadAndVerifyColumn(RowGroupReader* rg_reader, RowGroupMetadata* rg_md, } void FileDecryptor::DecryptFile( - std::string file, - std::shared_ptr file_decryption_properties) { + const std::string& file, + const std::shared_ptr& file_decryption_properties) { std::string exception_msg; parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); if (file_decryption_properties) { @@ -353,7 +357,7 @@ void FileDecryptor::DecryptFile( source, ::arrow::io::ReadableFile::Open(file, reader_properties.memory_pool())); auto file_reader = parquet::ParquetFileReader::Open(source, reader_properties); - CheckFile(file_reader.get(), file_decryption_properties.get()); + CheckFile(file_reader.get(), file_decryption_properties); if (file_decryption_properties) { reader_properties.file_decryption_properties(file_decryption_properties->DeepClone()); @@ -361,14 +365,15 @@ void FileDecryptor::DecryptFile( auto fut = parquet::ParquetFileReader::OpenAsync(source, reader_properties); ASSERT_FINISHES_OK(fut); ASSERT_OK_AND_ASSIGN(file_reader, fut.MoveResult()); - CheckFile(file_reader.get(), file_decryption_properties.get()); + CheckFile(file_reader.get(), file_decryption_properties); file_reader->Close(); PARQUET_THROW_NOT_OK(source->Close()); } -void FileDecryptor::CheckFile(parquet::ParquetFileReader* file_reader, - FileDecryptionProperties* file_decryption_properties) { +void FileDecryptor::CheckFile( + parquet::ParquetFileReader* file_reader, + const std::shared_ptr& file_decryption_properties) { // Get the File MetaData std::shared_ptr file_metadata = file_reader->metadata(); @@ -509,4 +514,161 @@ void FileDecryptor::CheckFile(parquet::ParquetFileReader* file_reader, } } +void FileDecryptor::DecryptPageIndex( + const std::string& file, + const std::shared_ptr& file_decryption_properties) { + std::string exception_msg; + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + if (file_decryption_properties) { + reader_properties.file_decryption_properties(file_decryption_properties->DeepClone()); + } + + std::shared_ptr<::arrow::io::RandomAccessFile> source; + PARQUET_ASSIGN_OR_THROW( + source, ::arrow::io::ReadableFile::Open(file, reader_properties.memory_pool())); + + auto file_reader = parquet::ParquetFileReader::Open(source, reader_properties); + CheckPageIndex(file_reader.get(), file_decryption_properties); + + ASSERT_NO_FATAL_FAILURE(file_reader->Close()); + PARQUET_THROW_NOT_OK(source->Close()); +} + +template +void AssertColumnIndex(const std::shared_ptr& column_index, + const std::vector& expected_null_counts, + const std::vector& expected_min_values, + const std::vector& expected_max_values) { + auto typed_column_index = + std::dynamic_pointer_cast>(column_index); + ASSERT_NE(typed_column_index, nullptr); 
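+ // Example (hypothetical arguments): for a 32-bit column with one page,
+ //   AssertColumnIndex<Int32Type>(column_index, /*expected_null_counts=*/{0},
+ //                                /*expected_min_values=*/{0},
+ //                                /*expected_max_values=*/{49});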
+ ASSERT_EQ(typed_column_index->null_counts(), expected_null_counts); + if constexpr (std::is_same_v) { + ASSERT_EQ(typed_column_index->min_values().size(), expected_min_values.size()); + ASSERT_EQ(typed_column_index->max_values().size(), expected_max_values.size()); + for (size_t i = 0; i < expected_min_values.size(); ++i) { + ASSERT_EQ( + FixedLenByteArrayToString(typed_column_index->min_values()[i], kFixedLength), + FixedLenByteArrayToString(expected_min_values[i], kFixedLength)); + } + for (size_t i = 0; i < expected_max_values.size(); ++i) { + ASSERT_EQ( + FixedLenByteArrayToString(typed_column_index->max_values()[i], kFixedLength), + FixedLenByteArrayToString(expected_max_values[i], kFixedLength)); + } + } else { + ASSERT_EQ(typed_column_index->min_values(), expected_min_values); + ASSERT_EQ(typed_column_index->max_values(), expected_max_values); + } +} + +void FileDecryptor::CheckPageIndex( + parquet::ParquetFileReader* file_reader, + const std::shared_ptr& file_decryption_properties) { + std::shared_ptr page_index_reader = file_reader->GetPageIndexReader(); + ASSERT_NE(page_index_reader, nullptr); + + const std::shared_ptr file_metadata = file_reader->metadata(); + const int num_row_groups = file_metadata->num_row_groups(); + const int num_columns = file_metadata->num_columns(); + ASSERT_EQ(num_columns, 8); + + // We cannot read page index of encrypted columns in the plaintext mode + std::vector need_row_groups(num_row_groups); + std::iota(need_row_groups.begin(), need_row_groups.end(), 0); + std::vector need_columns; + if (file_decryption_properties == nullptr) { + need_columns = {0, 1, 2, 3, 6, 7}; + } else { + need_columns = {0, 1, 2, 3, 4, 5, 6, 7}; + } + + // Provide hint of requested columns to avoid accessing encrypted columns without + // decryption properties. + page_index_reader->WillNeed( + need_row_groups, need_columns, + PageIndexSelection{/*column_index=*/true, /*offset_index=*/true}); + + // Iterate over all the RowGroups in the file. + for (int r = 0; r < num_row_groups; ++r) { + auto row_group_page_index_reader = page_index_reader->RowGroup(r); + ASSERT_NE(row_group_page_index_reader, nullptr); + + for (int c = 0; c < num_columns; ++c) { + // Skip reading encrypted columns without decryption properties. + if (file_decryption_properties == nullptr && (c == 4 || c == 5)) { + continue; + } + + constexpr size_t kExpectedNumPages = 1; + + // Check offset index. + auto offset_index = row_group_page_index_reader->GetOffsetIndex(c); + ASSERT_NE(offset_index, nullptr); + ASSERT_EQ(offset_index->page_locations().size(), kExpectedNumPages); + const auto& first_page = offset_index->page_locations()[0]; + ASSERT_EQ(first_page.first_row_index, 0); + ASSERT_GT(first_page.compressed_page_size, 0); + + // Int96 column does not have column index. 
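+ // (The Parquet spec leaves the INT96 sort order undefined, so no min/max
+ // statistics and therefore no column index are written for it.)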
+ if (c == 3) { + continue; + } + + // Check column index + auto column_index = row_group_page_index_reader->GetColumnIndex(c); + ASSERT_NE(column_index, nullptr); + ASSERT_EQ(column_index->null_pages().size(), kExpectedNumPages); + ASSERT_EQ(column_index->null_pages()[0], false); + ASSERT_EQ(column_index->encoded_min_values().size(), kExpectedNumPages); + ASSERT_EQ(column_index->encoded_max_values().size(), kExpectedNumPages); + ASSERT_TRUE(column_index->has_null_counts()); + + switch (c) { + case 0: { + AssertColumnIndex(column_index, /*expected_null_counts=*/{0}, + /*expected_min_values=*/{false}, + /*expected_max_values=*/{true}); + } break; + case 1: { + AssertColumnIndex(column_index, /*expected_null_counts=*/{0}, + /*expected_min_values=*/{0}, + /*expected_max_values=*/{49}); + } break; + case 2: { + AssertColumnIndex(column_index, /*expected_null_counts=*/{0}, + /*expected_min_values=*/{0}, + /*expected_max_values=*/{99000000000000}); + } break; + case 4: { + AssertColumnIndex(column_index, /*expected_null_counts=*/{0}, + /*expected_min_values=*/{0.0F}, + /*expected_max_values=*/{53.9F}); + } break; + case 5: { + AssertColumnIndex(column_index, /*expected_null_counts=*/{0}, + /*expected_min_values=*/{0.0}, + /*expected_max_values=*/{54.4444439}); + } break; + case 6: { + AssertColumnIndex( + column_index, /*expected_null_counts=*/{25}, + /*expected_min_values=*/{ByteArray("parquet000")}, + /*expected_max_values=*/{ByteArray("parquet048")}); + } break; + case 7: { + const std::vector kExpectedMinValue(kFixedLength, 0); + const std::vector kExpectedMaxValue(kFixedLength, 49); + AssertColumnIndex( + column_index, /*expected_null_counts=*/{0}, + /*expected_min_values=*/{FLBA(kExpectedMinValue.data())}, + /*expected_max_values=*/{FLBA(kExpectedMaxValue.data())}); + } break; + default: + ::arrow::Unreachable("Unexpected column index " + std::to_string(c)); + } + } + } +} + } // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/test_encryption_util.h b/cpp/src/parquet/encryption/test_encryption_util.h index 19c230ee5ff99..86aa0ff07cf84 100644 --- a/cpp/src/parquet/encryption/test_encryption_util.h +++ b/cpp/src/parquet/encryption/test_encryption_util.h @@ -113,12 +113,20 @@ class FileEncryptor { class FileDecryptor { public: - void DecryptFile(std::string file_name, - std::shared_ptr file_decryption_properties); + void DecryptFile( + const std::string& file_name, + const std::shared_ptr& file_decryption_properties); + void DecryptPageIndex( + const std::string& file_name, + const std::shared_ptr& file_decryption_properties); private: - void CheckFile(parquet::ParquetFileReader* file_reader, - FileDecryptionProperties* file_decryption_properties); + void CheckFile( + parquet::ParquetFileReader* file_reader, + const std::shared_ptr& file_decryption_properties); + void CheckPageIndex( + parquet::ParquetFileReader* file_reader, + const std::shared_ptr& file_decryption_properties); }; } // namespace encryption::test diff --git a/cpp/src/parquet/encryption/type_fwd.h b/cpp/src/parquet/encryption/type_fwd.h new file mode 100644 index 0000000000000..623811718482c --- /dev/null +++ b/cpp/src/parquet/encryption/type_fwd.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace parquet { + +class Decryptor; +class Encryptor; + +class InternalFileDecryptor; +class InternalFileEncryptor; + +} // namespace parquet diff --git a/cpp/src/parquet/encryption/write_configurations_test.cc b/cpp/src/parquet/encryption/write_configurations_test.cc index e262003db3e6a..f27da82694874 100644 --- a/cpp/src/parquet/encryption/write_configurations_test.cc +++ b/cpp/src/parquet/encryption/write_configurations_test.cc @@ -33,7 +33,7 @@ * This file contains unit-tests for writing encrypted Parquet files with * different encryption configurations. * The files are saved in temporary folder and will be deleted after reading - * them in encryption-read-configurations-test.cc test. + * them in read_configurations_test.cc test. * * A detailed description of the Parquet Modular Encryption specification can be found * here: diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 08d493b0bca2f..5247b9d4b543d 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -227,37 +227,19 @@ class SerializedRowGroup : public RowGroupReader::Contents { always_compressed); } - if (file_decryptor_ == nullptr) { - throw ParquetException("RowGroup is noted as encrypted but no file decryptor"); - } + // The column is encrypted + std::shared_ptr meta_decryptor = + GetColumnMetaDecryptor(crypto_metadata.get(), file_decryptor_.get()); + std::shared_ptr data_decryptor = + GetColumnDataDecryptor(crypto_metadata.get(), file_decryptor_.get()); + ARROW_DCHECK_NE(meta_decryptor, nullptr); + ARROW_DCHECK_NE(data_decryptor, nullptr); constexpr auto kEncryptedRowGroupsLimit = 32767; if (i > kEncryptedRowGroupsLimit) { throw ParquetException("Encrypted files cannot contain more than 32767 row groups"); } - // The column is encrypted - std::shared_ptr meta_decryptor; - std::shared_ptr data_decryptor; - // The column is encrypted with footer key - if (crypto_metadata->encrypted_with_footer_key()) { - meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); - data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); - CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, - static_cast(i), meta_decryptor, data_decryptor); - return PageReader::Open(stream, col->num_values(), col->compression(), properties_, - always_compressed, &ctx); - } - - // The column is encrypted with its own key - std::string column_key_metadata = crypto_metadata->key_metadata(); - const std::string column_path = crypto_metadata->path_in_schema()->ToDotString(); - - meta_decryptor = - file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata); - data_decryptor = - file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); - CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, static_cast(i), meta_decryptor, data_decryptor); return PageReader::Open(stream, col->num_values(), col->compression(), properties_, @@ 
-330,7 +312,7 @@ class SerializedFile : public ParquetFileReader::Contents { } if (!page_index_reader_) { page_index_reader_ = PageIndexReader::Make(source_.get(), file_metadata_, - properties_, file_decryptor_); + properties_, file_decryptor_.get()); } return page_index_reader_; } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 2a6a88df2dd0a..9a92d4525d23d 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -471,10 +471,6 @@ class FileSerializer : public ParquetFileWriter::Contents { void WritePageIndex() { if (page_index_builder_ != nullptr) { - if (properties_->file_encryption_properties()) { - throw ParquetException("Encryption is not supported with page index"); - } - // Serialize page index after all row groups have been written and report // location to the file metadata. PageIndexLocation page_index_location; @@ -533,7 +529,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } if (properties_->page_index_enabled()) { - page_index_builder_ = PageIndexBuilder::Make(&schema_); + page_index_builder_ = PageIndexBuilder::Make(&schema_, file_encryptor_.get()); } } }; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8aedf5b926add..4ef2151fee59d 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -211,7 +211,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { ThriftDeserializer deserializer(properties_); deserializer.DeserializeMessage( reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &decrypted_metadata_, decryptor); + &len, &decrypted_metadata_, decryptor.get()); column_metadata_ = &decrypted_metadata_; } else { throw ParquetException( @@ -603,7 +603,8 @@ class FileMetaData::FileMetaDataImpl { ThriftDeserializer deserializer(properties_); deserializer.DeserializeMessage(reinterpret_cast(metadata), - metadata_len, metadata_.get(), footer_decryptor); + metadata_len, metadata_.get(), + footer_decryptor.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -705,7 +706,7 @@ class FileMetaData::FileMetaDataImpl { encryption::kGcmTagLength)); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer - serializer.Serialize(metadata_.get(), dst, encryptor); + serializer.Serialize(metadata_.get(), dst, encryptor.get()); } } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e62b2d187a20b..6609cff48bac2 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -25,6 +25,7 @@ #include #include +#include "parquet/encryption/type_fwd.h" #include "parquet/platform.h" #include "parquet/properties.h" #include "parquet/schema.h" @@ -34,15 +35,10 @@ namespace parquet { class ColumnDescriptor; class EncodedStatistics; +class FileCryptoMetaData; class Statistics; class SchemaDescriptor; -class FileCryptoMetaData; -class InternalFileDecryptor; -class Decryptor; -class Encryptor; -class FooterSigningEncryptor; - namespace schema { class ColumnPath; diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 9bae90e5540bd..ec99af17f05a1 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -17,6 +17,9 @@ #include "parquet/page_index.h" #include "parquet/encoding.h" +#include "parquet/encryption/encryption_internal.h" +#include "parquet/encryption/internal_file_decryptor.h" +#include "parquet/encryption/internal_file_encryptor.h" #include "parquet/exception.h" #include "parquet/metadata.h" #include 
"parquet/schema.h" @@ -192,13 +195,13 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { const ReaderProperties& properties, int32_t row_group_ordinal, const RowGroupIndexReadRange& index_read_range, - std::shared_ptr file_decryptor) + InternalFileDecryptor* file_decryptor) : input_(input), row_group_metadata_(std::move(row_group_metadata)), properties_(properties), row_group_ordinal_(row_group_ordinal), index_read_range_(index_read_range), - file_decryptor_(std::move(file_decryptor)) {} + file_decryptor_(file_decryptor) {} /// Read column index of a column chunk. std::shared_ptr GetColumnIndex(int32_t i) override { @@ -207,11 +210,6 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { } auto col_chunk = row_group_metadata_->ColumnChunk(i); - std::unique_ptr crypto_metadata = col_chunk->crypto_metadata(); - if (crypto_metadata != nullptr) { - ParquetException::NYI("Cannot read encrypted column index yet"); - } - auto column_index_location = col_chunk->GetColumnIndexLocation(); if (!column_index_location.has_value()) { return nullptr; @@ -232,8 +230,17 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { // uint32_t uint32_t length = static_cast(column_index_location->length); auto descr = row_group_metadata_->schema()->Column(i); + + // Get decryptor of column index if encrypted. + std::shared_ptr decryptor = parquet::GetColumnMetaDecryptor( + col_chunk->crypto_metadata().get(), file_decryptor_); + if (decryptor != nullptr) { + UpdateDecryptor(decryptor, row_group_ordinal_, /*column_ordinal=*/i, + encryption::kColumnIndex); + } + return ColumnIndex::Make(*descr, column_index_buffer_->data() + buffer_offset, length, - properties_); + properties_, decryptor.get()); } /// Read offset index of a column chunk. @@ -243,11 +250,6 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { } auto col_chunk = row_group_metadata_->ColumnChunk(i); - std::unique_ptr crypto_metadata = col_chunk->crypto_metadata(); - if (crypto_metadata != nullptr) { - ParquetException::NYI("Cannot read encrypted offset index yet"); - } - auto offset_index_location = col_chunk->GetOffsetIndexLocation(); if (!offset_index_location.has_value()) { return nullptr; @@ -267,8 +269,17 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { // OffsetIndex::Make() requires the type of serialized thrift message to be // uint32_t uint32_t length = static_cast(offset_index_location->length); + + // Get decryptor of offset index if encrypted. + std::shared_ptr decryptor = + GetColumnMetaDecryptor(col_chunk->crypto_metadata().get(), file_decryptor_); + if (decryptor != nullptr) { + UpdateDecryptor(decryptor, row_group_ordinal_, /*column_ordinal=*/i, + encryption::kOffsetIndex); + } + return OffsetIndex::Make(offset_index_buffer_->data() + buffer_offset, length, - properties_); + properties_, decryptor.get()); } private: @@ -325,7 +336,7 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { RowGroupIndexReadRange index_read_range_; /// File-level decryptor. - std::shared_ptr file_decryptor_; + InternalFileDecryptor* file_decryptor_; /// Buffer to hold the raw bytes of the page index. /// Will be set lazily when the corresponding page index is accessed for the 1st time. 
@@ -338,11 +349,11 @@ class PageIndexReaderImpl : public PageIndexReader { PageIndexReaderImpl(::arrow::io::RandomAccessFile* input, std::shared_ptr file_metadata, const ReaderProperties& properties, - std::shared_ptr file_decryptor) + InternalFileDecryptor* file_decryptor) : input_(input), file_metadata_(std::move(file_metadata)), properties_(properties), - file_decryptor_(std::move(file_decryptor)) {} + file_decryptor_(file_decryptor) {} std::shared_ptr RowGroup(int i) override { if (i < 0 || i >= file_metadata_->num_row_groups()) { @@ -418,7 +429,7 @@ class PageIndexReaderImpl : public PageIndexReader { const ReaderProperties& properties_; /// File-level decrypter. - std::shared_ptr file_decryptor_; + InternalFileDecryptor* file_decryptor_; /// Coalesced read ranges of page index of row groups that have been suggested by /// WillNeed(). Key is the row group ordinal. @@ -524,9 +535,9 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.__set_boundary_order(ToThrift(boundary_order)); } - void WriteTo(::arrow::io::OutputStream* sink) const override { + void WriteTo(::arrow::io::OutputStream* sink, Encryptor* encryptor) const override { if (state_ == BuilderState::kFinished) { - ThriftSerializer{}.Serialize(&column_index_, sink); + ThriftSerializer{}.Serialize(&column_index_, sink, encryptor); } } @@ -634,9 +645,9 @@ class OffsetIndexBuilderImpl final : public OffsetIndexBuilder { } } - void WriteTo(::arrow::io::OutputStream* sink) const override { + void WriteTo(::arrow::io::OutputStream* sink, Encryptor* encryptor) const override { if (state_ == BuilderState::kFinished) { - ThriftSerializer{}.Serialize(&offset_index_, sink); + ThriftSerializer{}.Serialize(&offset_index_, sink, encryptor); } } @@ -654,7 +665,9 @@ class OffsetIndexBuilderImpl final : public OffsetIndexBuilder { class PageIndexBuilderImpl final : public PageIndexBuilder { public: - explicit PageIndexBuilderImpl(const SchemaDescriptor* schema) : schema_(schema) {} + explicit PageIndexBuilderImpl(const SchemaDescriptor* schema, + InternalFileEncryptor* file_encryptor) + : schema_(schema), file_encryptor_(file_encryptor) {} void AppendRowGroup() override { if (finished_) { @@ -724,12 +737,31 @@ class PageIndexBuilderImpl final : public PageIndexBuilder { } } + std::shared_ptr GetColumnMetaEncryptor(int row_group_ordinal, + int column_ordinal, + int8_t module_type) const { + std::shared_ptr encryptor; + if (file_encryptor_ != nullptr) { + const auto column_path = schema_->Column(column_ordinal)->path()->ToDotString(); + encryptor = file_encryptor_->GetColumnMetaEncryptor(column_path); + if (encryptor != nullptr) { + encryptor->UpdateAad(encryption::CreateModuleAad( + encryptor->file_aad(), module_type, row_group_ordinal, column_ordinal, + kNonPageOrdinal)); + } + } + return encryptor; + } + template void SerializeIndex( const std::vector>>& page_index_builders, ::arrow::io::OutputStream* sink, std::map>>* location) const { const auto num_columns = static_cast(schema_->num_columns()); + constexpr int8_t module_type = std::is_same_v + ? encryption::kColumnIndex + : encryption::kOffsetIndex; /// Serialize the same kind of page index row group by row group. 
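/// Roughly, each serialized index is protected with a module AAD assembled by /// CreateModuleAad from the fields shown above: the file AAD, the module type /// (kColumnIndex or kOffsetIndex), the row group ordinal, the column ordinal, /// and kNonPageOrdinal (a sketch per the Parquet modular encryption spec).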
for (size_t row_group = 0; row_group < page_index_builders.size(); ++row_group) { @@ -743,9 +775,13 @@ class PageIndexBuilderImpl final : public PageIndexBuilder { for (size_t column = 0; column < num_columns; ++column) { const auto& column_page_index_builder = row_group_page_index_builders[column]; if (column_page_index_builder != nullptr) { + /// Get encryptor if encryption is enabled. + std::shared_ptr encryptor = GetColumnMetaEncryptor( + static_cast(row_group), static_cast(column), module_type); + /// Try serializing the page index. PARQUET_ASSIGN_OR_THROW(int64_t pos_before_write, sink->Tell()); - column_page_index_builder->WriteTo(sink); + column_page_index_builder->WriteTo(sink, encryptor.get()); PARQUET_ASSIGN_OR_THROW(int64_t pos_after_write, sink->Tell()); int64_t len = pos_after_write - pos_before_write; @@ -769,6 +805,7 @@ class PageIndexBuilderImpl final : public PageIndexBuilder { } const SchemaDescriptor* schema_; + InternalFileEncryptor* file_encryptor_; std::vector>> column_index_builders_; std::vector>> offset_index_builders_; bool finished_ = false; @@ -832,11 +869,12 @@ RowGroupIndexReadRange PageIndexReader::DeterminePageIndexRangesInRowGroup( std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, const void* serialized_index, uint32_t index_len, - const ReaderProperties& properties) { + const ReaderProperties& properties, + Decryptor* decryptor) { format::ColumnIndex column_index; ThriftDeserializer deserializer(properties); deserializer.DeserializeMessage(reinterpret_cast(serialized_index), - &index_len, &column_index); + &index_len, &column_index, decryptor); switch (descr.physical_type()) { case Type::BOOLEAN: return std::make_unique>(descr, @@ -871,20 +909,20 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, std::unique_ptr OffsetIndex::Make(const void* serialized_index, uint32_t index_len, - const ReaderProperties& properties) { + const ReaderProperties& properties, + Decryptor* decryptor) { format::OffsetIndex offset_index; ThriftDeserializer deserializer(properties); deserializer.DeserializeMessage(reinterpret_cast(serialized_index), - &index_len, &offset_index); + &index_len, &offset_index, decryptor); return std::make_unique(offset_index); } std::shared_ptr PageIndexReader::Make( ::arrow::io::RandomAccessFile* input, std::shared_ptr file_metadata, - const ReaderProperties& properties, - std::shared_ptr file_decryptor) { + const ReaderProperties& properties, InternalFileDecryptor* file_decryptor) { return std::make_shared(input, std::move(file_metadata), - properties, std::move(file_decryptor)); + properties, file_decryptor); } std::unique_ptr ColumnIndexBuilder::Make( @@ -917,8 +955,9 @@ std::unique_ptr OffsetIndexBuilder::Make() { return std::make_unique(); } -std::unique_ptr PageIndexBuilder::Make(const SchemaDescriptor* schema) { - return std::make_unique(schema); +std::unique_ptr PageIndexBuilder::Make( + const SchemaDescriptor* schema, InternalFileEncryptor* file_encryptor) { + return std::make_unique(schema, file_encryptor); } std::ostream& operator<<(std::ostream& out, const PageIndexSelection& selection) { diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index b6ea5fd6abc08..f2ed77cb97c3b 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -18,6 +18,7 @@ #pragma once #include "arrow/io/interfaces.h" +#include "parquet/encryption/type_fwd.h" #include "parquet/types.h" #include @@ -25,14 +26,8 @@ namespace parquet { -class ColumnDescriptor; class EncodedStatistics; -class 
FileMetaData; -class InternalFileDecryptor; struct PageIndexLocation; -class ReaderProperties; -class RowGroupMetaData; -class RowGroupPageIndexReader; /// \brief ColumnIndex is a proxy around format::ColumnIndex. class PARQUET_EXPORT ColumnIndex { @@ -41,7 +36,8 @@ class PARQUET_EXPORT ColumnIndex { static std::unique_ptr Make(const ColumnDescriptor& descr, const void* serialized_index, uint32_t index_len, - const ReaderProperties& properties); + const ReaderProperties& properties, + Decryptor* decryptor = NULLPTR); virtual ~ColumnIndex() = default; @@ -126,7 +122,8 @@ class PARQUET_EXPORT OffsetIndex { /// \brief Create a OffsetIndex from a serialized thrift message. static std::unique_ptr Make(const void* serialized_index, uint32_t index_len, - const ReaderProperties& properties); + const ReaderProperties& properties, + Decryptor* decryptor = NULLPTR); virtual ~OffsetIndex() = default; @@ -187,7 +184,7 @@ class PARQUET_EXPORT PageIndexReader { static std::shared_ptr Make( ::arrow::io::RandomAccessFile* input, std::shared_ptr file_metadata, const ReaderProperties& properties, - std::shared_ptr file_decryptor = NULLPTR); + InternalFileDecryptor* file_decryptor = NULLPTR); /// \brief Get the page index reader of a specific row group. /// \param[in] i row group ordinal to get page index reader. @@ -283,7 +280,9 @@ class PARQUET_EXPORT ColumnIndexBuilder { /// not write any data to the sink. /// /// \param[out] sink output stream to write the serialized message. - virtual void WriteTo(::arrow::io::OutputStream* sink) const = 0; + /// \param[in] encryptor encryptor to encrypt the serialized column index. + virtual void WriteTo(::arrow::io::OutputStream* sink, + Encryptor* encryptor = NULLPTR) const = 0; /// \brief Create a ColumnIndex directly. /// @@ -322,7 +321,9 @@ class PARQUET_EXPORT OffsetIndexBuilder { /// \brief Serialize the offset index thrift message. /// /// \param[out] sink output stream to write the serialized message. - virtual void WriteTo(::arrow::io::OutputStream* sink) const = 0; + /// \param[in] encryptor encryptor to encrypt the serialized offset index. + virtual void WriteTo(::arrow::io::OutputStream* sink, + Encryptor* encryptor = NULLPTR) const = 0; /// \brief Create an OffsetIndex directly. virtual std::unique_ptr Build() const = 0; @@ -332,7 +333,8 @@ class PARQUET_EXPORT OffsetIndexBuilder { class PARQUET_EXPORT PageIndexBuilder { public: /// \brief API convenience to create a PageIndexBuilder. - static std::unique_ptr Make(const SchemaDescriptor* schema); + static std::unique_ptr Make( + const SchemaDescriptor* schema, InternalFileEncryptor* file_encryptor = NULLPTR); virtual ~PageIndexBuilder() = default; diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 5824a82d5b86d..7491f118d32a0 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -403,7 +403,7 @@ class ThriftDeserializer { // set to the actual length of the header. 
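// For example (hypothetical usage, mirroring the page index reader): // format::ColumnIndex column_index; // ThriftDeserializer deserializer(properties); // deserializer.DeserializeMessage(buf, &len, &column_index, decryptor); // where decryptor may be NULLPTR for plaintext messages.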
template void DeserializeMessage(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& decryptor = NULLPTR) { + Decryptor* decryptor = NULLPTR) { if (decryptor == NULLPTR) { // thrift message is not encrypted DeserializeUnencryptedMessage(buf, len, deserialized_msg); @@ -495,7 +495,7 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryptor = NULLPTR) { + Encryptor* encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -523,8 +523,7 @@ class ThriftSerializer { } int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, - uint32_t out_length, - const std::shared_ptr& encryptor) { + uint32_t out_length, Encryptor* encryptor) { auto cipher_buffer = std::static_pointer_cast(AllocateBuffer( encryptor->pool(), static_cast(encryptor->CiphertextSizeDelta() + out_length))); diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h index 3e66f32fc0322..da0d0f7bdee96 100644 --- a/cpp/src/parquet/type_fwd.h +++ b/cpp/src/parquet/type_fwd.h @@ -69,6 +69,9 @@ struct ParquetVersion { }; class FileMetaData; +class RowGroupMetaData; + +class ColumnDescriptor; class SchemaDescriptor; class ReaderProperties; diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs b/csharp/src/Apache.Arrow/Arrays/Array.cs index a453b0807267f..0838134b19c6d 100644 --- a/csharp/src/Apache.Arrow/Arrays/Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Array.cs @@ -62,16 +62,7 @@ internal static void Accept(T array, IArrowArrayVisitor visitor) public Array Slice(int offset, int length) { - if (offset > Length) - { - throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for Array.Slice"); - } - - length = Math.Min(Data.Length - offset, length); - offset += Data.Offset; - - ArrayData newData = Data.Slice(offset, length); - return ArrowArrayFactory.BuildArray(newData) as Array; + return ArrowArrayFactory.Slice(this, offset, length) as Array; } public void Dispose() @@ -88,4 +79,4 @@ protected virtual void Dispose(bool disposing) } } } -} \ No newline at end of file +} diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 8859ecd7f05b9..806defdc7ce66 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -49,7 +49,8 @@ private class ArrayDataConcatenationVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, - IArrowTypeVisitor + IArrowTypeVisitor, + IArrowTypeVisitor { public ArrayData Result { get; private set; } private readonly IReadOnlyList _arrayDataList; @@ -123,6 +124,33 @@ public void Visit(StructType type) Result = new ArrayData(type, _arrayDataList[0].Length, _arrayDataList[0].NullCount, 0, _arrayDataList[0].Buffers, children); } + public void Visit(UnionType type) + { + int bufferCount = type.Mode switch + { + UnionMode.Sparse => 1, + UnionMode.Dense => 2, + _ => throw new InvalidOperationException("TODO"), + }; + + CheckData(type, bufferCount); + List children = new List(type.Fields.Count); + + for (int i = 0; i < type.Fields.Count; i++) + { + children.Add(Concatenate(SelectChildren(i), _allocator)); + } + + ArrowBuffer[] buffers = new ArrowBuffer[bufferCount]; + buffers[0] = ConcatenateUnionTypeBuffer(); + if (bufferCount > 1) + { + buffers[1] = ConcatenateUnionOffsetBuffer(); + } + + Result = new ArrayData(type, _totalLength, _totalNullCount, 
0, buffers, children); + } + public void Visit(IArrowType type) { throw new NotImplementedException($"Concatenation for {type.Name} is not supported yet."); @@ -231,6 +259,38 @@ private ArrowBuffer ConcatenateOffsetBuffer() return builder.Build(_allocator); } + private ArrowBuffer ConcatenateUnionTypeBuffer() + { + var builder = new ArrowBuffer.Builder(_totalLength); + + foreach (ArrayData arrayData in _arrayDataList) + { + builder.Append(arrayData.Buffers[0]); + } + + return builder.Build(_allocator); + } + + private ArrowBuffer ConcatenateUnionOffsetBuffer() + { + var builder = new ArrowBuffer.Builder(_totalLength); + int baseOffset = 0; + + foreach (ArrayData arrayData in _arrayDataList) + { + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo(); + foreach (int offset in span) + { + builder.Append(baseOffset + offset); + } + + // The next offset must start from the current last offset. + baseOffset += span[arrayData.Length]; + } + + return builder.Build(_allocator); + } + private List SelectChildren(int index) { var children = new List(_arrayDataList.Count); diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataTypeComparer.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataTypeComparer.cs index 8a6bfed29abb6..6b54ec1edb573 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataTypeComparer.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataTypeComparer.cs @@ -27,7 +27,8 @@ internal sealed class ArrayDataTypeComparer : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, - IArrowTypeVisitor + IArrowTypeVisitor, + IArrowTypeVisitor { private readonly IArrowType _expectedType; private bool _dataTypeMatch; @@ -122,6 +123,15 @@ public void Visit(StructType actualType) } } + public void Visit(UnionType actualType) + { + if (_expectedType is UnionType expectedType + && CompareNested(expectedType, actualType)) + { + _dataTypeMatch = true; + } + } + private static bool CompareNested(NestedType expectedType, NestedType actualType) { if (expectedType.Fields.Count != actualType.Fields.Count) diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs index f82037bff47b1..aa407203d1858 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs @@ -62,7 +62,7 @@ public static IArrowArray BuildArray(ArrayData data) case ArrowTypeId.Struct: return new StructArray(data); case ArrowTypeId.Union: - return new UnionArray(data); + return UnionArray.Create(data); case ArrowTypeId.Date64: return new Date64Array(data); case ArrowTypeId.Date32: @@ -91,5 +91,19 @@ public static IArrowArray BuildArray(ArrayData data) throw new NotSupportedException($"An ArrowArray cannot be built for type {data.DataType.TypeId}."); } } + + public static IArrowArray Slice(IArrowArray array, int offset, int length) + { + if (offset > array.Length) + { + throw new ArgumentException($"Offset {offset} cannot be greater than Length {array.Length} for Array.Slice"); + } + + length = Math.Min(array.Data.Length - offset, length); + offset += array.Data.Offset; + + ArrayData newData = array.Data.Slice(offset, length); + return BuildArray(newData); + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs new file mode 100644 index 0000000000000..1aacbe11f08b9 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Types; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Apache.Arrow +{ + public class DenseUnionArray : UnionArray + { + public ArrowBuffer ValueOffsetBuffer => Data.Buffers[1]; + + public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo(); + + public DenseUnionArray( + IArrowType dataType, + int length, + IEnumerable children, + ArrowBuffer typeIds, + ArrowBuffer valuesOffsetBuffer, + int nullCount = 0, + int offset = 0) + : base(new ArrayData( + dataType, length, nullCount, offset, new[] { typeIds, valuesOffsetBuffer }, + children.Select(child => child.Data))) + { + _fields = children.ToArray(); + ValidateMode(UnionMode.Dense, Type.Mode); + } + + public DenseUnionArray(ArrayData data) + : base(data) + { + ValidateMode(UnionMode.Dense, Type.Mode); + data.EnsureBufferCount(2); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs index a50d4b52c3257..67fe46633c18f 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs @@ -137,6 +137,9 @@ public TBuilder Append(T value) return Instance; } + public TBuilder Append(T? value) => + (value == null) ? AppendNull() : Append(value.Value); + public TBuilder Append(ReadOnlySpan span) { int len = ValueBuffer.Length; diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs new file mode 100644 index 0000000000000..b79c44c979e47 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using Apache.Arrow.Types; +using System.Collections.Generic; +using System.Linq; + +namespace Apache.Arrow +{ + public class SparseUnionArray : UnionArray + { + public SparseUnionArray( + IArrowType dataType, + int length, + IEnumerable children, + ArrowBuffer typeIds, + int nullCount = 0, + int offset = 0) + : base(new ArrayData( + dataType, length, nullCount, offset, new[] { typeIds }, + children.Select(child => child.Data))) + { + _fields = children.ToArray(); + ValidateMode(UnionMode.Sparse, Type.Mode); + } + + public SparseUnionArray(ArrayData data) + : base(data) + { + ValidateMode(UnionMode.Sparse, Type.Mode); + data.EnsureBufferCount(1); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs index 0269768f490bb..0dc5726d01734 100644 --- a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs @@ -76,7 +76,7 @@ protected override long ConvertTo(DateTimeOffset value) switch (DataType.Unit) { case TimeUnit.Nanosecond: - return ticks * 100; + return checked(ticks * 100); case TimeUnit.Microsecond: return ticks / 10; case TimeUnit.Millisecond: diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index 8bccea2b59e31..0a7ae288fd0c5 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -15,37 +15,88 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; +using System.Threading; namespace Apache.Arrow { - public class UnionArray: Array + public abstract class UnionArray : IArrowArray { - public UnionType Type => Data.DataType as UnionType; + protected IReadOnlyList _fields; - public UnionMode Mode => Type.Mode; + public IReadOnlyList Fields => + LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields()); + + public ArrayData Data { get; } - public ArrowBuffer TypeBuffer => Data.Buffers[1]; + public UnionType Type => (UnionType)Data.DataType; - public ArrowBuffer ValueOffsetBuffer => Data.Buffers[2]; + public UnionMode Mode => Type.Mode; + + public ArrowBuffer TypeBuffer => Data.Buffers[0]; public ReadOnlySpan TypeIds => TypeBuffer.Span; - public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo().Slice(0, Length + 1); + public int Length => Data.Length; + + public int Offset => Data.Offset; - public UnionArray(ArrayData data) - : base(data) + public int NullCount => Data.NullCount; + + public bool IsValid(int index) => NullCount == 0 || Fields[TypeIds[index]].IsValid(index); + + public bool IsNull(int index) => !IsValid(index); + + protected UnionArray(ArrayData data) { + Data = data; data.EnsureDataType(ArrowTypeId.Union); - data.EnsureBufferCount(3); } - public IArrowArray GetChild(int index) + public static UnionArray Create(ArrayData data) { - // TODO: Implement - throw new NotImplementedException(); + return ((UnionType)data.DataType).Mode switch + { + UnionMode.Dense => new DenseUnionArray(data), + UnionMode.Sparse => new SparseUnionArray(data), + _ => throw new InvalidOperationException("unknown union mode in array creation") + }; } - public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + public void Accept(IArrowArrayVisitor visitor) => Array.Accept(this, visitor); + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + Data.Dispose(); + } + } + + protected static void 
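+// A sparse union keeps each child at the full logical length of the array;
+// the type-ids buffer alone selects the child that supplies each slot.
+// Hypothetical construction (names assumed, mirroring the constructor below):
+//   var unionArray = new SparseUnionArray(unionType, length, children, typeIds);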
ValidateMode(UnionMode expected, UnionMode actual) + { + if (expected != actual) + { + throw new ArgumentException( + $"Specified union mode <{actual}> does not match expected mode <{expected}>", + "Mode"); + } + } + + private IReadOnlyList InitializeFields() + { + IArrowArray[] result = new IArrowArray[Data.Children.Length]; + for (int i = 0; i < Data.Children.Length; i++) + { + result[i] = ArrowArrayFactory.BuildArray(Data.Children[i]); + } + return result; + } } } diff --git a/csharp/src/Apache.Arrow/ArrowBuffer.cs b/csharp/src/Apache.Arrow/ArrowBuffer.cs index dbd97fc3aec9e..ef98bdc853b88 100644 --- a/csharp/src/Apache.Arrow/ArrowBuffer.cs +++ b/csharp/src/Apache.Arrow/ArrowBuffer.cs @@ -75,8 +75,9 @@ public void Dispose() internal bool TryExport(ExportedAllocationOwner newOwner, out IntPtr ptr) { - if (_memoryOwner == null && IsEmpty) + if (IsEmpty) { + // _memoryOwner could be anything (for example null or a NullMemoryOwner), but it doesn't matter here ptr = IntPtr.Zero; return true; } diff --git a/csharp/src/Apache.Arrow/C/CArrowArray.cs b/csharp/src/Apache.Arrow/C/CArrowArray.cs index fc609f10fdfa5..882ca8caa503d 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArray.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArray.cs @@ -16,6 +16,7 @@ // under the License. using System; +using System.Diagnostics; using System.Runtime.InteropServices; namespace Apache.Arrow.C @@ -67,6 +68,15 @@ public unsafe struct CArrowArray /// Do not call this on a pointer that was allocated elsewhere. /// public static void Free(CArrowArray* array) + { + CallReleaseFunc(array); + Marshal.FreeHGlobal((IntPtr)array); + } + + /// + /// Call the array's release func, if set. + /// + public static void CallReleaseFunc(CArrowArray* array) { if (array->release != default) { @@ -76,8 +86,9 @@ public static void Free(CArrowArray* array) #else Marshal.GetDelegateForFunctionPointer(array->release)(array); #endif + Debug.Assert(array->release == default, + "Calling the CArrowArray release func should have set it to NULL"); } - Marshal.FreeHGlobal((IntPtr)array); } } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 16aaa3874b370..2d9febea33f54 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -15,6 +15,7 @@ using System; +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using Apache.Arrow.Memory; @@ -59,8 +60,6 @@ public static unsafe void ExportArray(IArrowArray array, CArrowArray* cArray) try { ConvertArray(allocationOwner, array.Data, cArray); - cArray->release = ReleaseArrayPtr; - cArray->private_data = FromDisposable(allocationOwner); allocationOwner = null; } finally @@ -102,8 +101,6 @@ public static unsafe void ExportRecordBatch(RecordBatch batch, CArrowArray* cArr try { ConvertRecordBatch(allocationOwner, batch, cArray); - cArray->release = ReleaseArrayPtr; - cArray->private_data = FromDisposable(allocationOwner); allocationOwner = null; } finally @@ -118,7 +115,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr cArray->offset = array.Offset; cArray->null_count = array.NullCount; cArray->release = ReleaseArrayPtr; - cArray->private_data = null; + cArray->private_data = MakePrivateData(sharedOwner); cArray->n_buffers = array.Buffers?.Length ?? 
0; cArray->buffers = null; @@ -131,7 +128,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr IntPtr ptr; if (!buffer.TryExport(sharedOwner, out ptr)) { - throw new NotSupportedException($"An ArrowArray of type {array.DataType.TypeId} could not be exported"); + throw new NotSupportedException($"An ArrowArray of type {array.DataType.TypeId} could not be exported: failed on buffer #{i}"); } cArray->buffers[i] = (byte*)ptr; } @@ -144,7 +141,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr cArray->children = (CArrowArray**)sharedOwner.Allocate(IntPtr.Size * array.Children.Length); for (int i = 0; i < array.Children.Length; i++) { - cArray->children[i] = CArrowArray.Create(); + cArray->children[i] = MakeArray(sharedOwner); ConvertArray(sharedOwner, array.Children[i], cArray->children[i]); } } @@ -152,7 +149,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr cArray->dictionary = null; if (array.Dictionary != null) { - cArray->dictionary = CArrowArray.Create(); + cArray->dictionary = MakeArray(sharedOwner); ConvertArray(sharedOwner, array.Dictionary, cArray->dictionary); } } @@ -163,20 +160,24 @@ private unsafe static void ConvertRecordBatch(ExportedAllocationOwner sharedOwne cArray->offset = 0; cArray->null_count = 0; cArray->release = ReleaseArrayPtr; - cArray->private_data = null; + cArray->private_data = MakePrivateData(sharedOwner); cArray->n_buffers = 1; cArray->buffers = (byte**)sharedOwner.Allocate(IntPtr.Size); cArray->n_children = batch.ColumnCount; cArray->children = null; + // XXX sharing the same ExportedAllocationOwner for all columns + // and child arrays makes memory tracking inflexible. + // If the consumer keeps only a single record batch column, + // the entire record batch memory is nevertheless kept alive. 
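// A hypothetical consumer-side timeline (sketch only; the move step is
// the struct-copy convention from the C data interface spec) that makes
// the trade-off above concrete:
//
//     CArrowArray* c = CArrowArray.Create();
//     CArrowArrayExporter.ExportRecordBatch(batch, c);
//     // consumer moves column 0 out (copies the struct, marks the
//     // source as released) and hands it to another library ...
//     CArrowArray.Free(c);  // parent + remaining children released
//     // ... yet the moved column still holds a reference to the one
//     // shared ExportedAllocationOwner, so the memory of *every*
//     // column stays allocated until that column is released too.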
if (cArray->n_children > 0) { cArray->children = (CArrowArray**)sharedOwner.Allocate(IntPtr.Size * batch.ColumnCount); int i = 0; foreach (IArrowArray child in batch.Arrays) { - cArray->children[i] = CArrowArray.Create(); + cArray->children[i] = MakeArray(sharedOwner); ConvertArray(sharedOwner, child.Data, cArray->children[i]); i++; } @@ -190,26 +191,44 @@ private unsafe static void ConvertRecordBatch(ExportedAllocationOwner sharedOwne #endif private unsafe static void ReleaseArray(CArrowArray* cArray) { - Dispose(&cArray->private_data); + for (long i = 0; i < cArray->n_children; i++) + { + CArrowArray.CallReleaseFunc(cArray->children[i]); + } + if (cArray->dictionary != null) + { + CArrowArray.CallReleaseFunc(cArray->dictionary); + } + DisposePrivateData(&cArray->private_data); cArray->release = default; } - private unsafe static void* FromDisposable(IDisposable disposable) + private unsafe static CArrowArray* MakeArray(ExportedAllocationOwner sharedOwner) + { + var array = (CArrowArray*)sharedOwner.Allocate(sizeof(CArrowArray)); + *array = default; + return array; + } + + private unsafe static void* MakePrivateData(ExportedAllocationOwner sharedOwner) { - GCHandle gch = GCHandle.Alloc(disposable); + GCHandle gch = GCHandle.Alloc(sharedOwner); + sharedOwner.IncRef(); return (void*)GCHandle.ToIntPtr(gch); } - private unsafe static void Dispose(void** ptr) + private unsafe static void DisposePrivateData(void** ptr) { - GCHandle gch = GCHandle.FromIntPtr((IntPtr)(*ptr)); + GCHandle gch = GCHandle.FromIntPtr((IntPtr) (*ptr)); if (!gch.IsAllocated) { return; } - ((IDisposable)gch.Target).Dispose(); + // We can't call IDisposable.Dispose() here as we create multiple + // GCHandles to the same object. Instead, refcounting ensures + // timely memory deallocation when all GCHandles are freed. 
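// The refcount pairing, in miniature: MakePrivateData() performs one
// IncRef() per exported struct (parent, children, dictionaries), and
// every release callback funnels into this method for exactly one
// DecRef(). For a batch with two columns -- release() below stands for
// the consumer invoking the C release callback:
//
//     ExportRecordBatch(batch, c);  // refcount = 3
//     release(c->children[1]);      // refcount = 2
//     release(c);                   // releases the remaining child,
//                                   // then c itself: refcount = 0,
//                                   // owner Dispose()s the memory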
+ ((ExportedAllocationOwner) gch.Target).DecRef(); gch.Free(); - *ptr = null; } } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index 9b7bcb7abe5a5..da1b0f31b8f08 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -170,6 +170,15 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) buffers = new ArrowBuffer[] { ImportValidityBuffer(cArray) }; break; case ArrowTypeId.Union: + UnionType unionType = (UnionType)type; + children = ProcessStructChildren(cArray, unionType.Fields); + buffers = unionType.Mode switch + { + UnionMode.Dense => ImportDenseUnionBuffers(cArray), + UnionMode.Sparse => ImportSparseUnionBuffers(cArray), + _ => throw new InvalidOperationException("unknown union mode in import") + }; + break; case ArrowTypeId.Map: break; case ArrowTypeId.Null: @@ -286,6 +295,35 @@ private ArrowBuffer[] ImportFixedSizeListBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 2) + { + throw new InvalidOperationException("Dense union arrays are expected to have exactly two buffers"); + } + int length = checked((int)cArray->length); + int offsetsLength = length * 4; + + ArrowBuffer[] buffers = new ArrowBuffer[2]; + buffers[0] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, length)); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + + return buffers; + } + + private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 1) + { + throw new InvalidOperationException("Sparse union arrays are expected to have exactly one buffer"); + } + + ArrowBuffer[] buffers = new ArrowBuffer[1]; + buffers[0] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, checked((int)cArray->length))); + + return buffers; + } + private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int bitWidth) { if (cArray->n_buffers != 2) diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index 66142da331ac8..c1a12362a942a 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -124,6 +124,23 @@ public static unsafe void ExportSchema(Schema schema, CArrowSchema* out_schema) _ => throw new InvalidDataException($"Unsupported time unit for export: {unit}"), }; + private static string FormatUnion(UnionType unionType) + { + StringBuilder builder = new StringBuilder(); + builder.Append(unionType.Mode switch + { + UnionMode.Sparse => "+us:", + UnionMode.Dense => "+ud:", + _ => throw new InvalidDataException($"Unsupported union mode for export: {unionType.Mode}"), + }); + for (int i = 0; i < unionType.TypeIds.Length; i++) + { + if (i > 0) { builder.Append(','); } + builder.Append(unionType.TypeIds[i]); + } + return builder.ToString(); + } + private static string GetFormat(IArrowType datatype) { switch (datatype) @@ -170,6 +187,7 @@ private static string GetFormat(IArrowType datatype) case FixedSizeListType fixedListType: return $"+w:{fixedListType.ListSize}"; case StructType _: return "+s"; + case UnionType u: return FormatUnion(u); // Dictionary case DictionaryType dictionaryType: return GetFormat(dictionaryType.IndexType); diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index
2a750d5e8250d..f7216df869abd 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -37,7 +37,7 @@ public static class CArrowSchemaImporter /// Typically, you will allocate an uninitialized CArrowSchema pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowSchema* importedPtr = CArrowSchema.Create(); /// foreign_export_function(importedPtr); @@ -62,7 +62,7 @@ public static unsafe ArrowType ImportType(CArrowSchema* ptr) /// Typically, you will allocate an uninitialized CArrowSchema pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowSchema* importedPtr = CArrowSchema.Create(); /// foreign_export_function(importedPtr); @@ -87,7 +87,7 @@ public static unsafe Field ImportField(CArrowSchema* ptr) /// Typically, you will allocate an uninitialized CArrowSchema pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowSchema* importedPtr = CArrowSchema.Create(); /// foreign_export_function(importedPtr); @@ -184,21 +184,7 @@ public ArrowType GetAsType() } else if (format == "+s") { - var child_schemas = new ImportedArrowSchema[_cSchema->n_children]; - - for (int i = 0; i < _cSchema->n_children; i++) - { - if (_cSchema->GetChild(i) == null) - { - throw new InvalidDataException("Expected struct type child to be non-null."); - } - child_schemas[i] = new ImportedArrowSchema(_cSchema->GetChild(i), isRoot: false); - } - - - List childFields = child_schemas.Select(schema => schema.GetAsField()).ToList(); - - return new StructType(childFields); + return new StructType(ParseChildren("struct")); } else if (format.StartsWith("+w:")) { @@ -255,6 +241,10 @@ public ArrowType GetAsType() }; string timezone = format.Substring(format.IndexOf(':') + 1); + if (timezone.Length == 0) + { + timezone = null; + } return new TimestampType(timeUnit, timezone); } @@ -265,6 +255,30 @@ public ArrowType GetAsType() return new FixedSizeBinaryType(width); } + // Unions + if (format.StartsWith("+ud:") || format.StartsWith("+us:")) + { + UnionMode unionMode = format[2] == 'd' ? 
UnionMode.Dense : UnionMode.Sparse; + List typeIds = new List(); + int pos = 4; + do + { + int next = format.IndexOf(',', pos); + if (next < 0) { next = format.Length; } + + int code; + if (!int.TryParse(format.Substring(pos, next - pos), out code)) + { + throw new InvalidDataException($"Invalid type code for union import: {format.Substring(pos, next - pos)}"); + } + typeIds.Add(code); + + pos = next + 1; + } while (pos < format.Length); + + return new UnionType(ParseChildren("union"), typeIds, unionMode); + } + return format switch { // Primitives @@ -324,6 +338,22 @@ public Schema GetAsSchema() } } + private List ParseChildren(string typeName) + { + var child_schemas = new ImportedArrowSchema[_cSchema->n_children]; + + for (int i = 0; i < _cSchema->n_children; i++) + { + if (_cSchema->GetChild(i) == null) + { + throw new InvalidDataException($"Expected {typeName} type child to be non-null."); + } + child_schemas[i] = new ImportedArrowSchema(_cSchema->GetChild(i), isRoot: false); + } + + return child_schemas.Select(schema => schema.GetAsField()).ToList(); + } + private unsafe static IReadOnlyDictionary GetMetadata(byte* metadata) { if (metadata == null) diff --git a/csharp/src/Apache.Arrow/ChunkedArray.cs b/csharp/src/Apache.Arrow/ChunkedArray.cs index 5f25acfe04a2f..f5909f5adfe48 100644 --- a/csharp/src/Apache.Arrow/ChunkedArray.cs +++ b/csharp/src/Apache.Arrow/ChunkedArray.cs @@ -15,7 +15,6 @@ using System; using System.Collections.Generic; -using Apache.Arrow; using Apache.Arrow.Types; namespace Apache.Arrow @@ -25,7 +24,7 @@ namespace Apache.Arrow /// public class ChunkedArray { - private IList Arrays { get; } + private IList Arrays { get; } public IArrowType DataType { get; } public long Length { get; } public long NullCount { get; } @@ -35,9 +34,16 @@ public int ArrayCount get => Arrays.Count; } - public Array Array(int index) => Arrays[index]; + public Array Array(int index) => Arrays[index] as Array; + + public IArrowArray ArrowArray(int index) => Arrays[index]; public ChunkedArray(IList arrays) + : this(Cast(arrays)) + { + } + + public ChunkedArray(IList arrays) { Arrays = arrays ?? throw new ArgumentNullException(nameof(arrays)); if (arrays.Count < 1) @@ -45,14 +51,14 @@ public ChunkedArray(IList arrays) throw new ArgumentException($"Count must be at least 1. Got {arrays.Count} instead"); } DataType = arrays[0].Data.DataType; - foreach (Array array in arrays) + foreach (IArrowArray array in arrays) { Length += array.Length; NullCount += array.NullCount; } } - public ChunkedArray(Array array) : this(new[] { array }) { } + public ChunkedArray(Array array) : this(new IArrowArray[] { array }) { } public ChunkedArray Slice(long offset, long length) { @@ -69,10 +75,10 @@ public ChunkedArray Slice(long offset, long length) curArrayIndex++; } - IList newArrays = new List(); + IList newArrays = new List(); while (curArrayIndex < numArrays && length > 0) { - newArrays.Add(Arrays[curArrayIndex].Slice((int)offset, + newArrays.Add(ArrowArrayFactory.Slice(Arrays[curArrayIndex], (int)offset, length > Arrays[curArrayIndex].Length ? 
Arrays[curArrayIndex].Length : (int)length)); length -= Arrays[curArrayIndex].Length - offset; offset = 0; @@ -86,6 +92,16 @@ public ChunkedArray Slice(long offset) return Slice(offset, Length - offset); } + private static IArrowArray[] Cast(IList arrays) + { + IArrowArray[] arrowArrays = new IArrowArray[arrays.Count]; + for (int i = 0; i < arrays.Count; i++) + { + arrowArrays[i] = arrays[i]; + } + return arrowArrays; + } + // TODO: Flatten for Structs } } diff --git a/csharp/src/Apache.Arrow/Column.cs b/csharp/src/Apache.Arrow/Column.cs index 4eaf9a559e75d..0709b9142cafd 100644 --- a/csharp/src/Apache.Arrow/Column.cs +++ b/csharp/src/Apache.Arrow/Column.cs @@ -28,19 +28,23 @@ public class Column public ChunkedArray Data { get; } public Column(Field field, IList arrays) + : this(field, new ChunkedArray(arrays), doValidation: true) + { + } + + public Column(Field field, IList arrays) + : this(field, new ChunkedArray(arrays), doValidation: true) { - Data = new ChunkedArray(arrays); - Field = field; - if (!ValidateArrayDataTypes()) - { - throw new ArgumentException($"{Field.DataType} must match {Data.DataType}"); - } } - private Column(Field field, ChunkedArray arrays) + private Column(Field field, ChunkedArray data, bool doValidation = false) { + Data = data; Field = field; - Data = arrays; + if (doValidation && !ValidateArrayDataTypes()) + { + throw new ArgumentException($"{Field.DataType} must match {Data.DataType}"); + } } public long Length => Data.Length; @@ -64,12 +68,12 @@ private bool ValidateArrayDataTypes() for (int i = 0; i < Data.ArrayCount; i++) { - if (Data.Array(i).Data.DataType.TypeId != Field.DataType.TypeId) + if (Data.ArrowArray(i).Data.DataType.TypeId != Field.DataType.TypeId) { return false; } - Data.Array(i).Data.DataType.Accept(dataTypeComparer); + Data.ArrowArray(i).Data.DataType.Accept(dataTypeComparer); if (!dataTypeComparer.DataTypeMatch) { diff --git a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs index d2a70bca9e4ec..35c5b3e55157d 100644 --- a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs @@ -80,6 +80,16 @@ public static Types.TimeUnit ToArrow(this Flatbuf.TimeUnit unit) throw new ArgumentException($"Unexpected Flatbuf TimeUnit", nameof(unit)); } } + + public static Types.UnionMode ToArrow(this Flatbuf.UnionMode mode) + { + return mode switch + { + Flatbuf.UnionMode.Dense => Types.UnionMode.Dense, + Flatbuf.UnionMode.Sparse => Types.UnionMode.Sparse, + _ => throw new ArgumentException($"Unsupported Flatbuf UnionMode", nameof(mode)), + }; + } } } diff --git a/csharp/src/Apache.Arrow/Interfaces/IArrowArray.cs b/csharp/src/Apache.Arrow/Interfaces/IArrowArray.cs index 50fbc3af6dd72..9bcee36ef4eaf 100644 --- a/csharp/src/Apache.Arrow/Interfaces/IArrowArray.cs +++ b/csharp/src/Apache.Arrow/Interfaces/IArrowArray.cs @@ -32,9 +32,5 @@ public interface IArrowArray : IDisposable ArrayData Data { get; } void Accept(IArrowArrayVisitor visitor); - - //IArrowArray Slice(int offset); - - //IArrowArray Slice(int offset, int length); } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index c9c1b21673316..d3115da52cc6c 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -116,11 +116,11 @@ protected RecordBatch CreateArrowObjectFromMessage( break; case 
Flatbuf.MessageHeader.DictionaryBatch: Flatbuf.DictionaryBatch dictionaryBatch = message.Header().Value; - ReadDictionaryBatch(dictionaryBatch, bodyByteBuffer, memoryOwner); + ReadDictionaryBatch(message.Version, dictionaryBatch, bodyByteBuffer, memoryOwner); break; case Flatbuf.MessageHeader.RecordBatch: Flatbuf.RecordBatch rb = message.Header().Value; - List arrays = BuildArrays(Schema, bodyByteBuffer, rb); + List arrays = BuildArrays(message.Version, Schema, bodyByteBuffer, rb); return new RecordBatch(Schema, memoryOwner, arrays, (int)rb.Length); default: // NOTE: Skip unsupported message type @@ -136,7 +136,11 @@ internal static ByteBuffer CreateByteBuffer(ReadOnlyMemory buffer) return new ByteBuffer(new ReadOnlyMemoryBufferAllocator(buffer), 0); } - private void ReadDictionaryBatch(Flatbuf.DictionaryBatch dictionaryBatch, ByteBuffer bodyByteBuffer, IMemoryOwner memoryOwner) + private void ReadDictionaryBatch( + MetadataVersion version, + Flatbuf.DictionaryBatch dictionaryBatch, + ByteBuffer bodyByteBuffer, + IMemoryOwner memoryOwner) { long id = dictionaryBatch.Id; IArrowType valueType = DictionaryMemo.GetDictionaryType(id); @@ -149,7 +153,7 @@ private void ReadDictionaryBatch(Flatbuf.DictionaryBatch dictionaryBatch, ByteBu Field valueField = new Field("dummy", valueType, true); var schema = new Schema(new[] { valueField }, default); - IList arrays = BuildArrays(schema, bodyByteBuffer, recordBatch.Value); + IList arrays = BuildArrays(version, schema, bodyByteBuffer, recordBatch.Value); if (arrays.Count != 1) { @@ -167,6 +171,7 @@ private void ReadDictionaryBatch(Flatbuf.DictionaryBatch dictionaryBatch, ByteBu } private List BuildArrays( + MetadataVersion version, Schema schema, ByteBuffer messageBuffer, Flatbuf.RecordBatch recordBatchMessage) @@ -187,8 +192,8 @@ private List BuildArrays( Flatbuf.FieldNode fieldNode = recordBatchEnumerator.CurrentNode; ArrayData arrayData = field.DataType.IsFixedPrimitive() - ? LoadPrimitiveField(ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator) - : LoadVariableField(ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); + ? LoadPrimitiveField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator) + : LoadVariableField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); arrays.Add(ArrowArrayFactory.BuildArray(arrayData)); } while (recordBatchEnumerator.MoveNextNode()); @@ -225,6 +230,7 @@ private IBufferCreator GetBufferCreator(BodyCompression? 
compression) } private ArrayData LoadPrimitiveField( + MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, Field field, in Flatbuf.FieldNode fieldNode, @@ -245,31 +251,44 @@ private ArrayData LoadPrimitiveField( throw new InvalidDataException("Null count length must be >= 0"); // TODO:Localize exception message } - if (field.DataType.TypeId == ArrowTypeId.Null) + int buffers; + switch (field.DataType.TypeId) { - return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, System.Array.Empty()); - } - - ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); + case ArrowTypeId.Null: + return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, System.Array.Empty()); + case ArrowTypeId.Union: + if (version < MetadataVersion.V5) + { + if (fieldNullCount > 0) + { + if (recordBatchEnumerator.CurrentBuffer.Length > 0) + { + // With older metadata we can get a validity bitmap. Fixing up union data is hard, + // so we will just quit. + throw new NotSupportedException("Cannot read pre-1.0.0 Union array with top-level validity bitmap"); + } + } + recordBatchEnumerator.MoveNextBuffer(); + } + buffers = ((UnionType)field.DataType).Mode == Types.UnionMode.Dense ? 2 : 1; + break; + case ArrowTypeId.Struct: + case ArrowTypeId.FixedSizeList: + buffers = 1; + break; + default: + buffers = 2; + break; } - ArrowBuffer[] arrowBuff; - if (field.DataType.TypeId == ArrowTypeId.Struct || field.DataType.TypeId == ArrowTypeId.FixedSizeList) + ArrowBuffer[] arrowBuff = new ArrowBuffer[buffers]; + for (int i = 0; i < buffers; i++) { - arrowBuff = new[] { nullArrowBuffer }; - } - else - { - ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); + arrowBuff[i] = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); recordBatchEnumerator.MoveNextBuffer(); - - arrowBuff = new[] { nullArrowBuffer, valueArrowBuffer }; } - ArrayData[] children = GetChildren(ref recordBatchEnumerator, field, bodyData, bufferCreator); + ArrayData[] children = GetChildren(version, ref recordBatchEnumerator, field, bodyData, bufferCreator); IArrowArray dictionary = null; if (field.DataType.TypeId == ArrowTypeId.Dictionary) @@ -282,6 +301,7 @@ private ArrayData LoadPrimitiveField( } private ArrayData LoadVariableField( + MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, Field field, in Flatbuf.FieldNode fieldNode, @@ -316,7 +336,7 @@ private ArrayData LoadVariableField( } ArrowBuffer[] arrowBuff = new[] { nullArrowBuffer, offsetArrowBuffer, valueArrowBuffer }; - ArrayData[] children = GetChildren(ref recordBatchEnumerator, field, bodyData, bufferCreator); + ArrayData[] children = GetChildren(version, ref recordBatchEnumerator, field, bodyData, bufferCreator); IArrowArray dictionary = null; if (field.DataType.TypeId == ArrowTypeId.Dictionary) @@ -329,6 +349,7 @@ private ArrayData LoadVariableField( } private ArrayData[] GetChildren( + MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, Field field, ByteBuffer bodyData, @@ -345,8 +366,8 @@ private ArrayData[] GetChildren( Field childField = type.Fields[index]; ArrayData child = childField.DataType.IsFixedPrimitive() - ? 
LoadPrimitiveField(ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator) - : LoadVariableField(ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); + ? LoadPrimitiveField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator) + : LoadVariableField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); children[index] = child; } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index a5d8db3f509d7..2b3815af71142 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -55,6 +55,7 @@ internal class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -156,6 +157,22 @@ public void Visit(StructArray array) } } + public void Visit(UnionArray array) + { + _buffers.Add(CreateBuffer(array.TypeBuffer)); + + ArrowBuffer? offsets = (array as DenseUnionArray)?.ValueOffsetBuffer; + if (offsets != null) + { + _buffers.Add(CreateBuffer(offsets.Value)); + } + + for (int i = 0; i < array.Fields.Count; i++) + { + array.Fields[i].Accept(this); + } + } + public void Visit(DictionaryArray array) { // Dictionary is serialized separately in Dictionary serialization. @@ -218,7 +235,7 @@ public void Visit(IArrowArray array) private readonly bool _leaveOpen; private readonly IpcOptions _options; - private protected const Flatbuf.MetadataVersion CurrentMetadataVersion = Flatbuf.MetadataVersion.V4; + private protected const Flatbuf.MetadataVersion CurrentMetadataVersion = Flatbuf.MetadataVersion.V5; private static readonly byte[] s_padding = new byte[64]; diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs index 203aa72d93ea3..b11467538dd04 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs @@ -120,7 +120,9 @@ public void Visit(FixedSizeListType type) public void Visit(UnionType type) { - throw new NotImplementedException(); + Result = FieldType.Build( + Flatbuf.Type.Union, + Flatbuf.Union.CreateUnion(Builder, ToFlatBuffer(type.Mode), Flatbuf.Union.CreateTypeIdsVector(Builder, type.TypeIds))); } public void Visit(StringType type) @@ -279,5 +281,15 @@ private static Flatbuf.TimeUnit ToFlatBuffer(TimeUnit unit) return result; } + + private static Flatbuf.UnionMode ToFlatBuffer(Types.UnionMode mode) + { + return mode switch + { + Types.UnionMode.Dense => Flatbuf.UnionMode.Dense, + Types.UnionMode.Sparse => Flatbuf.UnionMode.Sparse, + _ => throw new ArgumentException($"unsupported union mode <{mode}>", nameof(mode)), + }; + } } } diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs index 8ca69b61165bf..6249063ba81f4 100644 --- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -203,6 +203,10 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c case Flatbuf.Type.Struct_: Debug.Assert(childFields != null); return new Types.StructType(childFields); + case Flatbuf.Type.Union: + Debug.Assert(childFields != null); + Flatbuf.Union unionMetadata = field.Type().Value; + return new Types.UnionType(childFields, unionMetadata.GetTypeIdsArray(), 
unionMetadata.Mode.ToArrow()); default: throw new InvalidDataException($"Arrow primitive '{field.TypeType}' is unsupported."); } diff --git a/csharp/src/Apache.Arrow/Memory/ExportedAllocationOwner.cs b/csharp/src/Apache.Arrow/Memory/ExportedAllocationOwner.cs index e872dc5425e06..05529899e410c 100644 --- a/csharp/src/Apache.Arrow/Memory/ExportedAllocationOwner.cs +++ b/csharp/src/Apache.Arrow/Memory/ExportedAllocationOwner.cs @@ -14,8 +14,10 @@ // limitations under the License. using System; +using System.Diagnostics; using System.Collections.Generic; using System.Runtime.InteropServices; +using System.Threading; namespace Apache.Arrow.Memory { @@ -23,6 +25,8 @@ internal sealed class ExportedAllocationOwner : INativeAllocationOwner, IDisposa { private readonly List _pointers = new List(); private int _allocationSize; + private long _referenceCount; + private bool _disposed; ~ExportedAllocationOwner() { @@ -47,8 +51,25 @@ public void Release(IntPtr ptr, int offset, int length) throw new InvalidOperationException(); } + public void IncRef() + { + Interlocked.Increment(ref _referenceCount); + } + + public void DecRef() + { + if (Interlocked.Decrement(ref _referenceCount) == 0) + { + Dispose(); + } + } + public void Dispose() { + if (_disposed) + { + return; + } for (int i = 0; i < _pointers.Count; i++) { if (_pointers[i] != IntPtr.Zero) @@ -59,6 +80,7 @@ public void Dispose() } GC.RemoveMemoryPressure(_allocationSize); GC.SuppressFinalize(this); + _disposed = true; } } } diff --git a/csharp/src/Apache.Arrow/Table.cs b/csharp/src/Apache.Arrow/Table.cs index 0b9f31557bec8..939ec23f54ff2 100644 --- a/csharp/src/Apache.Arrow/Table.cs +++ b/csharp/src/Apache.Arrow/Table.cs @@ -37,10 +37,10 @@ public static Table TableFromRecordBatches(Schema schema, IList rec List columns = new List(nColumns); for (int icol = 0; icol < nColumns; icol++) { - List columnArrays = new List(nBatches); + List columnArrays = new List(nBatches); for (int jj = 0; jj < nBatches; jj++) { - columnArrays.Add(recordBatches[jj].Column(icol) as Array); + columnArrays.Add(recordBatches[jj].Column(icol)); } columns.Add(new Column(schema.GetFieldByIndex(icol), columnArrays)); } diff --git a/csharp/src/Apache.Arrow/Types/UnionType.cs b/csharp/src/Apache.Arrow/Types/UnionType.cs index 293271018aa26..23fa3b45ab278 100644 --- a/csharp/src/Apache.Arrow/Types/UnionType.cs +++ b/csharp/src/Apache.Arrow/Types/UnionType.cs @@ -24,20 +24,21 @@ public enum UnionMode Dense } - public sealed class UnionType : ArrowType + public sealed class UnionType : NestedType { public override ArrowTypeId TypeId => ArrowTypeId.Union; public override string Name => "union"; public UnionMode Mode { get; } - - public IEnumerable TypeCodes { get; } + + public int[] TypeIds { get; } public UnionType( - IEnumerable fields, IEnumerable typeCodes, + IEnumerable fields, IEnumerable typeIds, UnionMode mode = UnionMode.Sparse) + : base(fields.ToArray()) { - TypeCodes = typeCodes.ToList(); + TypeIds = typeIds.ToArray(); Mode = mode; } diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj index a6c635a79a45f..cb7f7ae896ee2 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj +++ b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj @@ -3,6 +3,7 @@ Exe + true net7.0 @@ -13,4 +14,4 @@ - \ No newline at end of file + diff --git a/csharp/test/Apache.Arrow.IntegrationTest/CDataInterface.cs 
b/csharp/test/Apache.Arrow.IntegrationTest/CDataInterface.cs new file mode 100644 index 0000000000000..2fabae1a2a3b3 --- /dev/null +++ b/csharp/test/Apache.Arrow.IntegrationTest/CDataInterface.cs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Data; +using System.Diagnostics; +using System.IO; +using Apache.Arrow.C; +using Apache.Arrow.Arrays; +using Apache.Arrow.Types; + +namespace Apache.Arrow.IntegrationTest +{ + /// + /// Bridge for C Data Interface integration testing. + /// These methods are called from the Python integration testing + /// harness provided by Archery. + /// + public static class CDataInterface + { + // Archery uses the `pythonnet` library (*) to invoke .Net DLLs. + // `pythonnet` is only able to marshal simple types such as int and + // str, which is why we provide trivial wrappers around other APIs. + // + // (*) https://pythonnet.github.io/ + + public static void Initialize() + { + // Allow debugging using Debug.WriteLine() + Trace.Listeners.Add(new ConsoleTraceListener()); + } + + public static unsafe Schema ImportSchema(long ptr) + { + return CArrowSchemaImporter.ImportSchema((CArrowSchema*) ptr); + } + + public static unsafe void ExportSchema(Schema schema, long ptr) + { + CArrowSchemaExporter.ExportSchema(schema, (CArrowSchema*) ptr); + } + + public static unsafe RecordBatch ImportRecordBatch(long ptr, Schema schema) + { + return CArrowArrayImporter.ImportRecordBatch((CArrowArray*) ptr, schema); + } + + public static unsafe void ExportRecordBatch(RecordBatch batch, long ptr) + { + CArrowArrayExporter.ExportRecordBatch(batch, (CArrowArray*) ptr); + } + + public static JsonFile ParseJsonFile(string jsonPath) + { + return JsonFile.Parse(new FileInfo(jsonPath)); + } + + public static void RunGC() + { + GC.Collect(); + GC.WaitForPendingFinalizers(); + } + } +} diff --git a/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs b/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs index abf7451e5e98c..d19d19f1ce7c1 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs @@ -72,7 +72,7 @@ private async Task Validate() return -1; } - Schema jsonFileSchema = CreateSchema(jsonFile.Schema); + Schema jsonFileSchema = jsonFile.Schema.ToArrow(); Schema arrowFileSchema = reader.Schema; SchemaComparer.Compare(jsonFileSchema, arrowFileSchema); @@ -80,7 +80,7 @@ private async Task Validate() for (int i = 0; i < batchCount; i++) { RecordBatch arrowFileRecordBatch = reader.ReadNextRecordBatch(); - RecordBatch jsonFileRecordBatch = CreateRecordBatch(jsonFileSchema, jsonFile.Batches[i]); + RecordBatch jsonFileRecordBatch = 
jsonFile.Batches[i].ToArrow(jsonFileSchema); ArrowReaderVerifier.CompareBatches(jsonFileRecordBatch, arrowFileRecordBatch, strictCompare: false); } @@ -98,7 +98,7 @@ private async Task Validate() private async Task JsonToArrow() { JsonFile jsonFile = await ParseJsonFile(); - Schema schema = CreateSchema(jsonFile.Schema); + Schema schema = jsonFile.Schema.ToArrow(); using (FileStream fs = ArrowFileInfo.Create()) { @@ -107,7 +107,7 @@ private async Task JsonToArrow() foreach (var jsonRecordBatch in jsonFile.Batches) { - RecordBatch batch = CreateRecordBatch(schema, jsonRecordBatch); + RecordBatch batch = jsonRecordBatch.ToArrow(schema); await writer.WriteRecordBatchAsync(batch); } await writer.WriteEndAsync(); @@ -117,534 +117,6 @@ private async Task JsonToArrow() return 0; } - private RecordBatch CreateRecordBatch(Schema schema, JsonRecordBatch jsonRecordBatch) - { - if (schema.FieldsList.Count != jsonRecordBatch.Columns.Count) - { - throw new NotSupportedException($"jsonRecordBatch.Columns.Count '{jsonRecordBatch.Columns.Count}' doesn't match schema field count '{schema.FieldsList.Count}'"); - } - - List arrays = new List(jsonRecordBatch.Columns.Count); - for (int i = 0; i < jsonRecordBatch.Columns.Count; i++) - { - JsonFieldData data = jsonRecordBatch.Columns[i]; - Field field = schema.GetFieldByName(data.Name); - ArrayCreator creator = new ArrayCreator(data); - field.DataType.Accept(creator); - arrays.Add(creator.Array); - } - - return new RecordBatch(schema, arrays, jsonRecordBatch.Count); - } - - private static Schema CreateSchema(JsonSchema jsonSchema) - { - Schema.Builder builder = new Schema.Builder(); - for (int i = 0; i < jsonSchema.Fields.Count; i++) - { - builder.Field(f => CreateField(f, jsonSchema.Fields[i])); - } - return builder.Build(); - } - - private static void CreateField(Field.Builder builder, JsonField jsonField) - { - Field[] children = null; - if (jsonField.Children?.Count > 0) - { - children = new Field[jsonField.Children.Count]; - for (int i = 0; i < jsonField.Children.Count; i++) - { - Field.Builder field = new Field.Builder(); - CreateField(field, jsonField.Children[i]); - children[i] = field.Build(); - } - } - - builder.Name(jsonField.Name) - .DataType(ToArrowType(jsonField.Type, children)) - .Nullable(jsonField.Nullable); - - if (jsonField.Metadata != null) - { - builder.Metadata(jsonField.Metadata); - } - } - - private static IArrowType ToArrowType(JsonArrowType type, Field[] children) - { - return type.Name switch - { - "bool" => BooleanType.Default, - "int" => ToIntArrowType(type), - "floatingpoint" => ToFloatingPointArrowType(type), - "decimal" => ToDecimalArrowType(type), - "binary" => BinaryType.Default, - "utf8" => StringType.Default, - "fixedsizebinary" => new FixedSizeBinaryType(type.ByteWidth), - "date" => ToDateArrowType(type), - "time" => ToTimeArrowType(type), - "timestamp" => ToTimestampArrowType(type), - "list" => ToListArrowType(type, children), - "fixedsizelist" => ToFixedSizeListArrowType(type, children), - "struct" => ToStructArrowType(type, children), - "null" => NullType.Default, - _ => throw new NotSupportedException($"JsonArrowType not supported: {type.Name}") - }; - } - - private static IArrowType ToIntArrowType(JsonArrowType type) - { - return (type.BitWidth, type.IsSigned) switch - { - (8, true) => Int8Type.Default, - (8, false) => UInt8Type.Default, - (16, true) => Int16Type.Default, - (16, false) => UInt16Type.Default, - (32, true) => Int32Type.Default, - (32, false) => UInt32Type.Default, - (64, true) => Int64Type.Default, - (64, 
false) => UInt64Type.Default, - _ => throw new NotSupportedException($"Int type not supported: {type.BitWidth}, {type.IsSigned}") - }; - } - - private static IArrowType ToFloatingPointArrowType(JsonArrowType type) - { - return type.FloatingPointPrecision switch - { - "SINGLE" => FloatType.Default, - "DOUBLE" => DoubleType.Default, - _ => throw new NotSupportedException($"FloatingPoint type not supported: {type.FloatingPointPrecision}") - }; - } - - private static IArrowType ToDecimalArrowType(JsonArrowType type) - { - return type.BitWidth switch - { - 256 => new Decimal256Type(type.DecimalPrecision, type.Scale), - _ => new Decimal128Type(type.DecimalPrecision, type.Scale), - }; - } - - private static IArrowType ToDateArrowType(JsonArrowType type) - { - return type.Unit switch - { - "DAY" => Date32Type.Default, - "MILLISECOND" => Date64Type.Default, - _ => throw new NotSupportedException($"Date type not supported: {type.Unit}") - }; - } - - private static IArrowType ToTimeArrowType(JsonArrowType type) - { - return (type.Unit, type.BitWidth) switch - { - ("SECOND", 32) => new Time32Type(TimeUnit.Second), - ("SECOND", 64) => new Time64Type(TimeUnit.Second), - ("MILLISECOND", 32) => new Time32Type(TimeUnit.Millisecond), - ("MILLISECOND", 64) => new Time64Type(TimeUnit.Millisecond), - ("MICROSECOND", 32) => new Time32Type(TimeUnit.Microsecond), - ("MICROSECOND", 64) => new Time64Type(TimeUnit.Microsecond), - ("NANOSECOND", 32) => new Time32Type(TimeUnit.Nanosecond), - ("NANOSECOND", 64) => new Time64Type(TimeUnit.Nanosecond), - _ => throw new NotSupportedException($"Time type not supported: {type.Unit}, {type.BitWidth}") - }; - } - - private static IArrowType ToTimestampArrowType(JsonArrowType type) - { - return type.Unit switch - { - "SECOND" => new TimestampType(TimeUnit.Second, type.Timezone), - "MILLISECOND" => new TimestampType(TimeUnit.Millisecond, type.Timezone), - "MICROSECOND" => new TimestampType(TimeUnit.Microsecond, type.Timezone), - "NANOSECOND" => new TimestampType(TimeUnit.Nanosecond, type.Timezone), - _ => throw new NotSupportedException($"Time type not supported: {type.Unit}, {type.BitWidth}") - }; - } - - private static IArrowType ToListArrowType(JsonArrowType type, Field[] children) - { - return new ListType(children[0]); - } - - private static IArrowType ToFixedSizeListArrowType(JsonArrowType type, Field[] children) - { - return new FixedSizeListType(children[0], type.ListSize); - } - - private static IArrowType ToStructArrowType(JsonArrowType type, Field[] children) - { - return new StructType(children); - } - - private class ArrayCreator : - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor, - IArrowTypeVisitor - { - private JsonFieldData JsonFieldData { get; set; } - public IArrowArray Array { get; private set; } - - public ArrayCreator(JsonFieldData jsonFieldData) - { - JsonFieldData = jsonFieldData; - } - - public void Visit(BooleanType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - ArrowBuffer.BitmapBuilder valueBuilder = new ArrowBuffer.BitmapBuilder(validityBuffer.Length); - - var json = 
JsonFieldData.Data.GetRawText(); - bool[] values = JsonSerializer.Deserialize(json); - - foreach (bool value in values) - { - valueBuilder.Append(value); - } - ArrowBuffer valueBuffer = valueBuilder.Build(); - - Array = new BooleanArray( - valueBuffer, validityBuffer, - JsonFieldData.Count, nullCount, 0); - } - - public void Visit(Int8Type type) => GenerateArray((v, n, c, nc, o) => new Int8Array(v, n, c, nc, o)); - public void Visit(Int16Type type) => GenerateArray((v, n, c, nc, o) => new Int16Array(v, n, c, nc, o)); - public void Visit(Int32Type type) => GenerateArray((v, n, c, nc, o) => new Int32Array(v, n, c, nc, o)); - public void Visit(Int64Type type) => GenerateLongArray((v, n, c, nc, o) => new Int64Array(v, n, c, nc, o), s => long.Parse(s)); - public void Visit(UInt8Type type) => GenerateArray((v, n, c, nc, o) => new UInt8Array(v, n, c, nc, o)); - public void Visit(UInt16Type type) => GenerateArray((v, n, c, nc, o) => new UInt16Array(v, n, c, nc, o)); - public void Visit(UInt32Type type) => GenerateArray((v, n, c, nc, o) => new UInt32Array(v, n, c, nc, o)); - public void Visit(UInt64Type type) => GenerateLongArray((v, n, c, nc, o) => new UInt64Array(v, n, c, nc, o), s => ulong.Parse(s)); - public void Visit(FloatType type) => GenerateArray((v, n, c, nc, o) => new FloatArray(v, n, c, nc, o)); - public void Visit(DoubleType type) => GenerateArray((v, n, c, nc, o) => new DoubleArray(v, n, c, nc, o)); - public void Visit(Time32Type type) => GenerateArray((v, n, c, nc, o) => new Time32Array(type, v, n, c, nc, o)); - public void Visit(Time64Type type) => GenerateLongArray((v, n, c, nc, o) => new Time64Array(type, v, n, c, nc, o), s => long.Parse(s)); - - public void Visit(Decimal128Type type) - { - Array = new Decimal128Array(GetDecimalArrayData(type)); - } - - public void Visit(Decimal256Type type) - { - Array = new Decimal256Array(GetDecimalArrayData(type)); - } - - public void Visit(NullType type) - { - Array = new NullArray(JsonFieldData.Count); - } - - private ArrayData GetDecimalArrayData(FixedSizeBinaryType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json, s_options); - - Span buffer = stackalloc byte[type.ByteWidth]; - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); - foreach (string value in values) - { - buffer.Fill(0); - - BigInteger bigInteger = BigInteger.Parse(value); - if (!bigInteger.TryWriteBytes(buffer, out int bytesWritten, false, !BitConverter.IsLittleEndian)) - { - throw new InvalidDataException($"Decimal data was too big to fit into {type.BitWidth} bits."); - } - - if (bigInteger.Sign == -1) - { - buffer.Slice(bytesWritten).Fill(255); - } - - valueBuilder.Append(buffer); - } - ArrowBuffer valueBuffer = valueBuilder.Build(default); - - return new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, valueBuffer }); - } - - public void Visit(Date32Type type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(JsonFieldData.Count); - var json = JsonFieldData.Data.GetRawText(); - int[] values = JsonSerializer.Deserialize(json, s_options); - - foreach (int value in values) - { - valueBuilder.Append(value); - } - ArrowBuffer valueBuffer = valueBuilder.Build(); - - Array = new Date32Array( - valueBuffer, validityBuffer, - JsonFieldData.Count, nullCount, 0); - } - - public void Visit(Date64Type type) - { - ArrowBuffer 
validityBuffer = GetValidityBuffer(out int nullCount); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(JsonFieldData.Count); - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json, s_options); - - foreach (string value in values) - { - valueBuilder.Append(long.Parse(value)); - } - ArrowBuffer valueBuffer = valueBuilder.Build(); - - Array = new Date64Array( - valueBuffer, validityBuffer, - JsonFieldData.Count, nullCount, 0); - } - - public void Visit(TimestampType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(JsonFieldData.Count); - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json, s_options); - - foreach (string value in values) - { - valueBuilder.Append(long.Parse(value)); - } - ArrowBuffer valueBuffer = valueBuilder.Build(); - - Array = new TimestampArray( - type, valueBuffer, validityBuffer, - JsonFieldData.Count, nullCount, 0); - } - - public void Visit(StringType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - ArrowBuffer offsetBuffer = GetOffsetBuffer(); - - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json, s_options); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); - foreach (string value in values) - { - valueBuilder.Append(Encoding.UTF8.GetBytes(value)); - } - ArrowBuffer valueBuffer = valueBuilder.Build(default); - - Array = new StringArray(JsonFieldData.Count, offsetBuffer, valueBuffer, validityBuffer, nullCount); - } - - public void Visit(BinaryType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - ArrowBuffer offsetBuffer = GetOffsetBuffer(); - - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json, s_options); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); - foreach (string value in values) - { - valueBuilder.Append(ConvertHexStringToByteArray(value)); - } - ArrowBuffer valueBuffer = valueBuilder.Build(default); - - ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, offsetBuffer, valueBuffer }); - Array = new BinaryArray(arrayData); - } - - public void Visit(FixedSizeBinaryType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json, s_options); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); - foreach (string value in values) - { - valueBuilder.Append(ConvertHexStringToByteArray(value)); - } - ArrowBuffer valueBuffer = valueBuilder.Build(default); - - ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, valueBuffer }); - Array = new FixedSizeBinaryArray(arrayData); - } - - public void Visit(ListType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - ArrowBuffer offsetBuffer = GetOffsetBuffer(); - - var data = JsonFieldData; - JsonFieldData = data.Children[0]; - type.ValueDataType.Accept(this); - JsonFieldData = data; - - ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, - new[] { validityBuffer, offsetBuffer }, new[] { Array.Data }); - Array = new ListArray(arrayData); - } - - public void Visit(FixedSizeListType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - var 
data = JsonFieldData; - JsonFieldData = data.Children[0]; - type.ValueDataType.Accept(this); - JsonFieldData = data; - - ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, - new[] { validityBuffer }, new[] { Array.Data }); - Array = new FixedSizeListArray(arrayData); - } - - public void Visit(StructType type) - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - ArrayData[] children = new ArrayData[type.Fields.Count]; - - var data = JsonFieldData; - for (int i = 0; i < children.Length; i++) - { - JsonFieldData = data.Children[i]; - type.Fields[i].DataType.Accept(this); - children[i] = Array.Data; - } - JsonFieldData = data; - - ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, - new[] { validityBuffer }, children); - Array = new StructArray(arrayData); - } - - private static byte[] ConvertHexStringToByteArray(string hexString) - { - byte[] data = new byte[hexString.Length / 2]; - for (int index = 0; index < data.Length; index++) - { - data[index] = byte.Parse(hexString.AsSpan(index * 2, 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture); - } - - return data; - } - - private static readonly JsonSerializerOptions s_options = new JsonSerializerOptions() - { - Converters = - { - new ByteArrayConverter() - } - }; - - private void GenerateArray(Func createArray) - where TArray : PrimitiveArray - where T : struct - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(JsonFieldData.Count); - var json = JsonFieldData.Data.GetRawText(); - T[] values = JsonSerializer.Deserialize(json, s_options); - - foreach (T value in values) - { - valueBuilder.Append(value); - } - ArrowBuffer valueBuffer = valueBuilder.Build(); - - Array = createArray( - valueBuffer, validityBuffer, - JsonFieldData.Count, nullCount, 0); - } - - private void GenerateLongArray(Func createArray, Func parse) - where TArray : PrimitiveArray - where T : struct - { - ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); - - ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(JsonFieldData.Count); - var json = JsonFieldData.Data.GetRawText(); - string[] values = JsonSerializer.Deserialize(json); - - foreach (string value in values) - { - valueBuilder.Append(parse(value)); - } - ArrowBuffer valueBuffer = valueBuilder.Build(); - - Array = createArray( - valueBuffer, validityBuffer, - JsonFieldData.Count, nullCount, 0); - } - - private ArrowBuffer GetOffsetBuffer() - { - ArrowBuffer.Builder valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Length); - valueOffsets.AppendRange(JsonFieldData.Offset); - return valueOffsets.Build(default); - } - - private ArrowBuffer GetValidityBuffer(out int nullCount) - { - if (JsonFieldData.Validity == null) - { - nullCount = 0; - return ArrowBuffer.Empty; - } - - ArrowBuffer.BitmapBuilder validityBuilder = new ArrowBuffer.BitmapBuilder(JsonFieldData.Validity.Length); - validityBuilder.AppendRange(JsonFieldData.Validity); - - nullCount = validityBuilder.UnsetBitCount; - return validityBuilder.Build(); - } - - public void Visit(IArrowType type) - { - throw new NotImplementedException($"{type.Name} not implemented"); - } - } - private async Task StreamToFile() { using ArrowStreamReader reader = new ArrowStreamReader(Console.OpenStandardInput()); @@ -691,14 +163,7 @@ private async Task FileToStream() private async ValueTask ParseJsonFile() { - using var fileStream = JsonFileInfo.OpenRead(); - JsonSerializerOptions options 
= new JsonSerializerOptions() - { - PropertyNamingPolicy = JsonFileNamingPolicy.Instance, - }; - options.Converters.Add(new ValidityConverter()); - - return await JsonSerializer.DeserializeAsync(fileStream, options); + return await JsonFile.ParseAsync(JsonFileInfo); } } } diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index f0f63d3e19b8c..85f66890edf47 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -15,8 +15,16 @@ using System; using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.IO; +using System.Numerics; +using System.Text; using System.Text.Json; using System.Text.Json.Serialization; +using System.Threading.Tasks; +using Apache.Arrow.Arrays; +using Apache.Arrow.Types; namespace Apache.Arrow.IntegrationTest { @@ -25,12 +33,200 @@ public class JsonFile public JsonSchema Schema { get; set; } public List Batches { get; set; } //public List Dictionaries {get;set;} + + public static async ValueTask ParseAsync(FileInfo fileInfo) + { + using var fileStream = fileInfo.OpenRead(); + var options = GetJsonOptions(); + return await JsonSerializer.DeserializeAsync(fileStream, options); + } + + public static JsonFile Parse(FileInfo fileInfo) + { + using var fileStream = fileInfo.OpenRead(); + var options = GetJsonOptions(); + return JsonSerializer.Deserialize(fileStream, options); + } + + private static JsonSerializerOptions GetJsonOptions() + { + JsonSerializerOptions options = new JsonSerializerOptions() + { + PropertyNamingPolicy = JsonFileNamingPolicy.Instance, + }; + options.Converters.Add(new ValidityConverter()); + return options; + } } public class JsonSchema { public List Fields { get; set; } public JsonMetadata Metadata { get; set; } + + /// + /// Decode this JSON schema as a Schema instance. 
+ /// + public Schema ToArrow() + { + return CreateSchema(this); + } + + private static Schema CreateSchema(JsonSchema jsonSchema) + { + Schema.Builder builder = new Schema.Builder(); + for (int i = 0; i < jsonSchema.Fields.Count; i++) + { + builder.Field(f => CreateField(f, jsonSchema.Fields[i])); + } + return builder.Build(); + } + + private static void CreateField(Field.Builder builder, JsonField jsonField) + { + Field[] children = null; + if (jsonField.Children?.Count > 0) + { + children = new Field[jsonField.Children.Count]; + for (int i = 0; i < jsonField.Children.Count; i++) + { + Field.Builder field = new Field.Builder(); + CreateField(field, jsonField.Children[i]); + children[i] = field.Build(); + } + } + + builder.Name(jsonField.Name) + .DataType(ToArrowType(jsonField.Type, children)) + .Nullable(jsonField.Nullable); + + if (jsonField.Metadata != null) + { + builder.Metadata(jsonField.Metadata); + } + } + + private static IArrowType ToArrowType(JsonArrowType type, Field[] children) + { + return type.Name switch + { + "bool" => BooleanType.Default, + "int" => ToIntArrowType(type), + "floatingpoint" => ToFloatingPointArrowType(type), + "decimal" => ToDecimalArrowType(type), + "binary" => BinaryType.Default, + "utf8" => StringType.Default, + "fixedsizebinary" => new FixedSizeBinaryType(type.ByteWidth), + "date" => ToDateArrowType(type), + "time" => ToTimeArrowType(type), + "timestamp" => ToTimestampArrowType(type), + "list" => ToListArrowType(type, children), + "fixedsizelist" => ToFixedSizeListArrowType(type, children), + "struct" => ToStructArrowType(type, children), + "union" => ToUnionArrowType(type, children), + "null" => NullType.Default, + _ => throw new NotSupportedException($"JsonArrowType not supported: {type.Name}") + }; + } + + private static IArrowType ToIntArrowType(JsonArrowType type) + { + return (type.BitWidth, type.IsSigned) switch + { + (8, true) => Int8Type.Default, + (8, false) => UInt8Type.Default, + (16, true) => Int16Type.Default, + (16, false) => UInt16Type.Default, + (32, true) => Int32Type.Default, + (32, false) => UInt32Type.Default, + (64, true) => Int64Type.Default, + (64, false) => UInt64Type.Default, + _ => throw new NotSupportedException($"Int type not supported: {type.BitWidth}, {type.IsSigned}") + }; + } + + private static IArrowType ToFloatingPointArrowType(JsonArrowType type) + { + return type.FloatingPointPrecision switch + { + "SINGLE" => FloatType.Default, + "DOUBLE" => DoubleType.Default, + _ => throw new NotSupportedException($"FloatingPoint type not supported: {type.FloatingPointPrecision}") + }; + } + + private static IArrowType ToDecimalArrowType(JsonArrowType type) + { + return type.BitWidth switch + { + 256 => new Decimal256Type(type.DecimalPrecision, type.Scale), + _ => new Decimal128Type(type.DecimalPrecision, type.Scale), + }; + } + + private static IArrowType ToDateArrowType(JsonArrowType type) + { + return type.Unit switch + { + "DAY" => Date32Type.Default, + "MILLISECOND" => Date64Type.Default, + _ => throw new NotSupportedException($"Date type not supported: {type.Unit}") + }; + } + + private static IArrowType ToTimeArrowType(JsonArrowType type) + { + return (type.Unit, type.BitWidth) switch + { + ("SECOND", 32) => new Time32Type(TimeUnit.Second), + ("SECOND", 64) => new Time64Type(TimeUnit.Second), + ("MILLISECOND", 32) => new Time32Type(TimeUnit.Millisecond), + ("MILLISECOND", 64) => new Time64Type(TimeUnit.Millisecond), + ("MICROSECOND", 32) => new Time32Type(TimeUnit.Microsecond), + ("MICROSECOND", 64) => new 
 
     public class JsonField
@@ -60,7 +256,7 @@ public class JsonArrowType
         public int DecimalPrecision => ExtensionData["precision"].GetInt32();
         public int Scale { get; set; }
 
-        // date and time fields
+        // date and time fields
         public string Unit { get; set; }
         // timestamp fields
         public string Timezone { get; set; }
@@ -71,6 +267,10 @@ public class JsonArrowType
         // FixedSizeList fields
         public int ListSize { get; set; }
 
+        // union fields
+        public string Mode { get; set; }
+        public int[] TypeIds { get; set; }
+
         [JsonExtensionData]
         public Dictionary<string, JsonElement> ExtensionData { get; set; }
     }
@@ -90,6 +290,446 @@ public class JsonRecordBatch
     {
         public int Count { get; set; }
         public List<JsonFieldData> Columns { get; set; }
+
+        /// <summary>
+        /// Decode this JSON record batch as a RecordBatch instance.
+        /// </summary>
+        public RecordBatch ToArrow(Schema schema)
+        {
+            return CreateRecordBatch(schema, this);
+        }
+
+        private RecordBatch CreateRecordBatch(Schema schema, JsonRecordBatch jsonRecordBatch)
+        {
+            if (schema.FieldsList.Count != jsonRecordBatch.Columns.Count)
+            {
+                throw new NotSupportedException($"jsonRecordBatch.Columns.Count '{jsonRecordBatch.Columns.Count}' doesn't match schema field count '{schema.FieldsList.Count}'");
+            }
+
+            List<IArrowArray> arrays = new List<IArrowArray>(jsonRecordBatch.Columns.Count);
+            for (int i = 0; i < jsonRecordBatch.Columns.Count; i++)
+            {
+                JsonFieldData data = jsonRecordBatch.Columns[i];
+                Field field = schema.FieldsList[i];
+                ArrayCreator creator = new ArrayCreator(data);
+                field.DataType.Accept(creator);
+                arrays.Add(creator.Array);
+            }
+
+            return new RecordBatch(schema, arrays, jsonRecordBatch.Count);
+        }
+
+        private class ArrayCreator :
+            IArrowTypeVisitor<BooleanType>,
+            IArrowTypeVisitor<Int8Type>,
+            IArrowTypeVisitor<Int16Type>,
+            IArrowTypeVisitor<Int32Type>,
+            IArrowTypeVisitor<Int64Type>,
+            IArrowTypeVisitor<UInt8Type>,
+            IArrowTypeVisitor<UInt16Type>,
+            IArrowTypeVisitor<UInt32Type>,
+            IArrowTypeVisitor<UInt64Type>,
+            IArrowTypeVisitor<FloatType>,
+            IArrowTypeVisitor<DoubleType>,
+            IArrowTypeVisitor<Time32Type>,
+            IArrowTypeVisitor<Time64Type>,
+            IArrowTypeVisitor<Decimal128Type>,
+            IArrowTypeVisitor<Decimal256Type>,
+            IArrowTypeVisitor<NullType>,
+            IArrowTypeVisitor<Date32Type>,
+            IArrowTypeVisitor<Date64Type>,
+            IArrowTypeVisitor<TimestampType>,
+            IArrowTypeVisitor<StringType>,
+            IArrowTypeVisitor<BinaryType>,
+            IArrowTypeVisitor<FixedSizeBinaryType>,
+            IArrowTypeVisitor<ListType>,
+            IArrowTypeVisitor<FixedSizeListType>,
+            IArrowTypeVisitor<StructType>,
+            IArrowTypeVisitor<UnionType>
+        {
+            private JsonFieldData JsonFieldData { get; set; }
+            public IArrowArray Array { get; private set; }
+
+            public ArrayCreator(JsonFieldData jsonFieldData)
+            {
+                JsonFieldData = jsonFieldData;
+            }
+
+            public void Visit(BooleanType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+                ArrowBuffer.BitmapBuilder valueBuilder = new ArrowBuffer.BitmapBuilder(validityBuffer.Length);
+
+                var json = JsonFieldData.Data.GetRawText();
+                bool[] values = JsonSerializer.Deserialize<bool[]>(json);
+
+                foreach (bool value in values)
+                {
+                    valueBuilder.Append(value);
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build();
+
+                Array = new BooleanArray(
+                    valueBuffer, validityBuffer,
+                    JsonFieldData.Count, nullCount, 0);
+            }
+
+            public void Visit(Int8Type type) => GenerateArray<sbyte, Int8Array>((v, n, c, nc, o) => new Int8Array(v, n, c, nc, o));
+            public void Visit(Int16Type type) => GenerateArray<short, Int16Array>((v, n, c, nc, o) => new Int16Array(v, n, c, nc, o));
+            public void Visit(Int32Type type) => GenerateArray<int, Int32Array>((v, n, c, nc, o) => new Int32Array(v, n, c, nc, o));
+            public void Visit(Int64Type type) => GenerateLongArray<long, Int64Array>((v, n, c, nc, o) => new Int64Array(v, n, c, nc, o), s => long.Parse(s));
+            public void Visit(UInt8Type type) => GenerateArray<byte, UInt8Array>((v, n, c, nc, o) => new UInt8Array(v, n, c, nc, o));
+            public void Visit(UInt16Type type) => GenerateArray<ushort, UInt16Array>((v, n, c, nc, o) => new UInt16Array(v, n, c, nc, o));
+            public void Visit(UInt32Type type) => GenerateArray<uint, UInt32Array>((v, n, c, nc, o) => new UInt32Array(v, n, c, nc, o));
+            public void Visit(UInt64Type type) => GenerateLongArray<ulong, UInt64Array>((v, n, c, nc, o) => new UInt64Array(v, n, c, nc, o), s => ulong.Parse(s));
+            public void Visit(FloatType type) => GenerateArray<float, FloatArray>((v, n, c, nc, o) => new FloatArray(v, n, c, nc, o));
+            public void Visit(DoubleType type) => GenerateArray<double, DoubleArray>((v, n, c, nc, o) => new DoubleArray(v, n, c, nc, o));
+            public void Visit(Time32Type type) => GenerateArray<int, Time32Array>((v, n, c, nc, o) => new Time32Array(type, v, n, c, nc, o));
+            public void Visit(Time64Type type) => GenerateLongArray<long, Time64Array>((v, n, c, nc, o) => new Time64Array(type, v, n, c, nc, o), s => long.Parse(s));
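A note on the split between `GenerateArray` and `GenerateLongArray` above (both defined later in this class): the integration JSON format stores 64-bit integer columns as strings, because consumers that keep JSON numbers as IEEE doubles lose precision past 2^53, hence the extra `parse` delegate. A self-contained illustration of that rationale (not patch code):

```csharp
class LongPrecisionExample
{
    static void Main()
    {
        // 2^53 + 1 cannot round-trip through a double, which is why
        // int64/uint64 (and date64/timestamp) values arrive as strings
        // and the visitors above parse them with long.Parse/ulong.Parse.
        long asLong = long.Parse("9007199254740993");
        double asDouble = 9007199254740993d;
        System.Console.WriteLine(asLong);           // 9007199254740993
        System.Console.WriteLine((long)asDouble);   // 9007199254740992
    }
}
```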
+
+            public void Visit(Decimal128Type type)
+            {
+                Array = new Decimal128Array(GetDecimalArrayData(type));
+            }
+
+            public void Visit(Decimal256Type type)
+            {
+                Array = new Decimal256Array(GetDecimalArrayData(type));
+            }
+
+            public void Visit(NullType type)
+            {
+                Array = new NullArray(JsonFieldData.Count);
+            }
+
+            private ArrayData GetDecimalArrayData(FixedSizeBinaryType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json, s_options);
+
+                Span<byte> buffer = stackalloc byte[type.ByteWidth];
+
+                ArrowBuffer.Builder<byte> valueBuilder = new ArrowBuffer.Builder<byte>();
+                foreach (string value in values)
+                {
+                    buffer.Fill(0);
+
+                    BigInteger bigInteger = BigInteger.Parse(value);
+                    if (!bigInteger.TryWriteBytes(buffer, out int bytesWritten, false, !BitConverter.IsLittleEndian))
+                    {
+                        throw new InvalidDataException($"Decimal data was too big to fit into {type.BitWidth} bits.");
+                    }
+
+                    if (bigInteger.Sign == -1)
+                    {
+                        buffer.Slice(bytesWritten).Fill(255);
+                    }
+
+                    valueBuilder.Append(buffer);
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build(default);
+
+                return new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, valueBuffer });
+            }
+
+            public void Visit(Date32Type type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                ArrowBuffer.Builder<int> valueBuilder = new ArrowBuffer.Builder<int>(JsonFieldData.Count);
+                var json = JsonFieldData.Data.GetRawText();
+                int[] values = JsonSerializer.Deserialize<int[]>(json, s_options);
+
+                foreach (int value in values)
+                {
+                    valueBuilder.Append(value);
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build();
+
+                Array = new Date32Array(
+                    valueBuffer, validityBuffer,
+                    JsonFieldData.Count, nullCount, 0);
+            }
+
+            public void Visit(Date64Type type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                ArrowBuffer.Builder<long> valueBuilder = new ArrowBuffer.Builder<long>(JsonFieldData.Count);
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json, s_options);
+
+                foreach (string value in values)
+                {
+                    valueBuilder.Append(long.Parse(value));
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build();
+
+                Array = new Date64Array(
+                    valueBuffer, validityBuffer,
+                    JsonFieldData.Count, nullCount, 0);
+            }
+
+            public void Visit(TimestampType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                ArrowBuffer.Builder<long> valueBuilder = new ArrowBuffer.Builder<long>(JsonFieldData.Count);
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json, s_options);
+
+                foreach (string value in values)
+                {
+                    valueBuilder.Append(long.Parse(value));
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build();
+
+                Array = new TimestampArray(
+                    type, valueBuffer, validityBuffer,
+                    JsonFieldData.Count, nullCount, 0);
+            }
+
+            public void Visit(StringType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+                ArrowBuffer offsetBuffer = GetOffsetBuffer();
+
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json, s_options);
+
+                ArrowBuffer.Builder<byte> valueBuilder = new ArrowBuffer.Builder<byte>();
+                foreach (string value in values)
+                {
+                    valueBuilder.Append(Encoding.UTF8.GetBytes(value));
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build(default);
+
+                Array = new StringArray(JsonFieldData.Count, offsetBuffer, valueBuffer, validityBuffer, nullCount);
+            }
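The trickiest step in `GetDecimalArrayData` above is the manual sign extension: `BigInteger.TryWriteBytes` emits only the minimal little-endian two's-complement bytes, so negative values must have the remaining high bytes of the fixed-width slot filled with `0xFF`. A standalone sketch of that step (illustration only, not patch code):

```csharp
using System;
using System.Numerics;

class SignExtendExample
{
    static void Main()
    {
        Span<byte> slot = stackalloc byte[16];   // one Decimal128 value
        var value = BigInteger.Parse("-12345");

        value.TryWriteBytes(slot, out int written, isUnsigned: false, isBigEndian: false);
        if (value.Sign < 0)
        {
            slot.Slice(written).Fill(0xFF);      // sign-extend to the full 16 bytes
        }
        Console.WriteLine(written);              // 2 -- only the minimal bytes were written
    }
}
```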
+
+            public void Visit(BinaryType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+                ArrowBuffer offsetBuffer = GetOffsetBuffer();
+
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json, s_options);
+
+                ArrowBuffer.Builder<byte> valueBuilder = new ArrowBuffer.Builder<byte>();
+                foreach (string value in values)
+                {
+                    valueBuilder.Append(ConvertHexStringToByteArray(value));
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build(default);
+
+                ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, offsetBuffer, valueBuffer });
+                Array = new BinaryArray(arrayData);
+            }
+
+            public void Visit(FixedSizeBinaryType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json, s_options);
+
+                ArrowBuffer.Builder<byte> valueBuilder = new ArrowBuffer.Builder<byte>();
+                foreach (string value in values)
+                {
+                    valueBuilder.Append(ConvertHexStringToByteArray(value));
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build(default);
+
+                ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, valueBuffer });
+                Array = new FixedSizeBinaryArray(arrayData);
+            }
+
+            public void Visit(ListType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+                ArrowBuffer offsetBuffer = GetOffsetBuffer();
+
+                var data = JsonFieldData;
+                JsonFieldData = data.Children[0];
+                type.ValueDataType.Accept(this);
+                JsonFieldData = data;
+
+                ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0,
+                    new[] { validityBuffer, offsetBuffer }, new[] { Array.Data });
+                Array = new ListArray(arrayData);
+            }
+
+            public void Visit(FixedSizeListType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                var data = JsonFieldData;
+                JsonFieldData = data.Children[0];
+                type.ValueDataType.Accept(this);
+                JsonFieldData = data;
+
+                ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0,
+                    new[] { validityBuffer }, new[] { Array.Data });
+                Array = new FixedSizeListArray(arrayData);
+            }
+
+            public void Visit(StructType type)
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                ArrayData[] children = new ArrayData[type.Fields.Count];
+
+                var data = JsonFieldData;
+                for (int i = 0; i < children.Length; i++)
+                {
+                    JsonFieldData = data.Children[i];
+                    type.Fields[i].DataType.Accept(this);
+                    children[i] = Array.Data;
+                }
+                JsonFieldData = data;
+
+                ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0,
+                    new[] { validityBuffer }, children);
+                Array = new StructArray(arrayData);
+            }
+
+            public void Visit(UnionType type)
+            {
+                ArrowBuffer[] buffers;
+                if (type.Mode == UnionMode.Dense)
+                {
+                    buffers = new ArrowBuffer[2];
+                    buffers[1] = GetOffsetBuffer();
+                }
+                else
+                {
+                    buffers = new ArrowBuffer[1];
+                }
+                buffers[0] = GetTypeIdBuffer();
+
+                ArrayData[] children = GetChildren(type);
+
+                int nullCount = 0;
+                ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers, children);
+                Array = UnionArray.Create(arrayData);
+            }
+
+            private ArrayData[] GetChildren(NestedType type)
+            {
+                ArrayData[] children = new ArrayData[type.Fields.Count];
+
+                var data = JsonFieldData;
+                for (int i = 0; i < children.Length; i++)
+                {
+                    JsonFieldData = data.Children[i];
+                    type.Fields[i].DataType.Accept(this);
+                    children[i] = Array.Data;
+                }
+                JsonFieldData = data;
+
+                return children;
+            }
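`Visit(UnionType)` above hinges on the buffer layout difference between the two union modes: dense unions carry a type-id buffer plus a 32-bit offsets buffer, while sparse unions carry only type ids, with every child as long as the parent. A minimal hand-built dense union, using only calls that appear elsewhere in this patch (a sketch under those assumptions, not patch code):

```csharp
using System;
using Apache.Arrow;
using Apache.Arrow.Types;

class DenseUnionExample
{
    static void Main()
    {
        var type = new UnionType(
            new[]
            {
                new Field("i32", Int32Type.Default, true),
                new Field("s", StringType.Default, true),
            },
            new[] { 0, 1 },
            UnionMode.Dense);

        // Slot layout: i32[0], s[0], i32[1]
        ArrowBuffer typeIds = new ArrowBuffer.Builder<byte>().AppendRange(new byte[] { 0, 1, 0 }).Build();
        ArrowBuffer offsets = new ArrowBuffer.Builder<int>().AppendRange(new[] { 0, 0, 1 }).Build();

        ArrayData ints = new Int32Array.Builder().Append(7).Append(8).Build().Data;
        ArrayData strings = new StringArray.Builder().Append("x").Build().Data;

        var union = UnionArray.Create(new ArrayData(
            type, 3, 0, 0,
            new[] { typeIds, offsets },   // a sparse union would pass only { typeIds }
            new[] { ints, strings }));
        Console.WriteLine(union.Length);  // 3
    }
}
```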
+
+            private static byte[] ConvertHexStringToByteArray(string hexString)
+            {
+                byte[] data = new byte[hexString.Length / 2];
+                for (int index = 0; index < data.Length; index++)
+                {
+                    data[index] = byte.Parse(hexString.AsSpan(index * 2, 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
+                }
+
+                return data;
+            }
+
+            private static readonly JsonSerializerOptions s_options = new JsonSerializerOptions()
+            {
+                Converters =
+                {
+                    new ByteArrayConverter()
+                }
+            };
+
+            private void GenerateArray<T, TArray>(Func<ArrowBuffer, ArrowBuffer, int, int, int, TArray> createArray)
+                where TArray : PrimitiveArray<T>
+                where T : struct
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                ArrowBuffer.Builder<T> valueBuilder = new ArrowBuffer.Builder<T>(JsonFieldData.Count);
+                var json = JsonFieldData.Data.GetRawText();
+                T[] values = JsonSerializer.Deserialize<T[]>(json, s_options);
+
+                foreach (T value in values)
+                {
+                    valueBuilder.Append(value);
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build();
+
+                Array = createArray(
+                    valueBuffer, validityBuffer,
+                    JsonFieldData.Count, nullCount, 0);
+            }
+
+            private void GenerateLongArray<T, TArray>(Func<ArrowBuffer, ArrowBuffer, int, int, int, TArray> createArray, Func<string, T> parse)
+                where TArray : PrimitiveArray<T>
+                where T : struct
+            {
+                ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount);
+
+                ArrowBuffer.Builder<T> valueBuilder = new ArrowBuffer.Builder<T>(JsonFieldData.Count);
+                var json = JsonFieldData.Data.GetRawText();
+                string[] values = JsonSerializer.Deserialize<string[]>(json);
+
+                foreach (string value in values)
+                {
+                    valueBuilder.Append(parse(value));
+                }
+                ArrowBuffer valueBuffer = valueBuilder.Build();
+
+                Array = createArray(
+                    valueBuffer, validityBuffer,
+                    JsonFieldData.Count, nullCount, 0);
+            }
+
+            private ArrowBuffer GetOffsetBuffer()
+            {
+                ArrowBuffer.Builder<int> valueOffsets = new ArrowBuffer.Builder<int>(JsonFieldData.Offset.Length);
+                valueOffsets.AppendRange(JsonFieldData.Offset);
+                return valueOffsets.Build(default);
+            }
+
+            private ArrowBuffer GetTypeIdBuffer()
+            {
+                ArrowBuffer.Builder<byte> typeIds = new ArrowBuffer.Builder<byte>(JsonFieldData.TypeId.Length);
+                for (int i = 0; i < JsonFieldData.TypeId.Length; i++)
+                {
+                    typeIds.Append(checked((byte)JsonFieldData.TypeId[i]));
+                }
+                return typeIds.Build(default);
+            }
+
+            private ArrowBuffer GetValidityBuffer(out int nullCount)
+            {
+                if (JsonFieldData.Validity == null)
+                {
+                    nullCount = 0;
+                    return ArrowBuffer.Empty;
+                }
+
+                ArrowBuffer.BitmapBuilder validityBuilder = new ArrowBuffer.BitmapBuilder(JsonFieldData.Validity.Length);
+                validityBuilder.AppendRange(JsonFieldData.Validity);
+
+                nullCount = validityBuilder.UnsetBitCount;
+                return validityBuilder.Build();
+            }
+
+            public void Visit(IArrowType type)
+            {
+                throw new NotImplementedException($"{type.Name} not implemented");
+            }
+        }
     }
 
     public class JsonFieldData
diff --git a/csharp/test/Apache.Arrow.Tests/ArrayTypeComparer.cs b/csharp/test/Apache.Arrow.Tests/ArrayTypeComparer.cs
index 77584aefb1bf4..c8bcc3cee0f99 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrayTypeComparer.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrayTypeComparer.cs
@@ -28,7 +28,8 @@ public class ArrayTypeComparer :
         IArrowTypeVisitor,
         IArrowTypeVisitor,
         IArrowTypeVisitor,
-        IArrowTypeVisitor<StructType>
+        IArrowTypeVisitor<StructType>,
+        IArrowTypeVisitor<UnionType>
     {
         private readonly IArrowType _expectedType;
@@ -114,6 +115,22 @@ public void Visit(StructType actualType)
             CompareNested(expectedType, actualType);
         }
 
+        public void Visit(UnionType actualType)
+        {
+            Assert.IsAssignableFrom<UnionType>(_expectedType);
+            UnionType expectedType = (UnionType)_expectedType;
+
+            Assert.Equal(expectedType.Mode, actualType.Mode);
+
+            Assert.Equal(expectedType.TypeIds.Length, actualType.TypeIds.Length);
+            for (int i = 0; i < 
expectedType.TypeIds.Length; i++) + { + Assert.Equal(expectedType.TypeIds[i], actualType.TypeIds[i]); + } + + CompareNested(expectedType, actualType); + } + private static void CompareNested(NestedType expectedType, NestedType actualType) { Assert.Equal(expectedType.Fields.Count, actualType.Fields.Count); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs index 36cffe7eb4da1..f5a2c345e2ae6 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs @@ -77,6 +77,22 @@ private static IEnumerable, IArrowArray>> GenerateTestDa new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() }), new FixedSizeListType(Int32Type.Default, 1), + new UnionType( + new List{ + new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), + new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() + }, + new[] { 0, 1 }, + UnionMode.Sparse + ), + new UnionType( + new List{ + new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), + new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() + }, + new[] { 0, 1 }, + UnionMode.Dense + ), }; foreach (IArrowType type in targetTypes) @@ -119,7 +135,8 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, - IArrowTypeVisitor + IArrowTypeVisitor, + IArrowTypeVisitor { private List> _baseData; @@ -392,6 +409,91 @@ public void Visit(StructType type) ExpectedArray = new StructArray(type, 3, new List { resultStringArray, resultInt32Array }, nullBitmapBuffer, 1); } + public void Visit(UnionType type) + { + bool isDense = type.Mode == UnionMode.Dense; + + StringArray.Builder stringResultBuilder = new StringArray.Builder().Reserve(_baseDataTotalElementCount); + Int32Array.Builder intResultBuilder = new Int32Array.Builder().Reserve(_baseDataTotalElementCount); + ArrowBuffer.Builder typeResultBuilder = new ArrowBuffer.Builder().Reserve(_baseDataTotalElementCount); + ArrowBuffer.Builder offsetResultBuilder = new ArrowBuffer.Builder().Reserve(_baseDataTotalElementCount); + int resultNullCount = 0; + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + StringArray.Builder stringBuilder = new StringArray.Builder().Reserve(dataList.Count); + Int32Array.Builder intBuilder = new Int32Array.Builder().Reserve(dataList.Count); + ArrowBuffer.Builder typeBuilder = new ArrowBuffer.Builder().Reserve(dataList.Count); + ArrowBuffer.Builder offsetBuilder = new ArrowBuffer.Builder().Reserve(dataList.Count); + int nullCount = 0; + + for (int j = 0; j < dataList.Count; j++) + { + byte index = (byte)Math.Max(j % 3, 1); + int? intValue = (index == 1) ? dataList[j] : null; + string stringValue = (index == 1) ? 
null : dataList[j]?.ToString(); + typeBuilder.Append(index); + + if (isDense) + { + if (index == 0) + { + offsetBuilder.Append(stringBuilder.Length); + offsetResultBuilder.Append(stringResultBuilder.Length); + stringBuilder.Append(stringValue); + stringResultBuilder.Append(stringValue); + } + else + { + offsetBuilder.Append(intBuilder.Length); + offsetResultBuilder.Append(intResultBuilder.Length); + intBuilder.Append(intValue); + intResultBuilder.Append(intValue); + } + } + else + { + stringBuilder.Append(stringValue); + stringResultBuilder.Append(stringValue); + intBuilder.Append(intValue); + intResultBuilder.Append(intValue); + } + + if (dataList[j] == null) + { + nullCount++; + resultNullCount++; + } + } + + ArrowBuffer[] buffers; + if (isDense) + { + buffers = new[] { typeBuilder.Build(), offsetBuilder.Build() }; + } + else + { + buffers = new[] { typeBuilder.Build() }; + } + TestTargetArrayList.Add(UnionArray.Create(new ArrayData( + type, dataList.Count, nullCount, 0, buffers, + new[] { stringBuilder.Build().Data, intBuilder.Build().Data }))); + } + + ArrowBuffer[] resultBuffers; + if (isDense) + { + resultBuffers = new[] { typeResultBuilder.Build(), offsetResultBuilder.Build() }; + } + else + { + resultBuffers = new[] { typeResultBuilder.Build() }; + } + ExpectedArray = UnionArray.Create(new ArrayData( + type, _baseDataTotalElementCount, resultNullCount, 0, resultBuffers, + new[] { stringResultBuilder.Build().Data, intResultBuilder.Build().Data })); + } public void Visit(IArrowType type) { diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index e588eab51e1fc..8b41763a70ac8 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -91,6 +91,7 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -151,6 +152,24 @@ public void Visit(StructArray array) } } + public void Visit(UnionArray array) + { + Assert.IsAssignableFrom(_expectedArray); + UnionArray expectedArray = (UnionArray)_expectedArray; + + Assert.Equal(expectedArray.Mode, array.Mode); + Assert.Equal(expectedArray.Length, array.Length); + Assert.Equal(expectedArray.NullCount, array.NullCount); + Assert.Equal(expectedArray.Offset, array.Offset); + Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); + Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); + + for (int i = 0; i < array.Fields.Count; i++) + { + array.Fields[i].Accept(new ArrayComparer(expectedArray.Fields[i], _strictCompare)); + } + } + public void Visit(DictionaryArray array) { Assert.IsAssignableFrom(_expectedArray); diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 29b1b9e7db74a..b6b65a582d953 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -99,8 +99,8 @@ private static Schema GetTestSchema() .Field(f => f.Name("time64_us").DataType(new Time64Type(TimeUnit.Microsecond)).Nullable(false)) .Field(f => f.Name("time64_ns").DataType(new Time64Type(TimeUnit.Nanosecond)).Nullable(false)) - .Field(f => f.Name("timestamp_ns").DataType(new TimestampType(TimeUnit.Nanosecond, "")).Nullable(false)) - .Field(f => f.Name("timestamp_us").DataType(new TimestampType(TimeUnit.Microsecond, 
"")).Nullable(false)) + .Field(f => f.Name("timestamp_ns").DataType(new TimestampType(TimeUnit.Nanosecond, (string) null)).Nullable(false)) + .Field(f => f.Name("timestamp_us").DataType(new TimestampType(TimeUnit.Microsecond, (string) null)).Nullable(false)) .Field(f => f.Name("timestamp_us_paris").DataType(new TimestampType(TimeUnit.Microsecond, "Europe/Paris")).Nullable(true)) .Field(f => f.Name("list_string").DataType(new ListType(StringType.Default)).Nullable(false)) @@ -112,6 +112,9 @@ private static Schema GetTestSchema() .Field(f => f.Name("dict_string_ordered").DataType(new DictionaryType(Int32Type.Default, StringType.Default, true)).Nullable(false)) .Field(f => f.Name("list_dict_string").DataType(new ListType(new DictionaryType(Int32Type.Default, StringType.Default, false))).Nullable(false)) + .Field(f => f.Name("dense_union").DataType(new UnionType(new[] { new Field("i64", Int64Type.Default, false), new Field("f32", FloatType.Default, true), }, new[] { 0, 1 }, UnionMode.Dense))) + .Field(f => f.Name("sparse_union").DataType(new UnionType(new[] { new Field("i32", Int32Type.Default, true), new Field("f64", DoubleType.Default, false), }, new[] { 0, 1 }, UnionMode.Sparse))) + // Checking wider characters. .Field(f => f.Name("hello 你好 😄").DataType(BooleanType.Default).Nullable(true)) @@ -172,6 +175,9 @@ private static IEnumerable GetPythonFields() yield return pa.field("dict_string_ordered", pa.dictionary(pa.int32(), pa.utf8(), true), false); yield return pa.field("list_dict_string", pa.list_(pa.dictionary(pa.int32(), pa.utf8(), false)), false); + yield return pa.field("dense_union", pa.dense_union(List(pa.field("i64", pa.int64(), false), pa.field("f32", pa.float32(), true)))); + yield return pa.field("sparse_union", pa.sparse_union(List(pa.field("i32", pa.int32(), true), pa.field("f64", pa.float64(), false)))); + yield return pa.field("hello 你好 😄", pa.bool_(), true); } } @@ -485,22 +491,29 @@ public unsafe void ImportRecordBatch() pa.array(List(0.0, 1.4, 2.5, 3.6, 4.7)), pa.array(new PyObject[] { List(1, 2), List(3, 4), PyObject.None, PyObject.None, List(5, 4, 3) }), pa.StructArray.from_arrays( - new PyList(new PyObject[] - { + List( List(10, 9, null, null, null), List("banana", "apple", "orange", "cherry", "grape"), - List(null, 4.3, -9, 123.456, 0), - }), + List(null, 4.3, -9, 123.456, 0) + ), new[] { "fld1", "fld2", "fld3" }), pa.DictionaryArray.from_arrays( pa.array(List(1, 0, 1, 1, null)), - pa.array(List("foo", "bar")) - ), + pa.array(List("foo", "bar"))), pa.FixedSizeListArray.from_arrays( pa.array(List(1, 2, 3, 4, null, 6, 7, null, null, null)), 2), + pa.UnionArray.from_dense( + pa.array(List(0, 1, 1, 0, 0), type: "int8"), + pa.array(List(0, 0, 1, 1, 2), type: "int32"), + List( + pa.array(List(1, 4, null)), + pa.array(List("two", "three")) + ), + /* field name */ List("i32", "s"), + /* type codes */ List(3, 2)), }), - new[] { "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8" }); + new[] { "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9" }); dynamic batch = table.to_batches()[0]; @@ -568,6 +581,10 @@ public unsafe void ImportRecordBatch() Assert.Equal(new long[] { 1, 2, 3, 4, 0, 6, 7, 0, 0, 0 }, col8a.Values.ToArray()); Assert.True(col8a.IsValid(3)); Assert.False(col8a.IsValid(9)); + + UnionArray col9 = (UnionArray)recordBatch.Column("col9"); + Assert.Equal(5, col9.Length); + Assert.True(col9 is DenseUnionArray); } [SkippableFact] @@ -789,6 +806,11 @@ private static PyObject List(params string[] values) return new PyList(values.Select(i => i 
== null ? PyObject.None : new PyString(i)).ToArray()); } + private static PyObject List(params PyObject[] values) + { + return new PyList(values); + } + sealed class TestArrayStream : IArrowArrayStream { private readonly RecordBatch[] _batches; diff --git a/csharp/test/Apache.Arrow.Tests/ColumnTests.cs b/csharp/test/Apache.Arrow.Tests/ColumnTests.cs index b90c681622d5f..2d867b79176aa 100644 --- a/csharp/test/Apache.Arrow.Tests/ColumnTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ColumnTests.cs @@ -39,7 +39,7 @@ public void TestColumn() Array intArrayCopy = MakeIntArray(10); Field field = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build(); - Column column = new Column(field, new[] { intArray, intArrayCopy }); + Column column = new Column(field, new IArrowArray[] { intArray, intArrayCopy }); Assert.True(column.Name == field.Name); Assert.True(column.Field == field); diff --git a/csharp/test/Apache.Arrow.Tests/TableTests.cs b/csharp/test/Apache.Arrow.Tests/TableTests.cs index b4c4b1faed190..8b07a38c1b8c0 100644 --- a/csharp/test/Apache.Arrow.Tests/TableTests.cs +++ b/csharp/test/Apache.Arrow.Tests/TableTests.cs @@ -30,7 +30,7 @@ public static Table MakeTableWithOneColumnOfTwoIntArrays(int lengthOfEachArray) Field field = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build(); Schema s0 = new Schema.Builder().Field(field).Build(); - Column column = new Column(field, new List { intArray, intArrayCopy }); + Column column = new Column(field, new List { intArray, intArrayCopy }); Table table = new Table(s0, new List { column }); return table; } @@ -60,7 +60,7 @@ public void TestTableFromRecordBatches() Table table1 = Table.TableFromRecordBatches(recordBatch1.Schema, recordBatches); Assert.Equal(20, table1.RowCount); - Assert.Equal(24, table1.ColumnCount); + Assert.Equal(26, table1.ColumnCount); FixedSizeBinaryType type = new FixedSizeBinaryType(17); Field newField1 = new Field(type.Name, type, false); @@ -86,13 +86,13 @@ public void TestTableAddRemoveAndSetColumn() Array nonEqualLengthIntArray = ColumnTests.MakeIntArray(10); Field field1 = new Field.Builder().Name("f1").DataType(Int32Type.Default).Build(); - Column nonEqualLengthColumn = new Column(field1, new[] { nonEqualLengthIntArray}); + Column nonEqualLengthColumn = new Column(field1, new IArrowArray[] { nonEqualLengthIntArray }); Assert.Throws(() => table.InsertColumn(-1, nonEqualLengthColumn)); Assert.Throws(() => table.InsertColumn(1, nonEqualLengthColumn)); Array equalLengthIntArray = ColumnTests.MakeIntArray(20); Field field2 = new Field.Builder().Name("f2").DataType(Int32Type.Default).Build(); - Column equalLengthColumn = new Column(field2, new[] { equalLengthIntArray}); + Column equalLengthColumn = new Column(field2, new IArrowArray[] { equalLengthIntArray }); Column existingColumn = table.Column(0); Table newTable = table.InsertColumn(0, equalLengthColumn); @@ -118,7 +118,7 @@ public void TestBuildFromRecordBatch() RecordBatch batch = TestData.CreateSampleRecordBatch(schema, 10); Table table = Table.TableFromRecordBatches(schema, new[] { batch }); - Assert.NotNull(table.Column(0).Data.Array(0) as Int64Array); + Assert.NotNull(table.Column(0).Data.ArrowArray(0) as Int64Array); } } diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index 41507311f6a04..9e2061e3428a9 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -60,6 +60,8 @@ public static RecordBatch CreateSampleRecordBatch(int length, int 
columnSetCount builder.Field(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); builder.Field(CreateField(new FixedSizeBinaryType(16), i)); builder.Field(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); + builder.Field(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); + builder.Field(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); } //builder.Field(CreateField(HalfFloatType.Default)); @@ -125,6 +127,7 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -315,6 +318,67 @@ public void Visit(StructType type) Array = new StructArray(type, Length, childArrays, nullBitmap.Build()); } + public void Visit(UnionType type) + { + int[] lengths = new int[type.Fields.Count]; + if (type.Mode == UnionMode.Sparse) + { + for (int i = 0; i < lengths.Length; i++) + { + lengths[i] = Length; + } + } + else + { + int totalLength = Length; + int oneLength = Length / lengths.Length; + for (int i = 1; i < lengths.Length; i++) + { + lengths[i] = oneLength; + totalLength -= oneLength; + } + lengths[0] = totalLength; + } + + ArrayData[] childArrays = new ArrayData[type.Fields.Count]; + for (int i = 0; i < childArrays.Length; i++) + { + childArrays[i] = CreateArray(type.Fields[i], lengths[i]).Data; + } + + ArrowBuffer.Builder typeIdBuilder = new ArrowBuffer.Builder(Length); + byte index = 0; + for (int i = 0; i < Length; i++) + { + typeIdBuilder.Append(index); + index++; + if (index == lengths.Length) + { + index = 0; + } + } + + ArrowBuffer[] buffers; + if (type.Mode == UnionMode.Sparse) + { + buffers = new ArrowBuffer[1]; + } + else + { + ArrowBuffer.Builder offsetBuilder = new ArrowBuffer.Builder(Length); + for (int i = 0; i < Length; i++) + { + offsetBuilder.Append(i / lengths.Length); + } + + buffers = new ArrowBuffer[2]; + buffers[1] = offsetBuilder.Build(); + } + buffers[0] = typeIdBuilder.Build(); + + Array = UnionArray.Create(new ArrayData(type, Length, 0, 0, buffers, childArrays)); + } + public void Visit(DictionaryType type) { Int32Array.Builder indicesBuilder = new Int32Array.Builder().Reserve(Length); diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py index 1cf19841c6939..ea10e75ad3478 100644 --- a/dev/archery/archery/crossbow/reports.py +++ b/dev/archery/archery/crossbow/reports.py @@ -284,7 +284,7 @@ class CommentReport(Report): 'github': _markdown_badge.format( title='Github Actions', badge=( - 'https://github.com/{repo}/workflows/Crossbow/' + 'https://github.com/{repo}/actions/workflows/crossbow.yml/' 'badge.svg?branch={branch}' ), ), diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 5ac32da56a8de..8d0cc6b0b01a8 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1805,8 +1805,7 @@ def _temp_path(): generate_datetime_case(), generate_duration_case() - .skip_tester('C#') - .skip_tester('JS'), # TODO(ARROW-5239): Intervals + JS + .skip_tester('C#'), generate_interval_case() .skip_tester('C#') @@ -1833,8 +1832,7 @@ def _temp_path(): .skip_tester('C#') .skip_tester('JS'), - generate_unions_case() - .skip_tester('C#'), + generate_unions_case(), generate_custom_metadata_case() .skip_tester('C#'), diff 
--git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 2fd1d2d7f0c44..eb2e26951cd88 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -70,6 +70,7 @@ def __init__(self, json_files, self.serial = serial self.gold_dirs = gold_dirs self.failures: List[Outcome] = [] + self.skips: List[Outcome] = [] self.match = match if self.match is not None: @@ -123,8 +124,8 @@ def run_c_data(self): enabled implementations. """ for producer, consumer in itertools.product( - filter(lambda t: t.C_DATA_EXPORTER, self.testers), - filter(lambda t: t.C_DATA_IMPORTER, self.testers)): + filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.testers), + filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.testers)): self._compare_c_data_implementations(producer, consumer) log('\n') @@ -207,6 +208,8 @@ def case_wrapper(test_case): self.failures.append(outcome.failure) if self.stop_on_error: break + elif outcome.skipped: + self.skips.append(outcome) else: with ThreadPoolExecutor() as executor: @@ -215,6 +218,8 @@ def case_wrapper(test_case): self.failures.append(outcome.failure) if self.stop_on_error: break + elif outcome.skipped: + self.skips.append(outcome) def _compare_ipc_implementations( self, @@ -423,9 +428,10 @@ def _compare_c_data_implementations( exporter, importer) self._run_test_cases(case_runner, self.json_files, serial=serial) - case_runner = partial(self._run_c_array_test_cases, producer, consumer, - exporter, importer) - self._run_test_cases(case_runner, self.json_files, serial=serial) + if producer.C_DATA_ARRAY_EXPORTER and consumer.C_DATA_ARRAY_IMPORTER: + case_runner = partial(self._run_c_array_test_cases, producer, consumer, + exporter, importer) + self._run_test_cases(case_runner, self.json_files, serial=serial) def _run_c_schema_test_case(self, producer: Tester, consumer: Tester, @@ -605,6 +611,11 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, description="Ensure PollFlightInfo is supported.", skip_testers={"JS", "C#", "Rust"} ), + Scenario( + "app_metadata_flight_info_endpoint", + description="Ensure support FlightInfo and Endpoint app_metadata", + skip_testers={"JS", "C#", "Rust", "Java"} + ), Scenario( "flight_sql", description="Ensure Flight SQL protocol is working as expected.", @@ -638,7 +649,7 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, log(f'{exc_type}: {exc_value}') log() - log(fail_count, "failures") + log(f"{fail_count} failures, {len(runner.skips)} skips") if fail_count > 0: sys.exit(1) diff --git a/dev/archery/archery/integration/tester.py b/dev/archery/archery/integration/tester.py index 6a3061992d006..6cde20e61b321 100644 --- a/dev/archery/archery/integration/tester.py +++ b/dev/archery/archery/integration/tester.py @@ -204,9 +204,11 @@ class Tester: # whether the language supports receiving Flight FLIGHT_CLIENT = False # whether the language supports the C Data Interface as an exporter - C_DATA_EXPORTER = False + C_DATA_SCHEMA_EXPORTER = False + C_DATA_ARRAY_EXPORTER = False # whether the language supports the C Data Interface as an importer - C_DATA_IMPORTER = False + C_DATA_SCHEMA_IMPORTER = False + C_DATA_ARRAY_IMPORTER = False # the name used for skipping and shown in the logs name = "unknown" diff --git a/dev/archery/archery/integration/tester_cpp.py b/dev/archery/archery/integration/tester_cpp.py index 9ddc3c480002a..ab642c31aacc6 100644 --- a/dev/archery/archery/integration/tester_cpp.py +++ b/dev/archery/archery/integration/tester_cpp.py @@ -52,8 
+52,10 @@ class CppTester(Tester): CONSUMER = True FLIGHT_SERVER = True FLIGHT_CLIENT = True - C_DATA_EXPORTER = True - C_DATA_IMPORTER = True + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True name = 'C++' diff --git a/dev/archery/archery/integration/tester_csharp.py b/dev/archery/archery/integration/tester_csharp.py index 018731d573cac..83b07495f9907 100644 --- a/dev/archery/archery/integration/tester_csharp.py +++ b/dev/archery/archery/integration/tester_csharp.py @@ -15,23 +15,143 @@ # specific language governing permissions and limitations # under the License. +from contextlib import contextmanager +import gc import os -from .tester import Tester +from . import cdata +from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT -_EXE_PATH = os.path.join( - ARROW_ROOT_DEFAULT, - "csharp/artifacts/Apache.Arrow.IntegrationTest", - "Debug/net7.0/Apache.Arrow.IntegrationTest", -) +_ARTIFACTS_PATH = os.path.join(ARROW_ROOT_DEFAULT, "csharp/artifacts") + +_EXE_PATH = os.path.join(_ARTIFACTS_PATH, + "Apache.Arrow.IntegrationTest", + "Debug/net7.0/Apache.Arrow.IntegrationTest", + ) + +_clr_loaded = False + + +def _load_clr(): + global _clr_loaded + if not _clr_loaded: + _clr_loaded = True + import pythonnet + pythonnet.load("coreclr") + import clr + clr.AddReference( + f"{_ARTIFACTS_PATH}/Apache.Arrow.IntegrationTest/" + f"Debug/net7.0/Apache.Arrow.IntegrationTest.dll") + clr.AddReference( + f"{_ARTIFACTS_PATH}/Apache.Arrow.Tests/" + f"Debug/net7.0/Apache.Arrow.Tests.dll") + + from Apache.Arrow.IntegrationTest import CDataInterface + CDataInterface.Initialize() + + +@contextmanager +def _disposing(disposable): + """ + Ensure the IDisposable object is disposed of when the enclosed block exits. 
+ """ + try: + yield disposable + finally: + disposable.Dispose() + + +class _CDataBase: + + def __init__(self, debug, args): + self.debug = debug + self.args = args + self.ffi = cdata.ffi() + _load_clr() + + def _pointer_to_int(self, c_ptr): + return int(self.ffi.cast('uintptr_t', c_ptr)) + + def _read_batch_from_json(self, json_path, num_batch): + from Apache.Arrow.IntegrationTest import CDataInterface + + jf = CDataInterface.ParseJsonFile(json_path) + schema = jf.Schema.ToArrow() + return schema, jf.Batches[num_batch].ToArrow(schema) + + +class CSharpCDataExporter(CDataExporter, _CDataBase): + + def export_schema_from_json(self, json_path, c_schema_ptr): + from Apache.Arrow.IntegrationTest import CDataInterface + + jf = CDataInterface.ParseJsonFile(json_path) + CDataInterface.ExportSchema(jf.Schema.ToArrow(), + self._pointer_to_int(c_schema_ptr)) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + from Apache.Arrow.IntegrationTest import CDataInterface + + _, batch = self._read_batch_from_json(json_path, num_batch) + with _disposing(batch): + CDataInterface.ExportRecordBatch(batch, + self._pointer_to_int(c_array_ptr)) + + @property + def supports_releasing_memory(self): + # XXX the C# GC doesn't give reliable allocation measurements + return False + + +class CSharpCDataImporter(CDataImporter, _CDataBase): + + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + from Apache.Arrow.IntegrationTest import CDataInterface + from Apache.Arrow.Tests import SchemaComparer + + jf = CDataInterface.ParseJsonFile(json_path) + imported_schema = CDataInterface.ImportSchema( + self._pointer_to_int(c_schema_ptr)) + SchemaComparer.Compare(jf.Schema.ToArrow(), imported_schema) + + def import_batch_and_compare_to_json(self, json_path, num_batch, + c_array_ptr): + from Apache.Arrow.IntegrationTest import CDataInterface + from Apache.Arrow.Tests import ArrowReaderVerifier + + schema, batch = self._read_batch_from_json(json_path, num_batch) + with _disposing(batch): + imported_batch = CDataInterface.ImportRecordBatch( + self._pointer_to_int(c_array_ptr), schema) + with _disposing(imported_batch): + ArrowReaderVerifier.CompareBatches(batch, imported_batch, + strictCompare=False) + + @property + def supports_releasing_memory(self): + return True + + def gc_until(self, predicate): + from Apache.Arrow.IntegrationTest import CDataInterface + for i in range(3): + if predicate(): + return True + # Collect any C# objects hanging around through Python + gc.collect() + CDataInterface.RunGC() + return predicate() class CSharpTester(Tester): PRODUCER = True CONSUMER = True + C_DATA_SCHEMA_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_ARRAY_IMPORTER = True name = 'C#' @@ -68,3 +188,9 @@ def file_to_stream(self, file_path, stream_path): cmd.extend(['--mode', 'file-to-stream']) cmd.extend(['-a', file_path, '>', stream_path]) self.run_shell_command(cmd) + + def make_c_data_exporter(self): + return CSharpCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return CSharpCDataImporter(self.debug, self.args) diff --git a/dev/archery/archery/integration/tester_go.py b/dev/archery/archery/integration/tester_go.py index fea33cd0ac6c1..75333db8d66d5 100644 --- a/dev/archery/archery/integration/tester_go.py +++ b/dev/archery/archery/integration/tester_go.py @@ -16,11 +16,14 @@ # under the License. import contextlib +import functools import os import subprocess -from .tester import Tester +from . 
import cdata +from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log +from ..utils.source import ARROW_ROOT_DEFAULT # FIXME(sbinet): revisit for Go modules @@ -39,12 +42,23 @@ "localhost", ] +_dll_suffix = ".dll" if os.name == "nt" else ".so" + +_DLL_PATH = os.path.join( + ARROW_ROOT_DEFAULT, + "go/arrow/internal/cdata_integration") +_INTEGRATION_DLL = os.path.join(_DLL_PATH, "arrow_go_integration" + _dll_suffix) + class GoTester(Tester): PRODUCER = True CONSUMER = True FLIGHT_SERVER = True FLIGHT_CLIENT = True + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True name = 'Go' @@ -119,3 +133,123 @@ def flight_request(self, port, json_path=None, scenario_name=None): if self.debug: log(' '.join(cmd)) run_cmd(cmd) + + def make_c_data_exporter(self): + return GoCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return GoCDataImporter(self.debug, self.args) + + +_go_c_data_entrypoints = """ + const char* ArrowGo_ExportSchemaFromJson(const char* json_path, + uintptr_t out); + const char* ArrowGo_ImportSchemaAndCompareToJson( + const char* json_path, uintptr_t c_schema); + + const char* ArrowGo_ExportBatchFromJson(const char* json_path, + int num_batch, + uintptr_t out); + const char* ArrowGo_ImportBatchAndCompareToJson( + const char* json_path, int num_batch, uintptr_t c_array); + + int64_t ArrowGo_BytesAllocated(); + void ArrowGo_RunGC(); + void ArrowGo_FreeError(const char*); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_go_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + + def __init__(self, debug, args): + self.debug = debug + self.args = args + self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _pointer_to_int(self, c_ptr): + return self.ffi.cast('uintptr_t', c_ptr) + + def _check_go_error(self, go_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. + The string is dynamically allocated on the Go side. + """ + assert self.ffi.typeof(go_error) is self.ffi.typeof("const char*") + if go_error != self.ffi.NULL: + try: + error = self.ffi.string(go_error).decode('utf8', + errors='replace') + raise RuntimeError( + f"Go C Data Integration call failed: {error}") + finally: + self.dll.ArrowGo_FreeError(go_error) + + def _run_gc(self): + self.dll.ArrowGo_RunGC() + + +class GoCDataExporter(CDataExporter, _CDataBase): + # Note: the Arrow Go C Data export functions expect their output + # ArrowStream or ArrowArray argument to be zero-initialized. + # This is currently ensured through the use of `ffi.new`. 
+ + def export_schema_from_json(self, json_path, c_schema_ptr): + go_error = self.dll.ArrowGo_ExportSchemaFromJson( + str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + self._check_go_error(go_error) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + go_error = self.dll.ArrowGo_ExportBatchFromJson( + str(json_path).encode(), num_batch, + self._pointer_to_int(c_array_ptr)) + self._check_go_error(go_error) + + @property + def supports_releasing_memory(self): + return True + + def record_allocation_state(self): + self._run_gc() + return self.dll.ArrowGo_BytesAllocated() + + def compare_allocation_state(self, recorded, gc_until): + def pred(): + return self.record_allocation_state() == recorded + + return gc_until(pred) + + +class GoCDataImporter(CDataImporter, _CDataBase): + + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + go_error = self.dll.ArrowGo_ImportSchemaAndCompareToJson( + str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + self._check_go_error(go_error) + + def import_batch_and_compare_to_json(self, json_path, num_batch, + c_array_ptr): + go_error = self.dll.ArrowGo_ImportBatchAndCompareToJson( + str(json_path).encode(), num_batch, + self._pointer_to_int(c_array_ptr)) + self._check_go_error(go_error) + + @property + def supports_releasing_memory(self): + return True + + def gc_until(self, predicate): + for i in range(10): + if predicate(): + return True + self._run_gc() + return False diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 18c93a5b8b71b..3efe5994055db 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -149,6 +149,7 @@ def cmake_linter(src, fix=False): include_patterns=[ 'ci/**/*.cmake', 'cpp/CMakeLists.txt', + 'cpp/src/**/*.cmake', 'cpp/src/**/*.cmake.in', 'cpp/src/**/CMakeLists.txt', 'cpp/examples/**/CMakeLists.txt', diff --git a/dev/release/01-prepare-test.rb b/dev/release/01-prepare-test.rb index 1062e8b06c090..54437e9da60ce 100644 --- a/dev/release/01-prepare-test.rb +++ b/dev/release/01-prepare-test.rb @@ -170,7 +170,8 @@ def test_version_pre_tag "+ \"name\": \"#{@release_compatible_version} (stable)\",", "+ {", "+ \"name\": \"#{@previous_compatible_version}\",", - "+ \"version\": \"#{@previous_compatible_version}/\"", + "+ \"version\": \"#{@previous_compatible_version}/\",", + "+ \"url\": \"https://arrow.apache.org/docs/#{@previous_compatible_version}/\"", "+ },", ], ], diff --git a/dev/release/post-03-website.sh b/dev/release/post-03-website.sh index faf79625952cc..cb605aee83523 100755 --- a/dev/release/post-03-website.sh +++ b/dev/release/post-03-website.sh @@ -58,6 +58,7 @@ fi export TZ=UTC release_date=$(LC_TIME=C date "+%-d %B %Y") +release_date_iso8601=$(LC_TIME=C date "+%Y-%m-%d") previous_tag_date=$(git log -n 1 --pretty=%aI apache-arrow-${previous_version}) rough_previous_release_date=$(date --date "${previous_tag_date}" +%s) rough_release_date=$(date +%s) @@ -257,7 +258,7 @@ current: number: '${version}' pinned_number: '${pinned_version}' major_number: '${major_version}' - date: '${release_date}' + date: '${release_date_iso8601}' git-tag: '${git_tag_hash}' github-tag-link: 'https://github.com/apache/arrow/releases/tag/${git_tag}' release-notes: 'https://arrow.apache.org/release/${version}.html' diff --git a/dev/release/post-11-bump-versions-test.rb b/dev/release/post-11-bump-versions-test.rb index 0ef4646236740..8253472ccc5b9 100644 --- a/dev/release/post-11-bump-versions-test.rb +++ 
b/dev/release/post-11-bump-versions-test.rb @@ -148,7 +148,8 @@ def test_version_post_tag "+ \"name\": \"#{@release_compatible_version} (stable)\",", "+ {", "+ \"name\": \"#{@previous_compatible_version}\",", - "+ \"version\": \"#{@previous_compatible_version}/\"", + "+ \"version\": \"#{@previous_compatible_version}/\",", + "+ \"url\": \"https://arrow.apache.org/docs/#{@previous_compatible_version}/\"", "+ },", ], ], diff --git a/dev/release/utils-update-docs-versions.py b/dev/release/utils-update-docs-versions.py index 6e0137b7c84df..7ca4059214db5 100644 --- a/dev/release/utils-update-docs-versions.py +++ b/dev/release/utils-update-docs-versions.py @@ -50,11 +50,15 @@ # Create new versions new_versions = [ {"name": f"{dev_compatible_version} (dev)", - "version": "dev/"}, + "version": "dev/", + "url": "https://arrow.apache.org/docs/dev/"}, {"name": f"{stable_compatible_version} (stable)", - "version": ""}, + "version": "", + "url": "https://arrow.apache.org/docs/", + "preferred": True}, {"name": previous_compatible_version, - "version": f"{previous_compatible_version}/"}, + "version": f"{previous_compatible_version}/", + "url": f"https://arrow.apache.org/docs/{previous_compatible_version}/"}, *old_versions[2:], ] with open(main_versions_path, 'w') as json_file: diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 77b996766f78c..ae28ebe792404 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -666,7 +666,7 @@ test_python() { show_header "Build and test Python libraries" # Build and test Python - maybe_setup_virtualenv "cython<3" numpy setuptools_scm setuptools || exit 1 + maybe_setup_virtualenv "cython<3" numpy "setuptools_scm<8.0.0" setuptools || exit 1 maybe_setup_conda --file ci/conda_env_python.txt || exit 1 if [ "${USE_CONDA}" -gt 0 ]; then diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml index 1cdcec199e7ba..042e2364d1c49 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml @@ -1,7 +1,7 @@ aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -33,20 +33,18 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -67,7 +65,7 @@ snappy: target_platform: - linux-64 thrift_cpp: -- 0.18.1 +- 0.19.0 ucx: - 1.14.0 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml index 5be5b58a73932..9885e6db38cd7 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml @@ -1,7 +1,7 @@ aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -33,20 +33,18 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -67,7 +65,7 @@ snappy: 
target_platform: - linux-64 thrift_cpp: -- 0.18.1 +- 0.19.0 ucx: - 1.14.0 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml index 1677b03564c08..788b584504ec4 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml @@ -1,9 +1,9 @@ BUILD: - aarch64-conda_cos7-linux-gnu aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -37,20 +37,18 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -71,7 +69,7 @@ snappy: target_platform: - linux-aarch64 thrift_cpp: -- 0.18.1 +- 0.19.0 ucx: - 1.14.0 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml index 88fdf1254e661..a1e4b8571abaf 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml @@ -1,9 +1,9 @@ BUILD: - aarch64-conda_cos7-linux-gnu aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -37,20 +37,18 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -71,7 +69,7 @@ snappy: target_platform: - linux-aarch64 thrift_cpp: -- 0.18.1 +- 0.19.0 ucx: - 1.14.0 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml index 3585db7b99baa..e21c4cbe853f8 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml @@ -1,7 +1,7 @@ aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -33,20 +33,18 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -67,7 +65,7 @@ snappy: target_platform: - linux-ppc64le thrift_cpp: -- 0.18.1 +- 0.19.0 ucx: - 1.14.0 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml index c13a522254286..89f1049ebdd84 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml @@ -1,7 +1,7 @@ aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -33,20 +33,18 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 
1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -67,7 +65,7 @@ snappy: target_platform: - linux-ppc64le thrift_cpp: -- 0.18.1 +- 0.19.0 ucx: - 1.14.0 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml index dd4a230760ef2..2a5f8c5b36bd3 100644 --- a/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml +++ b/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml @@ -1,9 +1,9 @@ MACOSX_DEPLOYMENT_TARGET: -- '10.9' +- '10.13' aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -27,22 +27,20 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 macos_machine: - x86_64-apple-darwin13.4.0 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -63,7 +61,7 @@ snappy: target_platform: - osx-64 thrift_cpp: -- 0.18.1 +- 0.19.0 zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml b/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml index 6a6713a54fe86..211b71226cae8 100644 --- a/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml +++ b/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml @@ -1,9 +1,9 @@ MACOSX_DEPLOYMENT_TARGET: - '11.0' aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' c_compiler: @@ -27,22 +27,20 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 macos_machine: - arm64-apple-darwin20.0.0 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -63,7 +61,7 @@ snappy: target_platform: - osx-arm64 thrift_cpp: -- 0.18.1 +- 0.19.0 zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml similarity index 98% rename from dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml rename to dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml index e63767cbe9771..a4d06c9f20cdd 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml @@ -19,7 +19,7 @@ pin_run_as_build: min_pin: x.x max_pin: x.x r_base: -- '4.1' +- '4.3' target_platform: - linux-64 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml similarity index 98% rename from dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml rename to dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml index 2b80b020fdc0b..028b190bb1ef5 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml @@ -23,7 +23,7 @@ pin_run_as_build: min_pin: x.x max_pin: x.x r_base: -- '4.1' +- '4.3' target_platform: - linux-aarch64 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml similarity index 98% rename from dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml rename to dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml 
index 6be6c2f5462c5..7b8b62d8e00bb 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml @@ -19,7 +19,7 @@ pin_run_as_build: min_pin: x.x max_pin: x.x r_base: -- '4.1' +- '4.3' target_platform: - osx-64 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml similarity index 98% rename from dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml rename to dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml index 0ce856fcccf5c..a8e8aab83d598 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml @@ -19,7 +19,7 @@ pin_run_as_build: min_pin: x.x max_pin: x.x r_base: -- '4.1' +- '4.3' target_platform: - osx-arm64 zip_keys: diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml index f75d92e276d9e..32da33c072019 100644 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml @@ -1,11 +1,9 @@ aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' -c_ares: -- '1' c_compiler: - vs2019 channel_sources: @@ -27,24 +25,22 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libcrc32c: - '1.1' libcurl: - '8' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -65,7 +61,7 @@ snappy: target_platform: - win-64 thrift_cpp: -- 0.18.1 +- 0.19.0 zip_keys: - - cuda_compiler - cuda_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml index 6d8fb15b15a2a..6a33b86b9d65e 100644 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml @@ -1,11 +1,9 @@ aws_crt_cpp: -- 0.20.3 +- 0.23.1 aws_sdk_cpp: -- 1.10.57 +- 1.11.156 bzip2: - '1' -c_ares: -- '1' c_compiler: - vs2019 channel_sources: @@ -27,24 +25,22 @@ glog: google_cloud_cpp: - '2.12' libabseil: -- '20230125' +- '20230802' libcrc32c: - '1.1' libcurl: - '8' libgrpc: -- '1.54' -- '1.56' +- '1.57' libprotobuf: -- '3.21' -- 4.23.3 +- 4.23.4 lz4_c: - 1.9.3 numpy: -- '1.21' +- '1.22' - '1.23' -- '1.21' -- '1.21' +- '1.22' +- '1.22' openssl: - '3' orc: @@ -65,7 +61,7 @@ snappy: target_platform: - win-64 thrift_cpp: -- 0.18.1 +- 0.19.0 zip_keys: - - cuda_compiler - cuda_compiler_version diff --git a/dev/tasks/conda-recipes/arrow-cpp/activate.sh b/dev/tasks/conda-recipes/arrow-cpp/activate.sh index 8757612781bbe..19d037ff4127a 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/activate.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/activate.sh @@ -23,6 +23,13 @@ _la_log "Beginning libarrow activation." # where the GDB wrappers get installed _la_gdb_prefix="$CONDA_PREFIX/share/gdb/auto-load" +# If the directory is not writable, nothing can be done +if [ ! -w "$_la_gdb_prefix" ]; then + _la_log 'No rights to modify $_la_gdb_prefix, cannot create symlink!' + _la_log 'Unless you plan to use the GDB debugger with libarrow, this warning can be safely ignored.' 
+ return +fi + # this needs to be in sync with ARROW_GDB_INSTALL_DIR in build.sh _la_placeholder="replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX" # the paths here are intentionally stacked, see #935, resp. @@ -44,7 +51,7 @@ for _la_target in "$_la_orig_install_dir/"*.py; do # If the file doesn't exist, skip this iteration of the loop. # (This happens when no files are found, in which case the # loop runs with target equal to the pattern itself.) - _la_log 'Folder $_la_orig_install_dir seems to not contain .py files, skipping' + _la_log 'Folder $_la_orig_install_dir seems to not contain .py files, skipping.' continue fi _la_symlink="$_la_symlink_dir/$(basename "$_la_target")" @@ -54,13 +61,13 @@ for _la_target in "$_la_orig_install_dir/"*.py; do _la_log 'symlink $_la_symlink already exists and points to $_la_target, skipping.' continue fi - _la_log 'Creating symlink $_la_symlink pointing to $_la_target' + _la_log 'Creating symlink $_la_symlink pointing to $_la_target.' mkdir -p "$_la_symlink_dir" || true # this check also creates the symlink; if it fails, we enter the if-branch. if ! ln -sf "$_la_target" "$_la_symlink"; then - echo -n "${BASH_SOURCE[0]} ERROR: Failed to create symlink from " - echo -n "'$_la_target' to '$_la_symlink'" - echo + echo -n "${BASH_SOURCE[0]} WARNING: Failed to create symlink from " + echo "'$_la_target' to '$_la_symlink'!" + echo "Unless you plan to use the GDB debugger with libarrow, this warning can be safely ignored." continue fi done diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh index dc588f9473870..ef0b038812a01 100755 --- a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh @@ -30,7 +30,7 @@ fi # Enable CUDA support if [[ ! -z "${cuda_compiler_version+x}" && "${cuda_compiler_version}" != "None" ]] then - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CONDA_BUILD_SYSROOT}/lib" + EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDAToolkit_ROOT=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CONDA_BUILD_SYSROOT}/lib" else EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=OFF" fi @@ -43,8 +43,8 @@ if [[ "${build_platform}" != "${target_platform}" ]]; then fi EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCLANG_EXECUTABLE=${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang" EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DLLVM_LINK_EXECUTABLE=${BUILD_PREFIX}/bin/llvm-link" + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DARROW_JEMALLOC_LG_PAGE=16" sed -ie "s;protoc-gen-grpc.*$;protoc-gen-grpc=${BUILD_PREFIX}/bin/grpc_cpp_plugin\";g" ../src/arrow/flight/CMakeLists.txt - sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=16";g' ../cmake_modules/ThirdpartyToolchain.cmake fi # disable -fno-plt, which causes problems with GCC on PPC diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh index 9c12321a1c115..f39e06874ca0e 100755 --- a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh @@ -24,6 +24,10 @@ BUILD_EXT_FLAGS="" # Enable CUDA support if [[ ! 
-z "${cuda_compiler_version+x}" && "${cuda_compiler_version}" != "None" ]]; then export PYARROW_WITH_CUDA=1 + if [[ "${build_platform}" != "${target_platform}" ]]; then + export CUDAToolkit_ROOT=${CUDA_HOME} + export CMAKE_LIBRARY_PATH=${CONDA_BUILD_SYSROOT}/lib + fi else export PYARROW_WITH_CUDA=0 fi diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index fbe40af3dae01..371b62245bb72 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -61,7 +61,7 @@ outputs: build: string: h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} run_exports: - - {{ pin_subpackage("libarrow", max_pin="x.x.x") }} + - {{ pin_subpackage("libarrow", max_pin="x") }} ignore_run_exports_from: - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] # arrow only uses headers, apparently @@ -114,6 +114,8 @@ outputs: - libgrpc - libprotobuf - libutf8proc + # gandiva requires shared libllvm + - llvm # [unix] - lz4-c - nlohmann_json # gandiva depends on openssl @@ -133,8 +135,6 @@ outputs: # its host deps (which aren't yet covered above) leak into the build here - libcrc32c # [win] - libcurl # [win] - # same for libgrpc (before 1.55.0, which is coupled with libprotobuf 4.23.x) - - c-ares # [win and libprotobuf == "3.21"] run_constrained: - apache-arrow-proc =*={{ build_ext }} # make sure we don't co-install with old version of old package name @@ -198,8 +198,6 @@ outputs: requirements: host: - {{ pin_subpackage('libarrow', exact=True) }} - # avoid wrappers for different builds colliding due to identical hashes - - libprotobuf run: - {{ pin_subpackage('libarrow', exact=True) }} test: @@ -235,9 +233,7 @@ outputs: - cmake - ninja host: - # we're building for two protobuf versions, cannot pin exactly - # - {{ pin_subpackage('libarrow', exact=True) }} - - libarrow ={{ version }}=*_{{ PKG_BUILDNUM }}_{{ build_ext }} + - {{ pin_subpackage('libarrow', exact=True) }} - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - cython <3 @@ -246,8 +242,7 @@ outputs: - setuptools - setuptools_scm <8.0.0 run: - # - {{ pin_subpackage('libarrow', exact=True) }} - - libarrow ={{ version }}=*_{{ PKG_BUILDNUM }}_{{ build_ext }} + - {{ pin_subpackage('libarrow', exact=True) }} - {{ pin_compatible('numpy') }} - python run_constrained: @@ -336,28 +331,28 @@ outputs: # crossbow CI: reduce to one python version, except on (unemulated) linux, where it's fast enough {% if linux64 or py == 311 %} - # {% if not (aarch64 or ppc64le) or py in (310, 311) %} - # only run the full test suite for one python version when in emulation (each run takes ~45min); - # there's essentially zero divergence in behaviour across python versions anyway, and otherwise - # CUDA builds for aarch/ppc consistently run out of disk space on azure for some reason + # {% if not (aarch64 or ppc64le) or py == 311 %} + # only run the full test suite for one python version when in emulation + # (each run can take up to ~45min); there's essentially zero divergence + # in behaviour across python versions anyway test: requires: - # vary protobuf version in test suite (historically, test failures only have a very - # weak dependency on python version, so we don't lose coverage by doing half & half) - - libprotobuf <4 # [py % 2 == 0] # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - # temporary pin due to missing fixture - - pytest <7.4.0 + - pytest - pytest-lazy-fixture - backports.zoneinfo # [py<39] + - boto3 - cffi - cloudpickle - 
cython <3 - fastparquet - fsspec - hypothesis + # currently disabled due to GH-37692 + # - minio-server - pandas + - s3fs >=2023 - scipy # these are generally (far) behind on migrating abseil/grpc/protobuf, # and using them as test dependencies blocks the migrator unnecessarily @@ -372,8 +367,8 @@ outputs: source_files: - testing/data commands: - - cd ${SP_DIR}/pyarrow/tests # [unix] - - cd %SP_DIR%\pyarrow\tests # [win] + - cd ${SP_DIR} # [unix] + - cd %SP_DIR% # [win] - export ARROW_TEST_DATA="${SRC_DIR}/testing/data" # [unix] - set "ARROW_TEST_DATA=%SRC_DIR%\testing\data" # [win] @@ -382,34 +377,26 @@ outputs: {% set tests_to_skip = tests_to_skip + " or test_cuda" %} # skip tests that raise SIGINT and crash the test suite {% set tests_to_skip = tests_to_skip + " or (test_csv and test_cancellation)" %} # [linux] - {% set tests_to_skip = tests_to_skip + " or (test_flight and test_interrupt)" %} # [linux] - # tests that may crash the agent due to out-of-bound memory writes or other risky stuff - {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool" %} # [aarch64 or ppc64le] - # cannot pass -D_LIBCPP_DISABLE_AVAILABILITY to test suite for our older macos sdk - {% set tests_to_skip = tests_to_skip + " or test_cpp_extension_in_python" %} # [osx] + # skip test that intentionally writes out of bounds and then expects no error message + {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool_disabled[system_memory_pool]" %} # [osx] # skip tests that make invalid(-for-conda) assumptions about the compilers setup {% set tests_to_skip = tests_to_skip + " or test_cython_api" %} # [unix] {% set tests_to_skip = tests_to_skip + " or test_visit_strings" %} # [unix] # skip tests that cannot succeed in emulation {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool_disabled" %} # [aarch64 or ppc64le] {% set tests_to_skip = tests_to_skip + " or test_env_var_io_thread_count" %} # [aarch64 or ppc64le] + # XMinioInvalidObjectName on osx/win: "Object name contains unsupported characters" + {% set tests_to_skip = tests_to_skip + " or test_write_to_dataset_with_partitions_s3fs" %} # [osx or win] # vvvvvvv TESTS THAT SHOULDN'T HAVE TO BE SKIPPED vvvvvvv - {% set tests_to_skip = tests_to_skip + " or test_extension_to_pandas_storage_type" %} - # segfaults on OSX: to investigate ASAP - {% set tests_to_skip = tests_to_skip + " or test_flight" %} # [osx] + # currently broken + {% set tests_to_skip = tests_to_skip + " or test_fastparquet_cross_compatibility" %} # gandiva tests are segfaulting on ppc - {% set tests_to_skip = tests_to_skip + " or test_gandiva" %} # [ppc64le] - # test failures on ppc + {% set tests_to_skip = tests_to_skip + " or test_gandiva" %} # [ppc64le] + # test failures on ppc (both failing with: Float value was truncated converting to int32) {% set tests_to_skip = tests_to_skip + " or test_safe_cast_from_float_with_nans_to_int" %} # [ppc64le] - # gandiva tests are segfaulting on ppc - {% set tests_to_skip = tests_to_skip + " or test_float_with_null_as_integer" %} # [ppc64le] - # test is broken; header is in $PREFIX, not $SP_DIR - {% set tests_to_skip = tests_to_skip + " or (test_misc and test_get_include)" %} # [unix] - # flaky tests that fail occasionally - {% set tests_to_skip = tests_to_skip + " or test_total_bytes_allocated " %} # [linux] - {% set tests_to_skip = tests_to_skip + " or test_feather_format " %} # [linux] + {% set tests_to_skip = tests_to_skip + " or test_float_with_null_as_integer" %} # [ppc64le] # ^^^^^^^ TESTS THAT SHOULDN'T HAVE TO BE 
SKIPPED ^^^^^^^ - - pytest -rfEs -k "not ({{ tests_to_skip }})" + - pytest pyarrow/ -rfEs -k "not ({{ tests_to_skip }})" {% endif %} about: diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 29e038a922412..859ff8ddb5b44 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -246,15 +246,15 @@ tasks: # generated and to be synced regularly from the feedstock. We have no way # yet to generate them inside the arrow repository automatically. - conda-linux-x64-cpu-r41: + conda-linux-x64-cpu-r43: ci: azure template: conda-recipes/azure.linux.yml params: config: linux_64_cuda_compiler_versionNone - r_config: linux_64_r_base4.1 + r_config: linux_64_r_base4.3 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda conda-linux-x64-cpu-r42: ci: azure @@ -292,15 +292,15 @@ tasks: ########################### Conda Linux (aarch64) ########################### - conda-linux-aarch64-cpu-r41: + conda-linux-aarch64-cpu-r43: ci: azure template: conda-recipes/azure.linux.yml params: config: linux_aarch64_cuda_compiler_versionNone - r_config: linux_aarch64_r_base4.1 + r_config: linux_aarch64_r_base4.3 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda conda-linux-aarch64-cpu-r42: ci: azure @@ -364,15 +364,15 @@ tasks: ############################## Conda OSX (x64) ############################## - conda-osx-x64-cpu-r41: + conda-osx-x64-cpu-r43: ci: azure template: conda-recipes/azure.osx.yml params: config: osx_64_ - r_config: osx_64_r_base4.1 + r_config: osx_64_r_base4.3 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda conda-osx-x64-cpu-r42: ci: azure @@ -398,15 +398,15 @@ tasks: ############################# Conda OSX (arm64) ############################# - conda-osx-arm64-cpu-r41: + conda-osx-arm64-cpu-r43: ci: azure template: conda-recipes/azure.osx.yml params: config: osx_arm64_ - r_config: osx_arm64_r_base4.1 + r_config: osx_arm64_r_base4.3 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda conda-osx-arm64-cpu-r42: ci: azure diff --git a/docker-compose.yml b/docker-compose.yml index 8ae06900c57f9..62e5aee0a841c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1732,6 +1732,7 @@ services: <<: [*common, *ccache] # tell archery where the arrow binaries are located ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_GO_INTEGRATION: 1 ARCHERY_INTEGRATION_WITH_RUST: 0 command: ["/arrow/ci/scripts/rust_build.sh /arrow /build && diff --git a/docs/requirements.txt b/docs/requirements.txt index a4e5f7197b553..37a50d51dd54c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,10 +5,9 @@ breathe ipython numpydoc -pydata-sphinx-theme==0.8 +pydata-sphinx-theme sphinx-autobuild sphinx-design sphinx-copybutton -sphinxcontrib-jquery sphinx==6.2 pandas diff --git a/docs/source/_static/arrow-dark.png b/docs/source/_static/arrow-dark.png new file mode 100644 index 0000000000000..618204a2370a5 Binary files /dev/null and b/docs/source/_static/arrow-dark.png differ diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index e64d40f1116e5..eeba0ef4cce6e 100644 --- 
a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -21,43 +21,15 @@ /* Customizing with theme CSS variables */ :root { - --pst-color-active-navigation: 215, 70, 51; - --pst-color-link-hover: 215, 70, 51; - --pst-color-headerlink: 215, 70, 51; - /* Use normal text color (like h3, ..) instead of primary color */ - --pst-color-h1: var(--color-text-base); - --pst-color-h2: var(--color-text-base); - /* Use softer blue from bootstrap's default info color */ - --pst-color-info: 23, 162, 184; - --pst-header-height: 0px; -} - -code { - color: rgb(215, 70, 51); -} - -.footer { - text-align: center; -} - -/* Ensure the logo is properly displayed */ - -.navbar-brand { - height: auto; - width: auto; -} - -a.navbar-brand img { - height: auto; - width: auto; - max-height: 15vh; - max-width: 100%; + /* Change header height to make the logo a bit larger */ + --pst-header-height: 6rem; + /* Make headings more bold */ + --pst-font-weight-heading: 600; } /* Contibuting landing page overview cards */ .contrib-card { - background: #fff; border-radius: 0; padding: 30px 10px 20px 10px; margin: 10px 0px; @@ -70,12 +42,12 @@ a.navbar-brand img { .contrib-card .sd-card-img-top { margin: 2px; height: 75px; + background: none !important; } .contrib-card .sd-card-title { - /* color: rgb(var(--pst-color-h1)) !important; */ + color: var(--pst-color-primary); font-size: var(--pst-font-size-h3); - /* font-weight: bold; */ padding: 1rem 0rem 0.5rem 0rem; } @@ -112,48 +84,3 @@ dl.cpp.enumerator { p.breathe-sectiondef-title { margin-top: 1rem; } - -/* Limit the max height of the sidebar navigation section. Because in our -custimized template, there is more content above the navigation, i.e. -larger logo: if we don't decrease the max-height, it will overlap with -the footer.
-Details: min(15vh, 110px) for the logo size, 8rem for search box etc*/ -@media (min-width:720px) { - @supports (position:-webkit-sticky) or (position:sticky) { - .bd-links { - max-height: calc(100vh - min(15vh, 110px) - 8rem) - } - } -} - -/* Styling to get the version dropdown and search box side-by-side on wide screens */ - -#version-search-wrapper { - width: inherit; - display: flex; - flex-wrap: wrap; - justify-content: left; - align-items: center; -} - -#version-button { - padding-left: 0.5rem; - padding-right: 1rem; -} - -#search-box { - flex: 1 0 12em; -} - -/* Fix table text wrapping in RTD theme, - * see https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html - */ - -@media screen { - table.docutils td { - /* !important prevents the common CSS stylesheets from overriding - this as on RTD they are loaded after this stylesheet */ - white-space: normal !important; - } -} diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index f91b0a17e7774..8d9c5878c8213 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -1,62 +1,73 @@ [ { "name": "14.0 (dev)", - "version": "dev/" + "version": "dev/", + "url": "https://arrow.apache.org/docs/dev/" }, { "name": "13.0 (stable)", - "version": "" + "version": "", + "url": "https://arrow.apache.org/docs/", + "preferred": true }, { "name": "12.0", - "version": "12.0/" - }, - { - "name": "12.0", - "version": "12.0/" + "version": "12.0/", + "url": "https://arrow.apache.org/docs/12.0/" }, { "name": "11.0", - "version": "11.0/" + "version": "11.0/", + "url": "https://arrow.apache.org/docs/11.0/" }, { "name": "10.0", - "version": "10.0/" + "version": "10.0/", + "url": "https://arrow.apache.org/docs/10.0/" }, { "name": "9.0", - "version": "9.0/" + "version": "9.0/", + "url": "https://arrow.apache.org/docs/9.0/" }, { "name": "8.0", - "version": "8.0/" + "version": "8.0/", + "url": "https://arrow.apache.org/docs/8.0/" }, { "name": "7.0", - "version": "7.0/" + "version": "7.0/", + "url": "https://arrow.apache.org/docs/7.0/" }, { "name": "6.0", - "version": "6.0/" + "version": "6.0/", + "url": "https://arrow.apache.org/docs/6.0/" }, { "name": "5.0", - "version": "5.0/" + "version": "5.0/", + "url": "https://arrow.apache.org/docs/5.0/" }, { "name": "4.0", - "version": "4.0/" + "version": "4.0/", + "url": "https://arrow.apache.org/docs/4.0/" }, { "name": "3.0", - "version": "3.0/" + "version": "3.0/", + "url": "https://arrow.apache.org/docs/3.0/" }, { "name": "2.0", - "version": "2.0/" + "version": "2.0/", + "url": "https://arrow.apache.org/docs/2.0/" }, { "name": "1.0", - "version": "1.0/" + "version": "1.0/", + "url": "https://arrow.apache.org/docs/1.0/" } ] diff --git a/docs/source/_static/versionwarning.js b/docs/source/_static/versionwarning.js index 601b93b75ddd8..e53c160ed98f7 100644 --- a/docs/source/_static/versionwarning.js +++ b/docs/source/_static/versionwarning.js @@ -17,6 +17,8 @@ (function() { // adapted 2022-11 from https://mne.tools/versionwarning.js + // Not used anymore for versions 14.0.0 and higher + // Kept for older docs versions (13.0.0 and lower) if (location.hostname == 'arrow.apache.org') { $.getJSON("https://arrow.apache.org/docs/_static/versions.json", function(data){ var latestStable = data[1].name.replace(" (stable)",""); diff --git a/docs/source/_templates/docs-sidebar.html b/docs/source/_templates/docs-sidebar.html deleted file mode 100644 index 26d42a82f1d5c..0000000000000 --- a/docs/source/_templates/docs-sidebar.html +++ /dev/null @@ -1,25 +0,0 @@
-{% include "version-switcher.html" %}
- - diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html index ca39e8e5a8fae..956e0142c5062 100644 --- a/docs/source/_templates/layout.html +++ b/docs/source/_templates/layout.html @@ -22,13 +22,3 @@ {% endblock %} - -{# Silence the navbar #} -{% block docs_navbar %} -{% endblock %} - -{# Add version warnings #} -{% block footer %} - {{ super() }} - -{% endblock %} diff --git a/docs/source/_templates/version-switcher.html b/docs/source/_templates/version-switcher.html deleted file mode 100644 index 24a8c15ac0102..0000000000000 --- a/docs/source/_templates/version-switcher.html +++ /dev/null @@ -1,60 +0,0 @@ - - - diff --git a/docs/source/c_glib/index.rst b/docs/source/c_glib/index.rst index 56db23f2a2040..b10524eb2e8a5 100644 --- a/docs/source/c_glib/index.rst +++ b/docs/source/c_glib/index.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _c-glib: + C/GLib docs =========== diff --git a/docs/source/conf.py b/docs/source/conf.py index 23b7070c4a84e..e9e8969f55254 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -115,7 +115,6 @@ 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive', 'numpydoc', - "sphinxcontrib.jquery", 'sphinx_design', 'sphinx_copybutton', 'sphinx.ext.autodoc', @@ -288,16 +287,37 @@ # further. For a list of options available for each theme, see the # documentation. # + html_theme_options = { "show_toc_level": 2, "use_edit_page_button": True, + "logo": { + "image_light": "_static/arrow.png", + "image_dark": "_static/arrow-dark.png", + }, + "header_links_before_dropdown": 2, + "header_dropdown_text": "Language implementations", + "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/apache/arrow", + "icon": "fa-brands fa-square-github", + }, + { + "name": "Twitter", + "url": "https://twitter.com/ApacheArrow", + "icon": "fa-brands fa-square-twitter", + }, + ], + "show_version_warning_banner": True, + "switcher": { + "json_url": "/docs/_static/versions.json", + "version_match": version, + }, } html_context = { - "switcher_json_url": "/docs/_static/versions.json", - "switcher_template_url": "https://arrow.apache.org/docs/{version}", - # for local testing - # "switcher_template_url": "http://0.0.0.0:8000/docs/{version}", "github_user": "apache", "github_repo": "arrow", "github_version": "main", @@ -319,7 +339,7 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. # -html_logo = "_static/arrow.png" +# html_logo = "_static/arrow.png" # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or @@ -354,10 +374,9 @@ # Custom sidebar templates, maps document names to template names. # -html_sidebars = { +# html_sidebars = { # '**': ['sidebar-logo.html', 'sidebar-search-bs.html', 'sidebar-nav-bs.html'], - '**': ['docs-sidebar.html'], -} +# } # The base URL which points to the root of the HTML documentation, # used for canonical url diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst index e06453e202979..6d4d4aaa8148c 100644 --- a/docs/source/cpp/index.rst +++ b/docs/source/cpp/index.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _cpp: + C++ Implementation ================== @@ -25,9 +27,9 @@ Welcome to the Apache Arrow C++ implementation documentation! 
:padding: 2 2 0 0 :class-container: sd-text-center - .. grid-item-card:: Basic understanding + .. grid-item-card:: Getting started :class-card: contrib-card - :shadow: md + :shadow: none Start here to gain a basic understanding of Arrow with an installation and linking guide, documentation of @@ -37,14 +39,14 @@ Welcome to the Apache Arrow C++ implementation documentation! .. button-link:: getting_started.html :click-parent: - :color: secondary + :color: primary :expand: - Getting started + To Getting started .. grid-item-card:: User Guide :class-card: contrib-card - :shadow: md + :shadow: none Explore more specific topics and underlying concepts of Arrow C++ @@ -53,19 +55,19 @@ Welcome to the Apache Arrow C++ implementation documentation! .. button-link:: user_guide.html :click-parent: - :color: secondary + :color: primary :expand: - User Guide + To the User Guide .. grid:: 2 :gutter: 4 :padding: 2 2 0 0 :class-container: sd-text-center - .. grid-item-card:: Examples of use + .. grid-item-card:: Examples :class-card: contrib-card - :shadow: md + :shadow: none Find the description and location of the examples using Arrow C++ library @@ -74,14 +76,14 @@ Welcome to the Apache Arrow C++ implementation documentation! .. button-link:: examples/index.html :click-parent: - :color: secondary + :color: primary :expand: - Examples + To the Examples - .. grid-item-card:: Reference documentation + .. grid-item-card:: API Reference :class-card: contrib-card - :shadow: md + :shadow: none Explore Arrow’s API reference documentation @@ -89,10 +91,32 @@ Welcome to the Apache Arrow C++ implementation documentation! .. button-link:: api.html :click-parent: - :color: secondary + :color: primary + :expand: + + To the API Reference + +.. grid:: 1 + :gutter: 4 + :padding: 2 2 0 0 + :class-container: sd-text-center + + .. grid-item-card:: Cookbook + :class-card: contrib-card + :shadow: none + + Collection of recipes which demonstrate how to + solve many common tasks that users might need + to perform when working with arrow data + + +++ + + .. button-link:: https://arrow.apache.org/cookbook/cpp/ + :click-parent: + :color: primary :expand: - API Reference + To the Cookbook .. toctree:: :maxdepth: 2 @@ -102,3 +126,4 @@ Welcome to the Apache Arrow C++ implementation documentation! user_guide Examples api + C++ cookbook diff --git a/docs/source/developers/continuous_integration/index.rst b/docs/source/developers/continuous_integration/index.rst index 6e8e26981c549..f988b5ab69d50 100644 --- a/docs/source/developers/continuous_integration/index.rst +++ b/docs/source/developers/continuous_integration/index.rst @@ -15,6 +15,7 @@ .. specific language governing permissions and limitations .. under the License. +.. _continuous_integration: ********************** Continuous Integration diff --git a/docs/source/developers/continuous_integration/overview.rst b/docs/source/developers/continuous_integration/overview.rst index 1d82e845a3360..3e155bf6001e9 100644 --- a/docs/source/developers/continuous_integration/overview.rst +++ b/docs/source/developers/continuous_integration/overview.rst @@ -20,7 +20,7 @@ Continuous Integration ====================== -Continuous Integration for Arrow is fairly complex as it needs to run across different combinations of package managers, compilers, versions of multiple sofware libraries, operating systems, and other potential sources of variation. In this article, we will give an overview of its main components and the relevant files and directories. 
+Continuous Integration for Arrow is fairly complex as it needs to run across different combinations of package managers, compilers, versions of multiple software libraries, operating systems, and other potential sources of variation. In this article, we will give an overview of its main components and the relevant files and directories. Some files central to Arrow CI are: diff --git a/docs/source/developers/contributing.rst b/docs/source/developers/contributing.rst deleted file mode 100644 index 6dc2a4e0147d6..0000000000000 --- a/docs/source/developers/contributing.rst +++ /dev/null @@ -1,190 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. highlight:: console - -.. _contributing: - -**************************** -Contributing to Apache Arrow -**************************** - -**Thanks for your interest in the Apache Arrow project.** - -Arrow is a large project and may seem overwhelming when you're -first getting involved. Contributing code is great, but that's -probably not the first place to start. There are lots of ways to -make valuable contributions to the project and community. - -This page provides some orientation for how to get involved. It also offers -some recommendations on how to get the best results when engaging with the -community. - -Code of Conduct -=============== - -All participation in the Apache Arrow project is governed by the ASF's -`Code of Conduct `_. - -.. grid:: 2 - :gutter: 4 - :padding: 2 2 0 0 - :class-container: sd-text-center - - .. grid-item-card:: Community - :img-top: ./images/users-solid.svg - :class-card: contrib-card - :shadow: md - - A good first step to getting involved in the Arrow project is to join - the mailing lists and participate in discussions where you can. - - +++ - - .. button-link:: https://arrow.apache.org/community/ - :click-parent: - :color: secondary - :expand: - - Apache Arrow Community - - .. grid-item-card:: Bug reports - :img-top: ./images/bug-solid.svg - :class-card: contrib-card - :shadow: md - - Alerting us to unexpected behavior and missing features, even - if you can't solve the problems yourself, help us understand - and prioritize work to improve the libraries. - - +++ - - .. button-ref:: bug-reports - :ref-type: ref - :click-parent: - :color: secondary - :expand: - - Bugs and Features - -.. dropdown:: Communicating through the mailing lists - :animate: fade-in-slide-down - :class-title: sd-fs-5 - :class-container: sd-shadow-md - - Projects in The Apache Software Foundation ("the ASF") use public, archived - mailing lists to create a public record of each project's development - activities and decision-making process. 
- - While lacking the immediacy of chat or other forms of communication, - the mailing lists give participants the opportunity to slow down and be - thoughtful in their responses, and they help developers who are spread across - many timezones to participate more equally. - - Read more on the `Apache Arrow Community `_ - page. - -.. dropdown:: Improve documentation - :animate: fade-in-slide-down - :class-title: sd-fs-5 - :class-container: sd-shadow-md - - A great way to contribute to the project is to improve documentation. If you - found some docs to be incomplete or inaccurate, share your hard-earned knowledge - with the rest of the community. - - Documentation improvements are also a great way to gain some experience with - our submission and review process, discussed below, without requiring a lot - of local development environment setup. In fact, many documentation-only changes - can be made directly in the GitHub web interface by clicking the "edit" button. - This will handle making a fork and a pull request for you. - - * :ref:`documentation` - * :ref:`building-docs` - -.. grid:: 2 - :gutter: 4 - :padding: 2 2 0 0 - :class-container: sd-text-center - - .. grid-item-card:: New Contributors - :img-top: ./images/book-open-solid.svg - :class-card: contrib-card - :shadow: md - - First time contributing? - - The New Contributor's Guide provides necessary information for - contributing to the Apache Arrow project. - - +++ - - .. button-ref:: guide-introduction - :ref-type: ref - :click-parent: - :color: secondary - :expand: - - New Contributor's guide - - .. grid-item-card:: Overview - :img-top: ./images/code-solid.svg - :class-card: contrib-card - :shadow: md - - A short overview of the contributing process we follow - and some additional information you might need if you are not - new to the contributing process in general. - +++ - - .. button-ref:: contrib-overview - :ref-type: ref - :click-parent: - :color: secondary - :expand: - - Contributing overview - -Language specific -================= - -Connection to the specific language development pages: - -.. tab-set:: - - .. tab-item:: C++ - - * :ref:`cpp-development` - * :ref:`C++ Development Guidelines ` - * :ref:`building-arrow-cpp` - - .. tab-item:: Java - - * :doc:`java/index` - - .. tab-item:: Python - - * :ref:`python-development` - - .. tab-item:: R - - * `Arrow R Package: Developer environment setup `_ - * `Arrow R Package: Common developer workflow tasks `_ - - .. 
tab-item:: Ruby - - * `Red Arrow - Apache Arrow Ruby `_ diff --git a/docs/source/developers/images/book-open-solid.svg b/docs/source/developers/images/book-open-solid.svg index cbc8ed27256ca..9586e249be060 100644 --- a/docs/source/developers/images/book-open-solid.svg +++ b/docs/source/developers/images/book-open-solid.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/source/developers/images/bug-solid.svg b/docs/source/developers/images/bug-solid.svg index f842cb240544f..49cc04a1f0f6e 100644 --- a/docs/source/developers/images/bug-solid.svg +++ b/docs/source/developers/images/bug-solid.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/source/developers/images/code-solid.svg b/docs/source/developers/images/code-solid.svg index 725f767148b2c..4bbd567528ef8 100644 --- a/docs/source/developers/images/code-solid.svg +++ b/docs/source/developers/images/code-solid.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/source/developers/images/users-solid.svg b/docs/source/developers/images/users-solid.svg index a04d7fe2fd4a0..4bdf638a70f89 100644 --- a/docs/source/developers/images/users-solid.svg +++ b/docs/source/developers/images/users-solid.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst index f6a15a6c5452e..c2f10c9e95c47 100644 --- a/docs/source/developers/index.rst +++ b/docs/source/developers/index.rst @@ -15,12 +15,206 @@ .. specific language governing permissions and limitations .. under the License. +.. highlight:: console + +.. _developers: + +Development +=========== + +Connection to the specific language development pages: + +.. tab-set:: + + .. tab-item:: C++ + + * :ref:`cpp-development` + * :ref:`C++ Development Guidelines ` + * :ref:`building-arrow-cpp` + + .. tab-item:: Java + + * :doc:`java/index` + + .. tab-item:: Python + + * :ref:`python-development` + + .. tab-item:: R + + * `Arrow R Package: Developer environment setup `_ + * `Arrow R Package: Common developer workflow tasks `_ + + .. tab-item:: Ruby + + * `Red Arrow - Apache Arrow Ruby `_ + +.. _contributing: + +Contributing to Apache Arrow +============================ + +**Thanks for your interest in the Apache Arrow project.** + +Arrow is a large project and may seem overwhelming when you're +first getting involved. Contributing code is great, but that's +probably not the first place to start. There are lots of ways to +make valuable contributions to the project and community. + +This page provides some orientation for how to get involved. It also offers +some recommendations on how to get the best results when engaging with the +community. + +Code of Conduct +--------------- + +All participation in the Apache Arrow project is governed by the ASF's +`Code of Conduct `_. + +.. grid:: 2 + :gutter: 4 + :padding: 2 2 0 0 + :class-container: sd-text-center + + .. grid-item-card:: Apache Arrow Community + :img-top: ./images/users-solid.svg + :class-card: contrib-card + :shadow: none + + A good first step to getting involved in the Arrow project is to join + the mailing lists and participate in discussions where you can. + + +++ + + .. button-link:: https://arrow.apache.org/community/ + :click-parent: + :color: primary + :expand: + + To Apache Arrow Community + + .. 
grid-item-card:: Bug reports and feature requests + :img-top: ./images/bug-solid.svg + :class-card: contrib-card + :shadow: none + + Alerting us to unexpected behavior and missing features, even + if you can't solve the problems yourself, help us understand + and prioritize work to improve the libraries. + + +++ + + .. button-ref:: bug-reports + :ref-type: ref + :click-parent: + :color: primary + :expand: + + To Bug reports and feature requests + +.. dropdown:: Communicating through the mailing lists + :animate: fade-in-slide-down + :class-title: sd-fs-5 + :class-container: sd-shadow-none + + Projects in The Apache Software Foundation ("the ASF") use public, archived + mailing lists to create a public record of each project's development + activities and decision-making process. + + While lacking the immediacy of chat or other forms of communication, + the mailing lists give participants the opportunity to slow down and be + thoughtful in their responses, and they help developers who are spread across + many timezones to participate more equally. + + Read more on the `Apache Arrow Community `_ + page. + +.. dropdown:: Improve documentation + :animate: fade-in-slide-down + :class-title: sd-fs-5 + + A great way to contribute to the project is to improve documentation. If you + found some docs to be incomplete or inaccurate, share your hard-earned knowledge + with the rest of the community. + + Documentation improvements are also a great way to gain some experience with + our submission and review process, discussed below, without requiring a lot + of local development environment setup. In fact, many documentation-only changes + can be made directly in the GitHub web interface by clicking the "edit" button. + This will handle making a fork and a pull request for you. + + * :ref:`documentation` + * :ref:`building-docs` + +.. grid:: 2 + :gutter: 4 + :padding: 2 2 0 0 + :class-container: sd-text-center + + .. grid-item-card:: New Contributor's guide + :img-top: ./images/book-open-solid.svg + :class-card: contrib-card + + First time contributing? + + The New Contributor's Guide provides necessary information for + contributing to the Apache Arrow project. + + +++ + + .. button-ref:: guide-introduction + :ref-type: ref + :click-parent: + :color: primary + :expand: + + To the New Contributor's guide + + .. grid-item-card:: Contributing Overview + :img-top: ./images/code-solid.svg + :class-card: contrib-card + + A short overview of the contributing process we follow + and some additional information you might need if you are not + new to the contributing process in general. + +++ + + .. button-ref:: contrib-overview + :ref-type: ref + :click-parent: + :color: primary + :expand: + + To Contributing overview + +.. dropdown:: Continuous Integration + :animate: fade-in-slide-down + :class-title: sd-fs-5 + :class-container: sd-shadow-none + + Continuous Integration needs to run across different combinations of package managers, compilers, versions of multiple + software libraries, operating systems, and other potential sources of variation. + + Read more on the :ref:`continuous_integration` page. + +.. dropdown:: Benchmarks + :animate: fade-in-slide-down + :class-title: sd-fs-5 + :class-container: sd-shadow-none + + How to use the benchmark suite can be found on the :ref:`benchmarks` page. + +.. 
dropdown:: Release Guide + :animate: fade-in-slide-down + :class-title: sd-fs-5 + :class-container: sd-shadow-none + + To learn about the detailed information on the steps followed to perform a release, see :ref:`release`. + .. toctree:: :maxdepth: 2 - :caption: Development :hidden: - contributing bug_reports guide/index overview diff --git a/docs/source/developers/java/development.rst b/docs/source/developers/java/development.rst index 1094d02f1c140..ce7e1704f641c 100644 --- a/docs/source/developers/java/development.rst +++ b/docs/source/developers/java/development.rst @@ -84,11 +84,13 @@ UI Benchmark: Integration Testing =================== -Integration tests can be run via Archery: +Integration tests can be run :ref:`via Archery `. +For example, assuming you only built Arrow Java and want to run the IPC +integration tests, you would do: -.. code-block:: +.. code-block:: console - $ archery integration --with-java true --with-cpp false --with-js false --with-csharp false --with-go false --with-rust false + $ archery integration --run-ipc --with-java 1 Code Style ========== @@ -104,4 +106,4 @@ This checks the code style of all source code under the current directory or fro .. _benchmark: https://github.com/ursacomputing/benchmarks .. _archery: https://github.com/apache/arrow/blob/main/dev/conbench_envs/README.md#L188 .. _conbench: https://github.com/conbench/conbench -.. _checkstyle: https://github.com/apache/arrow/blob/main/java/dev/checkstyle/checkstyle.xml \ No newline at end of file +.. _checkstyle: https://github.com/apache/arrow/blob/main/java/dev/checkstyle/checkstyle.xml diff --git a/docs/source/developers/overview.rst b/docs/source/developers/overview.rst index 272f3dbd98074..c7bc4273313bc 100644 --- a/docs/source/developers/overview.rst +++ b/docs/source/developers/overview.rst @@ -45,7 +45,7 @@ checklist for using ``git``: .. dropdown:: How to squash local commits? :animate: fade-in-slide-down - :class-container: sd-shadow-md + :class-container: sd-shadow-none Abort the rebase with: @@ -78,7 +78,7 @@ checklist for using ``git``: .. dropdown:: Setting rebase to be default :animate: fade-in-slide-down - :class-container: sd-shadow-md + :class-container: sd-shadow-none If you set the following in your repo's ``.git/config``, the ``--rebase`` option can be omitted from the ``git pull`` command, as it is implied by default. @@ -136,7 +136,7 @@ will merge the pull request. This is done with a .. dropdown:: Details on squash merge :animate: fade-in-slide-down - :class-container: sd-shadow-md + :class-container: sd-shadow-none A pull request is merged with a squash merge so that all of your commits will be registered as a single commit to the main branch; this simplifies the diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 066400b33ffb5..6924c2d714e8b 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. 
_release: + ======================== Release Management Guide ======================== diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 5f2341b9c469c..e1160b287e77c 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -20,32 +20,98 @@ Integration Testing =================== +To ensure Arrow implementations are interoperable between each other, +the Arrow project includes cross-language integration tests which are +regularly run as Continuous Integration tasks. + +The integration tests exercise compliance with several Arrow specifications: +the :ref:`IPC format `, the :ref:`Flight RPC ` protocol, +and the :ref:`C Data Interface `. + +Strategy +-------- + Our strategy for integration testing between Arrow implementations is: -* Test datasets are specified in a custom human-readable, JSON-based format - designed exclusively for Arrow's integration tests -* Each implementation provides a testing executable capable of converting - between the JSON and the binary Arrow file representation -* Each testing executable is used to generate binary Arrow file representations - from the JSON-based test datasets. These results are then used to call the - testing executable of each other implementation to validate the contents - against the corresponding JSON file. - - *ie.* the C++ testing executable generates binary arrow files from JSON - specified datasets. The resulting files are then used as input to the Java - testing executable for validation, confirming that the Java implementation - can correctly read what the C++ implementation wrote. +* Test datasets are specified in a custom human-readable, + :ref:`JSON-based format ` designed exclusively + for Arrow's integration tests. + +* The JSON files are generated by the integration test harness. Different + files are used to represent different data types and features, such as + numerics, lists, dictionary encoding, etc. This makes it easier to pinpoint + incompatibilities than if all data types were represented in a single file. + +* Each implementation provides entry points capable of converting + between the JSON and the Arrow in-memory representation, and of exposing + Arrow in-memory data using the desired format. + +* Each format (whether Arrow IPC, Flight or the C Data Interface) is tested for + all supported pairs of (producer, consumer) implementations. The producer + typically reads a JSON file, converts it to in-memory Arrow data, and exposes + this data using the format under test. The consumer reads the data in the + said format and converts it back to Arrow in-memory data; it also reads + the same JSON file as the producer, and validates that both datasets are + identical. + +Example: IPC format +~~~~~~~~~~~~~~~~~~~ + +Let's say we are testing Arrow C++ as a producer and Arrow Java as a consumer +of the Arrow IPC format. Testing a JSON file would go as follows: + +#. A C++ executable reads the JSON file, converts it into Arrow in-memory data + and writes an Arrow IPC file (the file paths are typically given on the command + line). + +#. A Java executable reads the JSON file, converts it into Arrow in-memory data; + it also reads the Arrow IPC file generated by C++. Finally, it validates that + both Arrow in-memory datasets are equal. + +Example: C Data Interface +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now, let's say we are testing Arrow Go as a producer and Arrow C# as a consumer +of the Arrow C Data Interface. + +#. 
The integration testing harness allocates a C + :ref:`ArrowArray ` structure on the heap. + +#. A Go in-process entrypoint (for example a C-compatible function call) + reads a JSON file and exports one of its :term:`record batches ` + into the ``ArrowArray`` structure. + +#. A C# in-process entrypoint reads the same JSON file, converts the + same record batch into Arrow in-memory data; it also imports the + record batch exported by Arrow Go in the ``ArrowArray`` structure. + It validates that both record batches are equal, and then releases the + imported record batch. + +#. Depending on the implementation languages' abilities, the integration + testing harness may assert that memory consumption remained identical + (i.e., that the exported record batch didn't leak). + +#. At the end, the integration testing harness deallocates the ``ArrowArray`` + structure. + +.. _running_integration_tests: Running integration tests ------------------------- The integration test data generator and runner are implemented inside -the :ref:`Archery ` utility. +the :ref:`Archery ` utility. You need to install the ``integration`` +component of archery: + +.. code:: console + + $ pip install -e "dev/archery[integration]" The integration tests are run using the ``archery integration`` command. -.. code-block:: shell +.. code-block:: console - archery integration --help + $ archery integration --help In order to run integration tests, you'll first need to build each component you want to include. See the respective developer docs for C++, Java, etc. @@ -56,26 +122,26 @@ testing. For C++, for example, you need to add ``-DARROW_BUILD_INTEGRATION=ON`` to your cmake command. Depending on which components you have built, you can enable and add them to -the archery test run. For example, if you only have the C++ project built, run: +the archery test run. For example, if you only have the C++ project built +and want to run the Arrow IPC integration tests, run: .. code-block:: shell - archery integration --with-cpp=1 - + archery integration --run-ipc --with-cpp=1 For Java, it may look like: .. code-block:: shell - VERSION=0.11.0-SNAPSHOT + VERSION=14.0.0-SNAPSHOT export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar - archery integration --with-cpp=1 --with-java=1 + archery integration --run-ipc --with-cpp=1 --with-java=1 -To run all tests, including Flight integration tests, do: +To run all tests, including Flight and C Data Interface integration tests, do: .. code-block:: shell - archery integration --with-all --run-flight + archery integration --with-all --run-flight --run-ipc --run-c-data Note that we run these tests in continuous integration, and the CI job uses docker-compose. You may also run the docker-compose job locally, or at least @@ -85,6 +151,8 @@ certain tests. See :ref:`docker-builds` for more information about the project's ``docker-compose`` configuration. +.. _format_json_integration: + JSON test data format --------------------- @@ -415,7 +483,7 @@ will have count 28. For "null" type, ``BufferData`` does not contain any buffers. 
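To make the C Data Interface handshake described above concrete, the same producer/consumer round trip can be sketched with pyarrow's ``pyarrow.cffi`` module and its ``_export_to_c``/``_import_from_c`` entry points. This is a minimal illustration, not the integration harness itself: the pyarrow hooks are real, but the toy record batch below stands in for data read from a JSON test file.

.. code-block:: python

   import pyarrow as pa
   from pyarrow.cffi import ffi  # cffi declarations of ArrowSchema/ArrowArray

   # The harness allocates the C structures (here on the cffi heap).
   c_schema = ffi.new("struct ArrowSchema*")
   c_array = ffi.new("struct ArrowArray*")
   ptr_schema = int(ffi.cast("uintptr_t", c_schema))
   ptr_array = int(ffi.cast("uintptr_t", c_array))

   # Producer: build a record batch and export it into the C structures.
   batch = pa.RecordBatch.from_arrays(
       [pa.array([1, 2, 3], type=pa.int64())], names=["x"])
   batch._export_to_c(ptr_array, ptr_schema)

   # Consumer: import from the same structures and validate. Importing
   # moves ownership, so releasing the imported batch releases the data.
   imported = pa.RecordBatch._import_from_c(ptr_array, ptr_schema)
   assert imported.equals(batch)

In the real tests the consumer validates against the JSON file rather than against in-process data, which is what makes the check meaningful across two different implementations.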
Archery Integration Test Cases --------------------------------------- +------------------------------ This list can make it easier to understand what manual testing may need to be done for any future Arrow Format changes by knowing what cases the automated diff --git a/docs/source/format/index.rst b/docs/source/format/index.rst index 1771b36d76128..ae2baf128b472 100644 --- a/docs/source/format/index.rst +++ b/docs/source/format/index.rst @@ -15,10 +15,13 @@ .. specific language governing permissions and limitations .. under the License. +.. _format: + +Specifications and Protocols +============================ + .. toctree:: :maxdepth: 2 - :caption: Specifications and Protocols - :hidden: Versioning Columnar diff --git a/docs/source/index.rst b/docs/source/index.rst index b348d3dab22b7..e8cdf50c5b1ec 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +:html_theme.sidebar_secondary.remove: + Apache Arrow ============ @@ -35,11 +37,71 @@ such topics as: **To learn how to use Arrow refer to the documentation specific to your target environment.** +.. grid:: 2 + :gutter: 4 + :padding: 2 2 0 0 + :class-container: sd-text-center + + .. grid-item-card:: Specifications and Protocols + :class-card: contrib-card + :shadow: none + + Read about the Apache Arrow format + specifications and Protocols. + + +++ + + .. button-ref:: format + :ref-type: ref + :click-parent: + :color: primary + :expand: + + To the Specifications and Protocols + + .. grid-item-card:: Development + :class-card: contrib-card + :shadow: none + + Find the documentation on the topic of + contributions, reviews, building of the libraries + from source, building of the documentation, + continuous integration, benchmarks and the + release process. + + +++ + + .. button-ref:: developers + :ref-type: ref + :click-parent: + :color: primary + :expand: + + To the Development + +.. _toc.columnar: + +.. toctree:: + :maxdepth: 2 + :hidden: + + format/index + +.. _toc.development: + +.. toctree:: + :maxdepth: 2 + :hidden: + + developers/index + +Implementations +--------------- + .. _toc.usage: .. toctree:: :maxdepth: 1 - :caption: Supported Environments C/GLib C++ @@ -55,52 +117,15 @@ target environment.** Rust status +Examples +-------- + .. _toc.cookbook: .. toctree:: :maxdepth: 1 - :caption: Cookbooks - - C++ - Java - Python - R -.. _toc.columnar: - -.. toctree:: - :maxdepth: 2 - :caption: Specifications and Protocols - - format/Versioning - format/Columnar - format/CanonicalExtensions - format/Flight - format/FlightSql - format/Integration - format/CDataInterface - format/CStreamInterface - format/CDeviceDataInterface - format/ADBC - format/Other - format/Changing - format/Glossary - -.. _toc.development: - -.. toctree:: - :maxdepth: 2 - :caption: Development - - developers/contributing - developers/bug_reports - developers/guide/index - developers/overview - developers/reviewing - developers/cpp/index - developers/java/index - developers/python - developers/continuous_integration/index - developers/benchmarks - developers/documentation - developers/release + C++ cookbook + Java cookbook + Python cookbook + R cookbook diff --git a/docs/source/java/index.rst b/docs/source/java/index.rst index 9b555e297b0f9..cf93b0e897832 100644 --- a/docs/source/java/index.rst +++ b/docs/source/java/index.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. 
_java: + Java Implementation =================== @@ -41,3 +43,4 @@ on the Arrow format and other language bindings see the :doc:`parent documentati cdata jdbc Reference (javadoc) + Java cookbook diff --git a/docs/source/js/index.rst b/docs/source/js/index.rst index 77813c1372dfe..2ab205a08b850 100644 --- a/docs/source/js/index.rst +++ b/docs/source/js/index.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _js: + JavaScript docs =============== diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 75d1e88970c29..0ce2ddd698414 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -216,7 +216,7 @@ value during the conversion. If an integer input is supplied to To handle better compatibility with Pandas, we support interpreting NaN values as null elements. This is enabled automatically on all ``from_pandas`` function and -can be enable on the other conversion functions by passing ``from_pandas=True`` +can be enabled on the other conversion functions by passing ``from_pandas=True`` as a function parameter. List arrays @@ -510,7 +510,7 @@ a new schema and cast the data to this schema: t2.schema.field("f1").metadata t2.schema.metadata -Metadata key and value pair are ``std::string`` objects in the C++ implementation +Metadata key and value pairs are ``std::string`` objects in the C++ implementation and so they are bytes objects (``b'...'``) in Python. Record Batch Readers diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index b80cbc7de594e..6a3de3d42b149 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -15,8 +15,13 @@ .. specific language governing permissions and limitations .. under the License. +.. _python: + +Python +====== + PyArrow - Apache Arrow Python bindings -====================================== +-------------------------------------- This is the documentation of the Python API of Apache Arrow. @@ -62,3 +67,4 @@ files into Arrow structures. api getting_involved benchmarks + Python cookbook diff --git a/docs/source/r/index.rst b/docs/source/r/index.rst index b799544bb6bb3..8ccbec132ad3d 100644 --- a/docs/source/r/index.rst +++ b/docs/source/r/index.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. 
_r: + R docs ====== diff --git a/docs/source/status.rst b/docs/source/status.rst index 36c29fcdc4da6..e2b3852e2229f 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -46,7 +46,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Decimal128 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Decimal256 | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ | | +| Decimal256 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Date32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -54,7 +54,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Timestamp | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Duration | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +| Duration | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Interval | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -83,9 +83,9 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Map | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Dense Union | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +| Dense Union | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Sparse Union | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +| Sparse Union | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/format/Flight.proto b/format/Flight.proto index baf2733d81048..de3794f05ba83 100644 --- a/format/Flight.proto +++ b/format/Flight.proto @@ -366,6 +366,17 @@ message FlightInfo { * FlightEndpoints are in the same order as the data. */ bool ordered = 6; + + /* + * Application-defined metadata. + * + * There is no inherent or required relationship between this + * and the app_metadata fields in the FlightEndpoints or resulting + * FlightData messages. Since this metadata is application-defined, + * a given application could define there to be a relationship, + * but there is none required by the spec. + */ + bytes app_metadata = 7; } /* @@ -446,6 +457,17 @@ message FlightEndpoint { * application-defined whether DoGet requests may be retried. */ google.protobuf.Timestamp expiration_time = 3; + + /* + * Application-defined metadata. + * + * There is no inherent or required relationship between this + * and the app_metadata fields in the FlightInfo or resulting + * FlightData messages. Since this metadata is application-defined, + * a given application could define there to be a relationship, + * but there is none required by the spec. 
+ */ + bytes app_metadata = 4; } /* diff --git a/go/arrow/array/numericbuilder.gen.go b/go/arrow/array/numericbuilder.gen.go index d3648f0b1fbc4..7f01180f55957 100644 --- a/go/arrow/array/numericbuilder.gen.go +++ b/go/arrow/array/numericbuilder.gen.go @@ -86,7 +86,7 @@ func (b *Int64Builder) AppendEmptyValue() { } func (b *Int64Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -564,7 +564,7 @@ func (b *Float64Builder) AppendEmptyValue() { } func (b *Float64Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -803,7 +803,7 @@ func (b *Int32Builder) AppendEmptyValue() { } func (b *Int32Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -1042,7 +1042,7 @@ func (b *Uint32Builder) AppendEmptyValue() { } func (b *Uint32Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -1281,7 +1281,7 @@ func (b *Float32Builder) AppendEmptyValue() { } func (b *Float32Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -1520,7 +1520,7 @@ func (b *Int16Builder) AppendEmptyValue() { } func (b *Int16Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -1759,7 +1759,7 @@ func (b *Uint16Builder) AppendEmptyValue() { } func (b *Uint16Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -1998,7 +1998,7 @@ func (b *Int8Builder) AppendEmptyValue() { } func (b *Int8Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -2237,7 +2237,7 @@ func (b *Uint8Builder) AppendEmptyValue() { } func (b *Uint8Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -2477,7 +2477,7 @@ func (b *Time32Builder) AppendEmptyValue() { } func (b *Time32Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -2717,7 +2717,7 @@ func (b *Time64Builder) AppendEmptyValue() { } func (b *Time64Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -2956,7 +2956,7 @@ func (b *Date32Builder) AppendEmptyValue() { } func (b *Date32Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -3195,7 +3195,7 @@ func (b *Date64Builder) AppendEmptyValue() { } func (b *Date64Builder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } @@ -3435,7 +3435,7 @@ func (b *DurationBuilder) AppendEmptyValue() { } func (b *DurationBuilder) AppendEmptyValues(n int) { - for i := 0; i < n; i ++ { + for i := 0; i < n; i++ { b.AppendEmptyValue() } } diff --git a/go/arrow/cdata/cdata.go b/go/arrow/cdata/cdata.go index bc8fc6e987b93..dc8825a7edb67 100644 --- a/go/arrow/cdata/cdata.go +++ b/go/arrow/cdata/cdata.go @@ -197,7 +197,7 @@ func importSchema(schema *CArrowSchema) (ret arrow.Field, err error) { // handle types with params via colon typs := strings.Split(f, ":") - defaulttz := "UTC" + defaulttz := "" switch typs[0] { case "tss": tz := typs[1] diff --git a/go/arrow/cdata/cdata_exports.go b/go/arrow/cdata/cdata_exports.go index 
ae6247494b100..187c2deb9755f 100644 --- a/go/arrow/cdata/cdata_exports.go +++ b/go/arrow/cdata/cdata_exports.go @@ -368,34 +368,36 @@ func exportArray(arr arrow.Array, out *CArrowArray, outSchema *CArrowSchema) { exportField(arrow.Field{Type: arr.DataType()}, outSchema) } + nbuffers := len(arr.Data().Buffers()) + buf_offset := 0 + // Some types don't have validity bitmaps, but we keep them shifted + // to make processing easier in other contexts. This means that + // we have to adjust when exporting. + has_validity_bitmap := internal.DefaultHasValidityBitmap(arr.DataType().ID()) + if nbuffers > 0 && !has_validity_bitmap { + nbuffers-- + buf_offset++ + } + out.dictionary = nil out.null_count = C.int64_t(arr.NullN()) out.length = C.int64_t(arr.Len()) out.offset = C.int64_t(arr.Data().Offset()) - out.n_buffers = C.int64_t(len(arr.Data().Buffers())) - - if out.n_buffers > 0 { - var ( - nbuffers = len(arr.Data().Buffers()) - bufs = arr.Data().Buffers() - ) - // unions don't have validity bitmaps, but we keep them shifted - // to make processing easier in other contexts. This means that - // we have to adjust for union arrays - if !internal.DefaultHasValidityBitmap(arr.DataType().ID()) { - out.n_buffers-- - nbuffers-- - bufs = bufs[1:] - } + out.n_buffers = C.int64_t(nbuffers) + out.buffers = nil + + if nbuffers > 0 { + bufs := arr.Data().Buffers() buffers := allocateBufferPtrArr(nbuffers) - for i := range bufs { - buf := bufs[i] + for i, buf := range bufs[buf_offset:] { if buf == nil || buf.Len() == 0 { - if i > 0 || !internal.DefaultHasValidityBitmap(arr.DataType().ID()) { + if i > 0 || !has_validity_bitmap { // apache/arrow#33936: export a dummy buffer to be friendly to // implementations that don't import NULL properly buffers[i] = (*C.void)(unsafe.Pointer(&C.kGoCdataZeroRegion)) } else { + // null pointer permitted for the validity bitmap + // (assuming null count is 0) buffers[i] = nil } continue diff --git a/go/arrow/cdata/cdata_test.go b/go/arrow/cdata/cdata_test.go index a0c2f25496a6b..af05649b1c541 100644 --- a/go/arrow/cdata/cdata_test.go +++ b/go/arrow/cdata/cdata_test.go @@ -184,13 +184,17 @@ func TestImportTemporalSchema(t *testing.T) { {arrow.FixedWidthTypes.MonthInterval, "tiM"}, {arrow.FixedWidthTypes.DayTimeInterval, "tiD"}, {arrow.FixedWidthTypes.MonthDayNanoInterval, "tin"}, - {arrow.FixedWidthTypes.Timestamp_s, "tss:"}, + {arrow.FixedWidthTypes.Timestamp_s, "tss:UTC"}, + {&arrow.TimestampType{Unit: arrow.Second}, "tss:"}, {&arrow.TimestampType{Unit: arrow.Second, TimeZone: "Europe/Paris"}, "tss:Europe/Paris"}, - {arrow.FixedWidthTypes.Timestamp_ms, "tsm:"}, + {arrow.FixedWidthTypes.Timestamp_ms, "tsm:UTC"}, + {&arrow.TimestampType{Unit: arrow.Millisecond}, "tsm:"}, {&arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "Europe/Paris"}, "tsm:Europe/Paris"}, - {arrow.FixedWidthTypes.Timestamp_us, "tsu:"}, + {arrow.FixedWidthTypes.Timestamp_us, "tsu:UTC"}, + {&arrow.TimestampType{Unit: arrow.Microsecond}, "tsu:"}, {&arrow.TimestampType{Unit: arrow.Microsecond, TimeZone: "Europe/Paris"}, "tsu:Europe/Paris"}, - {arrow.FixedWidthTypes.Timestamp_ns, "tsn:"}, + {arrow.FixedWidthTypes.Timestamp_ns, "tsn:UTC"}, + {&arrow.TimestampType{Unit: arrow.Nanosecond}, "tsn:"}, {&arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "Europe/Paris"}, "tsn:Europe/Paris"}, } diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index 7f62becdc2884..fc0b3aea56e70 100644 --- a/go/arrow/datatype_fixedwidth.go +++ b/go/arrow/datatype_fixedwidth.go @@ -347,7 +347,7 @@ type 
TemporalWithUnit interface { } // TimestampType is encoded as a 64-bit signed integer since the UNIX epoch (1970-01-01T00:00:00Z). -// The zero-value is a nanosecond and time zone neutral. Time zone neutral can be +// The zero-value is a second and time zone neutral. Time zone neutral can be // considered UTC without having "UTC" as a time zone. type TimestampType struct { Unit TimeUnit diff --git a/go/arrow/flight/gen.go b/go/arrow/flight/gen.go index cfdd0e036703a..29ae54b38f427 100644 --- a/go/arrow/flight/gen.go +++ b/go/arrow/flight/gen.go @@ -16,5 +16,5 @@ package flight -//go:generate protoc -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative Flight.proto +//go:generate protoc --experimental_allow_proto3_optional -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative Flight.proto //go:generate protoc --experimental_allow_proto3_optional -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative FlightSql.proto diff --git a/go/arrow/flight/gen/flight/Flight.pb.go b/go/arrow/flight/gen/flight/Flight.pb.go index 861b37d90d088..0438bca28be50 100644 --- a/go/arrow/flight/gen/flight/Flight.pb.go +++ b/go/arrow/flight/gen/flight/Flight.pb.go @@ -18,7 +18,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.28.1 -// protoc v3.21.12 +// protoc v4.23.4 // source: Flight.proto package flight @@ -38,6 +38,7 @@ const ( _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) +// // The result of a cancel operation. // // This is used by CancelFlightInfoResult.status. @@ -102,17 +103,19 @@ func (CancelStatus) EnumDescriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{0} } +// // Describes what type of descriptor is defined. type FlightDescriptor_DescriptorType int32 const ( // Protobuf pattern, not used. FlightDescriptor_UNKNOWN FlightDescriptor_DescriptorType = 0 + // // A named path that identifies a dataset. A path is composed of a string // or list of strings describing a particular dataset. This is conceptually - // - // similar to a path inside a filesystem. + // similar to a path inside a filesystem. FlightDescriptor_PATH FlightDescriptor_DescriptorType = 1 + // // An opaque command to generate a dataset. FlightDescriptor_CMD FlightDescriptor_DescriptorType = 2 ) @@ -158,14 +161,17 @@ func (FlightDescriptor_DescriptorType) EnumDescriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{12, 0} } +// // The request that a client provides to a server on handshake. type HandshakeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // A defined protocol version ProtocolVersion uint64 `protobuf:"varint,1,opt,name=protocol_version,json=protocolVersion,proto3" json:"protocol_version,omitempty"` + // // Arbitrary auth/handshake info. Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` } @@ -221,8 +227,10 @@ type HandshakeResponse struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // A defined protocol version ProtocolVersion uint64 `protobuf:"varint,1,opt,name=protocol_version,json=protocolVersion,proto3" json:"protocol_version,omitempty"` + // // Arbitrary auth/handshake info. 
Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` } @@ -273,6 +281,7 @@ func (x *HandshakeResponse) GetPayload() []byte { return nil } +// // A message for doing simple auth. type BasicAuth struct { state protoimpl.MessageState @@ -367,6 +376,7 @@ func (*Empty) Descriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{3} } +// // Describes an available action, including both the name used for execution // along with a short description of the purpose of the action. type ActionType struct { @@ -424,6 +434,7 @@ func (x *ActionType) GetDescription() string { return "" } +// // A service specific expression that can be used to return a limited set // of available Arrow Flight streams. type Criteria struct { @@ -473,6 +484,7 @@ func (x *Criteria) GetExpression() []byte { return nil } +// // An opaque action specific for the service. type Action struct { state protoimpl.MessageState @@ -529,6 +541,7 @@ func (x *Action) GetBody() []byte { return nil } +// // The request of the CancelFlightInfo action. // // The request should be stored in Action.body. @@ -579,6 +592,7 @@ func (x *CancelFlightInfoRequest) GetInfo() *FlightInfo { return nil } +// // The request of the RenewFlightEndpoint action. // // The request should be stored in Action.body. @@ -629,6 +643,7 @@ func (x *RenewFlightEndpointRequest) GetEndpoint() *FlightEndpoint { return nil } +// // An opaque result returned after executing an action. type Result struct { state protoimpl.MessageState @@ -677,6 +692,7 @@ func (x *Result) GetBody() []byte { return nil } +// // The result of the CancelFlightInfo action. // // The result should be stored in Result.body. @@ -727,6 +743,7 @@ func (x *CancelFlightInfoResult) GetStatus() CancelStatus { return CancelStatus_CANCEL_STATUS_UNSPECIFIED } +// // Wrap the result of a getSchema call type SchemaResult struct { state protoimpl.MessageState @@ -734,10 +751,9 @@ type SchemaResult struct { unknownFields protoimpl.UnknownFields // The schema of the dataset in its IPC form: - // - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema Schema []byte `protobuf:"bytes,1,opt,name=schema,proto3" json:"schema,omitempty"` } @@ -780,6 +796,7 @@ func (x *SchemaResult) GetSchema() []byte { return nil } +// // The name or tag for a Flight. May be used as a way to retrieve or generate // a flight or be used to expose a set of previously defined flights. type FlightDescriptor struct { @@ -788,9 +805,11 @@ type FlightDescriptor struct { unknownFields protoimpl.UnknownFields Type FlightDescriptor_DescriptorType `protobuf:"varint,1,opt,name=type,proto3,enum=arrow.flight.protocol.FlightDescriptor_DescriptorType" json:"type,omitempty"` + // // Opaque value used to express a command. Should only be defined when // type = CMD. Cmd []byte `protobuf:"bytes,2,opt,name=cmd,proto3" json:"cmd,omitempty"` + // // List of strings identifying a particular dataset. Should only be defined // when type = PATH. Path []string `protobuf:"bytes,3,rep,name=path,proto3" json:"path,omitempty"` @@ -849,6 +868,7 @@ func (x *FlightDescriptor) GetPath() []string { return nil } +// // The access coordinates for retrieval of a dataset. With a FlightInfo, a // consumer is able to determine how to retrieve a dataset. 
type FlightInfo struct { @@ -857,13 +877,14 @@ type FlightInfo struct { unknownFields protoimpl.UnknownFields // The schema of the dataset in its IPC form: - // - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema Schema []byte `protobuf:"bytes,1,opt,name=schema,proto3" json:"schema,omitempty"` + // // The descriptor associated with this info. FlightDescriptor *FlightDescriptor `protobuf:"bytes,2,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // A list of endpoints associated with the flight. To consume the // whole flight, all endpoints (and hence all Tickets) must be // consumed. Endpoints can be consumed in any order. @@ -883,15 +904,19 @@ type FlightInfo struct { // ordering is important for an application, an application must // choose one of them: // - // - An application requires that all clients must read data in - // returned endpoints order. - // - An application must return the all data in a single endpoint. + // * An application requires that all clients must read data in + // returned endpoints order. + // * An application must return the all data in a single endpoint. Endpoint []*FlightEndpoint `protobuf:"bytes,3,rep,name=endpoint,proto3" json:"endpoint,omitempty"` // Set these to -1 if unknown. TotalRecords int64 `protobuf:"varint,4,opt,name=total_records,json=totalRecords,proto3" json:"total_records,omitempty"` TotalBytes int64 `protobuf:"varint,5,opt,name=total_bytes,json=totalBytes,proto3" json:"total_bytes,omitempty"` + // // FlightEndpoints are in the same order as the data. Ordered bool `protobuf:"varint,6,opt,name=ordered,proto3" json:"ordered,omitempty"` + // + // Application-defined metadata. + AppMetadata []byte `protobuf:"bytes,7,opt,name=app_metadata,json=appMetadata,proto3" json:"app_metadata,omitempty"` } func (x *FlightInfo) Reset() { @@ -968,12 +993,21 @@ func (x *FlightInfo) GetOrdered() bool { return false } +func (x *FlightInfo) GetAppMetadata() []byte { + if x != nil { + return x.AppMetadata + } + return nil +} + +// // The information to process a long-running query. type PollInfo struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // The currently available results. // // If "flight_descriptor" is not specified, the query is complete @@ -991,12 +1025,15 @@ type PollInfo struct { // ticket in the info before the query is // completed. FlightInfo.ordered is also valid. Info *FlightInfo `protobuf:"bytes,1,opt,name=info,proto3" json:"info,omitempty"` + // // The descriptor the client should use on the next try. // If unset, the query is complete. FlightDescriptor *FlightDescriptor `protobuf:"bytes,2,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // Query progress. If known, must be in [0.0, 1.0] but need not be // monotonic or nondecreasing. If unknown, do not set. Progress *float64 `protobuf:"fixed64,3,opt,name=progress,proto3,oneof" json:"progress,omitempty"` + // // Expiration time for this request. After this passes, the server // might not accept the retry descriptor anymore (and the query may // be cancelled). This may be updated on a call to PollFlightInfo. 
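As a concrete illustration of the new app_metadata fields regenerated above, here is a minimal Go sketch. It is not part of this patch; the module import path/version and the metadata payloads are illustrative assumptions only. Because both fields are opaque bytes, any structure inside them (JSON, a ticket hint, etc.) is purely an application-level convention.

package main

import (
	"fmt"

	"github.com/apache/arrow/go/v13/arrow/flight" // assumed import path for a release containing this change
)

func main() {
	// Populate the new application-defined metadata fields. The spec
	// imposes no relationship between FlightInfo.app_metadata (field 7)
	// and FlightEndpoint.app_metadata (field 4).
	info := &flight.FlightInfo{
		AppMetadata: []byte(`{"query-id":"q-123"}`), // hypothetical payload
		Endpoint: []*flight.FlightEndpoint{{
			Ticket:      &flight.Ticket{Ticket: []byte("partition-0")},
			AppMetadata: []byte("shard=0"), // hypothetical payload
		}},
	}

	fmt.Printf("info metadata: %s\n", info.GetAppMetadata())
	for _, ep := range info.GetEndpoint() {
		fmt.Printf("endpoint metadata: %s\n", ep.GetAppMetadata())
	}
}
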
@@ -1063,14 +1100,17 @@ func (x *PollInfo) GetExpirationTime() *timestamppb.Timestamp { return nil } +// // A particular stream or split associated with a flight. type FlightEndpoint struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Token used to retrieve this stream. Ticket *Ticket `protobuf:"bytes,1,opt,name=ticket,proto3" json:"ticket,omitempty"` + // // A list of URIs where this ticket can be redeemed via DoGet(). // // If the list is empty, the expectation is that the ticket can only @@ -1086,10 +1126,14 @@ type FlightEndpoint struct { // In other words, an application can use multiple locations to // represent redundant and/or load balanced services. Location []*Location `protobuf:"bytes,2,rep,name=location,proto3" json:"location,omitempty"` + // // Expiration time of this stream. If present, clients may assume // they can retry DoGet requests. Otherwise, it is // application-defined whether DoGet requests may be retried. ExpirationTime *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=expiration_time,json=expirationTime,proto3" json:"expiration_time,omitempty"` + // + // Application-defined metadata. + AppMetadata []byte `protobuf:"bytes,4,opt,name=app_metadata,json=appMetadata,proto3" json:"app_metadata,omitempty"` } func (x *FlightEndpoint) Reset() { @@ -1145,6 +1189,14 @@ func (x *FlightEndpoint) GetExpirationTime() *timestamppb.Timestamp { return nil } +func (x *FlightEndpoint) GetAppMetadata() []byte { + if x != nil { + return x.AppMetadata + } + return nil +} + +// // A location where a Flight service will accept retrieval of a particular // stream given a ticket. type Location struct { @@ -1194,6 +1246,7 @@ func (x *Location) GetUri() string { return "" } +// // An opaque identifier that the service can use to retrieve a particular // portion of a stream. // @@ -1246,19 +1299,24 @@ func (x *Ticket) GetTicket() []byte { return nil } +// // A batch of Arrow data as part of a stream of batches. type FlightData struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // The descriptor of the data. This is only relevant when a client is // starting a new DoPut stream. FlightDescriptor *FlightDescriptor `protobuf:"bytes,1,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // Header for message data as described in Message.fbs::Message. DataHeader []byte `protobuf:"bytes,2,opt,name=data_header,json=dataHeader,proto3" json:"data_header,omitempty"` + // // Application-defined metadata. AppMetadata []byte `protobuf:"bytes,3,opt,name=app_metadata,json=appMetadata,proto3" json:"app_metadata,omitempty"` + // // The actual batch of Arrow data. Preferably handled with minimal-copies // coming last in the definition to help with sidecar patterns (it is // expected that some implementations will fetch this field off the wire @@ -1326,7 +1384,7 @@ func (x *FlightData) GetDataBody() []byte { return nil } -// * +//* // The response message associated with the submission of a DoPut. 
type PutResult struct { state protoimpl.MessageState @@ -1441,7 +1499,7 @@ var file_Flight_proto_rawDesc = []byte{ 0x22, 0x30, 0x0a, 0x0e, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x50, 0x41, 0x54, 0x48, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x43, 0x4d, 0x44, - 0x10, 0x02, 0x22, 0x9d, 0x02, 0x0a, 0x0a, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, + 0x10, 0x02, 0x22, 0xc0, 0x02, 0x0a, 0x0a, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x06, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x12, 0x54, 0x0a, 0x11, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x18, 0x02, @@ -1459,129 +1517,133 @@ var file_Flight_proto_rawDesc = []byte{ 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0a, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x65, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x6f, 0x72, 0x64, 0x65, 0x72, - 0x65, 0x64, 0x22, 0x8a, 0x02, 0x0a, 0x08, 0x50, 0x6f, 0x6c, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x12, - 0x35, 0x0a, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, - 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, - 0x52, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x12, 0x54, 0x0a, 0x11, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x5f, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x0b, 0x32, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x52, 0x10, 0x66, 0x6c, 0x69, 0x67, - 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x12, 0x1f, 0x0a, 0x08, - 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x01, 0x48, 0x00, - 0x52, 0x08, 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x88, 0x01, 0x01, 0x12, 0x43, 0x0a, - 0x0f, 0x65, 0x78, 0x70, 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x74, 0x69, 0x6d, 0x65, - 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, - 0x6d, 0x70, 0x52, 0x0e, 0x65, 0x78, 0x70, 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x69, - 0x6d, 0x65, 0x42, 0x0b, 0x0a, 0x09, 0x5f, 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x22, - 0xc9, 0x01, 0x0a, 0x0e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, - 0x6e, 0x74, 0x12, 0x35, 0x0a, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x18, 0x01, 0x20, 0x01, - 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, - 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x54, 0x69, 0x63, 0x6b, 0x65, - 0x74, 0x52, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x12, 0x3b, 0x0a, 0x08, 0x6c, 0x6f, 0x63, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1f, 0x2e, 0x61, 0x72, + 0x65, 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x61, 0x70, 0x70, 
0x5f, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, + 0x74, 0x61, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0b, 0x61, 0x70, 0x70, 0x4d, 0x65, 0x74, + 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x8a, 0x02, 0x0a, 0x08, 0x50, 0x6f, 0x6c, 0x6c, 0x49, 0x6e, + 0x66, 0x6f, 0x12, 0x35, 0x0a, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, + 0x32, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, + 0x6e, 0x66, 0x6f, 0x52, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x12, 0x54, 0x0a, 0x11, 0x66, 0x6c, 0x69, + 0x67, 0x68, 0x74, 0x5f, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, + 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, + 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x52, 0x10, 0x66, + 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x12, + 0x1f, 0x0a, 0x08, 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x01, 0x48, 0x00, 0x52, 0x08, 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x88, 0x01, 0x01, + 0x12, 0x43, 0x0a, 0x0f, 0x65, 0x78, 0x70, 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x74, + 0x69, 0x6d, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, + 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, + 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x0e, 0x65, 0x78, 0x70, 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x42, 0x0b, 0x0a, 0x09, 0x5f, 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, + 0x73, 0x73, 0x22, 0xec, 0x01, 0x0a, 0x0e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x45, 0x6e, 0x64, + 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x12, 0x35, 0x0a, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, + 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x54, 0x69, + 0x63, 0x6b, 0x65, 0x74, 0x52, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x12, 0x3b, 0x0a, 0x08, + 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1f, + 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x4c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, + 0x08, 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x43, 0x0a, 0x0f, 0x65, 0x78, 0x70, + 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x18, 0x03, 0x20, 0x01, + 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x0e, + 0x65, 0x78, 0x70, 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x12, 0x21, + 0x0a, 0x0c, 0x61, 0x70, 0x70, 0x5f, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0x04, + 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0b, 0x61, 0x70, 0x70, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, + 0x61, 0x22, 0x1c, 0x0a, 0x08, 0x4c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x10, 0x0a, + 0x03, 0x75, 0x72, 0x69, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x75, 0x72, 0x69, 0x22, + 0x20, 0x0a, 0x06, 0x54, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x12, 0x16, 0x0a, 0x06, 
0x74, 0x69, 0x63, + 0x6b, 0x65, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, + 0x74, 0x22, 0xc4, 0x01, 0x0a, 0x0a, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, + 0x12, 0x54, 0x0a, 0x11, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x65, 0x73, 0x63, 0x72, + 0x69, 0x70, 0x74, 0x6f, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x63, 0x6f, 0x6c, 0x2e, 0x4c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x6c, 0x6f, - 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x43, 0x0a, 0x0f, 0x65, 0x78, 0x70, 0x69, 0x72, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, - 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, - 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x0e, 0x65, 0x78, 0x70, - 0x69, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x22, 0x1c, 0x0a, 0x08, 0x4c, - 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x10, 0x0a, 0x03, 0x75, 0x72, 0x69, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x75, 0x72, 0x69, 0x22, 0x20, 0x0a, 0x06, 0x54, 0x69, 0x63, - 0x6b, 0x65, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x0c, 0x52, 0x06, 0x74, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x22, 0xc4, 0x01, 0x0a, 0x0a, - 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x54, 0x0a, 0x11, 0x66, 0x6c, - 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, - 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, - 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x52, 0x10, - 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, - 0x12, 0x1f, 0x0a, 0x0b, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, - 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0a, 0x64, 0x61, 0x74, 0x61, 0x48, 0x65, 0x61, 0x64, 0x65, - 0x72, 0x12, 0x21, 0x0a, 0x0c, 0x61, 0x70, 0x70, 0x5f, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, - 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0b, 0x61, 0x70, 0x70, 0x4d, 0x65, 0x74, 0x61, - 0x64, 0x61, 0x74, 0x61, 0x12, 0x1c, 0x0a, 0x09, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x62, 0x6f, 0x64, - 0x79, 0x18, 0xe8, 0x07, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61, 0x42, 0x6f, - 0x64, 0x79, 0x22, 0x2e, 0x0a, 0x09, 0x50, 0x75, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, - 0x21, 0x0a, 0x0c, 0x61, 0x70, 0x70, 0x5f, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0b, 0x61, 0x70, 0x70, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, - 0x74, 0x61, 0x2a, 0x8b, 0x01, 0x0a, 0x0c, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x53, 0x74, 0x61, - 0x74, 0x75, 0x73, 0x12, 0x1d, 0x0a, 0x19, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x53, 0x54, - 0x41, 0x54, 0x55, 0x53, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, - 0x10, 0x00, 0x12, 0x1b, 0x0a, 0x17, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, - 0x54, 0x55, 0x53, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x45, 0x44, 0x10, 0x01, 0x12, - 0x1c, 0x0a, 0x18, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, - 0x5f, 
0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x49, 0x4e, 0x47, 0x10, 0x02, 0x12, 0x21, 0x0a, - 0x1d, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x4e, - 0x4f, 0x54, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x41, 0x42, 0x4c, 0x45, 0x10, 0x03, - 0x32, 0x85, 0x07, 0x0a, 0x0d, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x53, 0x65, 0x72, 0x76, 0x69, - 0x63, 0x65, 0x12, 0x64, 0x0a, 0x09, 0x48, 0x61, 0x6e, 0x64, 0x73, 0x68, 0x61, 0x6b, 0x65, 0x12, + 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x6f, 0x72, 0x52, 0x10, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x12, 0x1f, 0x0a, 0x0b, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x68, + 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0a, 0x64, 0x61, 0x74, + 0x61, 0x48, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x21, 0x0a, 0x0c, 0x61, 0x70, 0x70, 0x5f, 0x6d, + 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0b, 0x61, + 0x70, 0x70, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x1c, 0x0a, 0x09, 0x64, 0x61, + 0x74, 0x61, 0x5f, 0x62, 0x6f, 0x64, 0x79, 0x18, 0xe8, 0x07, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x08, + 0x64, 0x61, 0x74, 0x61, 0x42, 0x6f, 0x64, 0x79, 0x22, 0x2e, 0x0a, 0x09, 0x50, 0x75, 0x74, 0x52, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x61, 0x70, 0x70, 0x5f, 0x6d, 0x65, 0x74, + 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0b, 0x61, 0x70, 0x70, + 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x2a, 0x8b, 0x01, 0x0a, 0x0c, 0x43, 0x61, 0x6e, + 0x63, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x1d, 0x0a, 0x19, 0x43, 0x41, 0x4e, + 0x43, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, + 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x1b, 0x0a, 0x17, 0x43, 0x41, 0x4e, 0x43, + 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, + 0x4c, 0x45, 0x44, 0x10, 0x01, 0x12, 0x1c, 0x0a, 0x18, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, + 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x49, 0x4e, + 0x47, 0x10, 0x02, 0x12, 0x21, 0x0a, 0x1d, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x53, 0x54, + 0x41, 0x54, 0x55, 0x53, 0x5f, 0x4e, 0x4f, 0x54, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, + 0x41, 0x42, 0x4c, 0x45, 0x10, 0x03, 0x32, 0x85, 0x07, 0x0a, 0x0d, 0x46, 0x6c, 0x69, 0x67, 0x68, + 0x74, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x64, 0x0a, 0x09, 0x48, 0x61, 0x6e, 0x64, + 0x73, 0x68, 0x61, 0x6b, 0x65, 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, + 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x48, 0x61, + 0x6e, 0x64, 0x73, 0x68, 0x61, 0x6b, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x28, + 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x48, 0x61, 0x6e, 0x64, 0x73, 0x68, 0x61, 0x6b, 0x65, + 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x28, 0x01, 0x30, 0x01, 0x12, 0x55, + 0x0a, 0x0b, 0x4c, 0x69, 0x73, 0x74, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x73, 0x12, 0x1f, 0x2e, + 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, + 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x43, 0x72, 0x69, 0x74, 0x65, 0x72, 0x69, 0x61, 0x1a, 0x21, + 0x2e, 0x61, 0x72, 0x72, 0x6f, 
0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, + 0x6f, 0x22, 0x00, 0x30, 0x01, 0x12, 0x5d, 0x0a, 0x0d, 0x47, 0x65, 0x74, 0x46, 0x6c, 0x69, 0x67, + 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, + 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, + 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x1a, + 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, + 0x66, 0x6f, 0x22, 0x00, 0x12, 0x5c, 0x0a, 0x0e, 0x50, 0x6f, 0x6c, 0x6c, 0x46, 0x6c, 0x69, 0x67, + 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, + 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, + 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x1a, + 0x1f, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x50, 0x6f, 0x6c, 0x6c, 0x49, 0x6e, 0x66, 0x6f, + 0x22, 0x00, 0x12, 0x5b, 0x0a, 0x09, 0x47, 0x65, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x48, 0x61, 0x6e, 0x64, 0x73, 0x68, 0x61, 0x6b, - 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x28, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, + 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x1a, 0x23, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, - 0x2e, 0x48, 0x61, 0x6e, 0x64, 0x73, 0x68, 0x61, 0x6b, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x22, 0x00, 0x28, 0x01, 0x30, 0x01, 0x12, 0x55, 0x0a, 0x0b, 0x4c, 0x69, 0x73, 0x74, - 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x73, 0x12, 0x1f, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, - 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, - 0x43, 0x72, 0x69, 0x74, 0x65, 0x72, 0x69, 0x61, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, + 0x2e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, + 0x4d, 0x0a, 0x05, 0x44, 0x6f, 0x47, 0x65, 0x74, 0x12, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, - 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x22, 0x00, 0x30, 0x01, 0x12, - 0x5d, 0x0a, 0x0d, 0x47, 0x65, 0x74, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, - 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, - 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, - 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, - 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x22, 0x00, 0x12, 0x5c, - 0x0a, 0x0e, 0x50, 0x6f, 0x6c, 0x6c, 0x46, 0x6c, 0x69, 0x67, 
0x68, 0x74, 0x49, 0x6e, 0x66, 0x6f, - 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, + 0x2e, 0x54, 0x69, 0x63, 0x6b, 0x65, 0x74, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, + 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, + 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, 0x22, 0x00, 0x30, 0x01, 0x12, 0x52, + 0x0a, 0x05, 0x44, 0x6f, 0x50, 0x75, 0x74, 0x12, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, + 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, + 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, 0x1a, 0x20, 0x2e, 0x61, 0x72, 0x72, + 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, + 0x6f, 0x6c, 0x2e, 0x50, 0x75, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x28, 0x01, + 0x30, 0x01, 0x12, 0x58, 0x0a, 0x0a, 0x44, 0x6f, 0x45, 0x78, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, + 0x12, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, - 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x1a, 0x1f, 0x2e, 0x61, 0x72, 0x72, 0x6f, - 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, - 0x6c, 0x2e, 0x50, 0x6f, 0x6c, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x22, 0x00, 0x12, 0x5b, 0x0a, 0x09, - 0x47, 0x65, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x12, 0x27, 0x2e, 0x61, 0x72, 0x72, 0x6f, + 0x61, 0x74, 0x61, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, + 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, + 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, 0x22, 0x00, 0x28, 0x01, 0x30, 0x01, 0x12, 0x4c, 0x0a, 0x08, + 0x44, 0x6f, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, + 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, + 0x2e, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x1a, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, + 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, + 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x30, 0x01, 0x12, 0x52, 0x0a, 0x0b, 0x4c, 0x69, + 0x73, 0x74, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x1c, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, - 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, - 0x6f, 0x72, 0x1a, 0x23, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, - 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x53, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x4d, 0x0a, 0x05, 0x44, 0x6f, 0x47, - 0x65, 0x74, 0x12, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, - 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x54, 0x69, 0x63, 0x6b, 0x65, - 0x74, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x44, 0x61, 0x74, 0x61, 0x22, 0x00, 0x30, 0x01, 0x12, 0x52, 0x0a, 0x05, 0x44, 0x6f, 0x50, 0x75, - 0x74, 0x12, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 
0x74, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x44, 0x61, 0x74, 0x61, 0x1a, 0x20, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, - 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x50, 0x75, 0x74, - 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x28, 0x01, 0x30, 0x01, 0x12, 0x58, 0x0a, 0x0a, - 0x44, 0x6f, 0x45, 0x78, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x12, 0x21, 0x2e, 0x61, 0x72, 0x72, - 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, - 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, 0x1a, 0x21, 0x2e, - 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x44, 0x61, 0x74, 0x61, - 0x22, 0x00, 0x28, 0x01, 0x30, 0x01, 0x12, 0x4c, 0x0a, 0x08, 0x44, 0x6f, 0x41, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x12, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, - 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x41, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x1a, 0x1d, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, - 0x22, 0x00, 0x30, 0x01, 0x12, 0x52, 0x0a, 0x0b, 0x4c, 0x69, 0x73, 0x74, 0x41, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x73, 0x12, 0x1c, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, - 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x45, 0x6d, 0x70, 0x74, - 0x79, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x54, 0x79, 0x70, 0x65, 0x22, 0x00, 0x30, 0x01, 0x42, 0x71, 0x0a, 0x1c, 0x6f, 0x72, 0x67, 0x2e, - 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, - 0x67, 0x68, 0x74, 0x2e, 0x69, 0x6d, 0x70, 0x6c, 0x5a, 0x32, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, - 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2f, 0x61, 0x72, 0x72, 0x6f, - 0x77, 0x2f, 0x67, 0x6f, 0x2f, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2f, 0x66, 0x6c, 0x69, 0x67, 0x68, - 0x74, 0x2f, 0x67, 0x65, 0x6e, 0x2f, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0xaa, 0x02, 0x1c, 0x41, - 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x41, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x46, 0x6c, 0x69, 0x67, - 0x68, 0x74, 0x2e, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x33, + 0x6c, 0x2e, 0x45, 0x6d, 0x70, 0x74, 0x79, 0x1a, 0x21, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, + 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, + 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x79, 0x70, 0x65, 0x22, 0x00, 0x30, 0x01, 0x42, 0x71, + 0x0a, 0x1c, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x61, 0x72, 0x72, + 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x69, 0x6d, 0x70, 0x6c, 0x5a, 0x32, + 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x61, 0x70, 0x61, 0x63, 0x68, + 0x65, 0x2f, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2f, 0x67, 0x6f, 0x2f, 0x61, 0x72, 0x72, 0x6f, 0x77, + 0x2f, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2f, 0x67, 0x65, 0x6e, 0x2f, 0x66, 0x6c, 0x69, 0x67, + 0x68, 0x74, 0xaa, 0x02, 0x1c, 0x41, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x41, 0x72, 0x72, 0x6f, 
+ 0x77, 0x2e, 0x46, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, + 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/go/arrow/flight/gen/flight/FlightSql.pb.go b/go/arrow/flight/gen/flight/FlightSql.pb.go index 95faa719a9488..494bf8bcca115 100644 --- a/go/arrow/flight/gen/flight/FlightSql.pb.go +++ b/go/arrow/flight/gen/flight/FlightSql.pb.go @@ -18,7 +18,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.28.1 -// protoc v3.21.12 +// protoc v4.23.4 // source: FlightSql.proto package flight @@ -48,27 +48,33 @@ const ( SqlInfo_FLIGHT_SQL_SERVER_VERSION SqlInfo = 1 // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. SqlInfo_FLIGHT_SQL_SERVER_ARROW_VERSION SqlInfo = 2 + // // Retrieves a boolean value indicating whether the Flight SQL Server is read only. // // Returns: // - false: if read-write // - true: if read only SqlInfo_FLIGHT_SQL_SERVER_READ_ONLY SqlInfo = 3 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports executing // SQL queries. // // Note that the absence of this info (as opposed to a false value) does not necessarily // mean that SQL is not supported, as this property was not originally defined. SqlInfo_FLIGHT_SQL_SERVER_SQL SqlInfo = 4 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports executing // Substrait plans. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT SqlInfo = 5 + // // Retrieves a string value indicating the minimum supported Substrait version, or null // if Substrait is not supported. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION SqlInfo = 6 + // // Retrieves a string value indicating the maximum supported Substrait version, or null // if Substrait is not supported. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION SqlInfo = 7 + // // Retrieves an int32 indicating whether the Flight SQL Server supports the // BeginTransaction/EndTransaction/BeginSavepoint/EndSavepoint actions. // @@ -78,51 +84,61 @@ const ( // // The possible values are listed in `SqlSupportedTransaction`. SqlInfo_FLIGHT_SQL_SERVER_TRANSACTION SqlInfo = 8 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports explicit // query cancellation (the CancelQuery action). SqlInfo_FLIGHT_SQL_SERVER_CANCEL SqlInfo = 9 + // // Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. // // If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. SqlInfo_FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT SqlInfo = 100 + // // Retrieves an int32 indicating the timeout (in milliseconds) for transactions, since transactions are not tied to a connection. // // If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. SqlInfo_FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT SqlInfo = 101 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. // // Returns: // - false: if it doesn't support CREATE and DROP of catalogs. // - true: if it supports CREATE and DROP of catalogs. SqlInfo_SQL_DDL_CATALOG SqlInfo = 500 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. // // Returns: // - false: if it doesn't support CREATE and DROP of schemas. // - true: if it supports CREATE and DROP of schemas. 
SqlInfo_SQL_DDL_SCHEMA SqlInfo = 501 + // // Indicates whether the Flight SQL Server supports CREATE and DROP of tables. // // Returns: // - false: if it doesn't support CREATE and DROP of tables. // - true: if it supports CREATE and DROP of tables. SqlInfo_SQL_DDL_TABLE SqlInfo = 502 + // // Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. // // The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlInfo_SQL_IDENTIFIER_CASE SqlInfo = 503 // Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. SqlInfo_SQL_IDENTIFIER_QUOTE_CHAR SqlInfo = 504 + // // Retrieves a int32 describing the case sensitivity of quoted identifiers. // // The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlInfo_SQL_QUOTED_IDENTIFIER_CASE SqlInfo = 505 + // // Retrieves a boolean value indicating whether all tables are selectable. // // Returns: // - false: if not all tables are selectable or if none are; // - true: if all tables are selectable. SqlInfo_SQL_ALL_TABLES_ARE_SELECTABLE SqlInfo = 506 + // // Retrieves the null ordering. // // Returns a int32 ordinal for the null ordering being used, as described in @@ -138,15 +154,18 @@ const ( SqlInfo_SQL_SYSTEM_FUNCTIONS SqlInfo = 511 // Retrieves a UTF-8 string list with values of the supported datetime functions. SqlInfo_SQL_DATETIME_FUNCTIONS SqlInfo = 512 + // // Retrieves the UTF-8 string that can be used to escape wildcard characters. // This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern // (and therefore use one of the wildcard characters). // The '_' character represents any single character; the '%' character represents any sequence of zero or more // characters. SqlInfo_SQL_SEARCH_STRING_ESCAPE SqlInfo = 513 + // // Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names // (those beyond a-z, A-Z, 0-9 and _). SqlInfo_SQL_EXTRA_NAME_CHARACTERS SqlInfo = 514 + // // Retrieves a boolean value indicating whether column aliasing is supported. // If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns // as required. @@ -155,6 +174,7 @@ const ( // - false: if column aliasing is unsupported; // - true: if column aliasing is supported. SqlInfo_SQL_SUPPORTS_COLUMN_ALIASING SqlInfo = 515 + // // Retrieves a boolean value indicating whether concatenations between null and non-null values being // null are supported. // @@ -162,11 +182,13 @@ const ( // - false: if concatenations between null and non-null values being null are unsupported; // - true: if concatenations between null and non-null values being null are supported. SqlInfo_SQL_NULL_PLUS_NULL_IS_NULL SqlInfo = 516 + // // Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, // indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on // SqlSupportsConvert enum. // The returned map will be: map> SqlInfo_SQL_SUPPORTS_CONVERT SqlInfo = 517 + // // Retrieves a boolean value indicating whether, when table correlation names are supported, // they are restricted to being different from the names of the tables. // @@ -174,6 +196,7 @@ const ( // - false: if table correlation names are unsupported; // - true: if table correlation names are supported. 
SqlInfo_SQL_SUPPORTS_TABLE_CORRELATION_NAMES SqlInfo = 518 + // // Retrieves a boolean value indicating whether, when table correlation names are supported, // they are restricted to being different from the names of the tables. // @@ -181,12 +204,14 @@ const ( // - false: if different table correlation names are unsupported; // - true: if different table correlation names are supported. SqlInfo_SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES SqlInfo = 519 + // // Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. // // Returns: // - false: if expressions in ORDER BY are unsupported; // - true: if expressions in ORDER BY are supported; SqlInfo_SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY SqlInfo = 520 + // // Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY // clause is supported. // @@ -194,6 +219,7 @@ const ( // - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; // - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. SqlInfo_SQL_SUPPORTS_ORDER_BY_UNRELATED SqlInfo = 521 + // // Retrieves the supported GROUP BY commands; // // Returns an int32 bitmask value representing the supported commands. @@ -206,18 +232,21 @@ const ( // - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. // Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlInfo_SQL_SUPPORTED_GROUP_BY SqlInfo = 522 + // // Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. // // Returns: // - false: if specifying a LIKE escape clause is unsupported; // - true: if specifying a LIKE escape clause is supported. SqlInfo_SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE SqlInfo = 523 + // // Retrieves a boolean value indicating whether columns may be defined as non-nullable. // // Returns: // - false: if columns cannot be defined as non-nullable; // - true: if columns may be defined as non-nullable. SqlInfo_SQL_SUPPORTS_NON_NULLABLE_COLUMNS SqlInfo = 524 + // // Retrieves the supported SQL grammar level as per the ODBC specification. // // Returns an int32 bitmask value representing the supported SQL grammar level. @@ -234,6 +263,7 @@ const ( // - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. // Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlInfo_SQL_SUPPORTED_GRAMMAR SqlInfo = 525 + // // Retrieves the supported ANSI92 SQL grammar level. // // Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. @@ -250,12 +280,14 @@ const ( // - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. // Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlInfo_SQL_ANSI92_SUPPORTED_LEVEL SqlInfo = 526 + // // Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. // // Returns: // - false: if the SQL Integrity Enhancement Facility is not supported; // - true: if the SQL Integrity Enhancement Facility is supported. SqlInfo_SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY SqlInfo = 527 + // // Retrieves the support level for SQL OUTER JOINs. // // Returns a int32 ordinal for the SQL ordering being used, as described in @@ -265,14 +297,17 @@ const ( SqlInfo_SQL_SCHEMA_TERM SqlInfo = 529 // Retrieves a UTF-8 string with the preferred term for "procedure". 
SqlInfo_SQL_PROCEDURE_TERM SqlInfo = 530 + // // Retrieves a UTF-8 string with the preferred term for "catalog". // If an empty string is returned, it is assumed that the server does NOT support catalogs. SqlInfo_SQL_CATALOG_TERM SqlInfo = 531 + // // Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. // // - false: if a catalog does not appear at the start of a fully qualified table name; // - true: if a catalog appears at the start of a fully qualified table name. SqlInfo_SQL_CATALOG_AT_START SqlInfo = 532 + // // Retrieves the supported actions for a SQL schema. // // Returns an int32 bitmask value representing the supported actions for a SQL schema. @@ -289,6 +324,7 @@ const ( // - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. // Valid actions for a SQL schema are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlInfo_SQL_SCHEMAS_SUPPORTED_ACTIONS SqlInfo = 533 + // // Retrieves the supported actions for a SQL catalog. // // Returns an int32 bitmask value representing the supported actions for a SQL catalog. @@ -305,6 +341,7 @@ const ( // - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. // Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlInfo_SQL_CATALOGS_SUPPORTED_ACTIONS SqlInfo = 534 + // // Retrieves the supported SQL positioned commands. // // Returns an int32 bitmask value representing the supported SQL positioned commands. @@ -317,12 +354,14 @@ const ( // - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. // Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. SqlInfo_SQL_SUPPORTED_POSITIONED_COMMANDS SqlInfo = 535 + // // Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. // // Returns: // - false: if SELECT FOR UPDATE statements are unsupported; // - true: if SELECT FOR UPDATE statements are supported. SqlInfo_SQL_SELECT_FOR_UPDATE_SUPPORTED SqlInfo = 536 + // // Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax // are supported. // @@ -330,6 +369,7 @@ const ( // - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; // - true: if stored procedure calls that use the stored procedure escape syntax are supported. SqlInfo_SQL_STORED_PROCEDURES_SUPPORTED SqlInfo = 537 + // // Retrieves the supported SQL subqueries. // // Returns an int32 bitmask value representing the supported SQL subqueries. @@ -355,12 +395,14 @@ const ( // - ... // Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlInfo_SQL_SUPPORTED_SUBQUERIES SqlInfo = 538 + // // Retrieves a boolean value indicating whether correlated subqueries are supported. // // Returns: // - false: if correlated subqueries are unsupported; // - true: if correlated subqueries are supported. SqlInfo_SQL_CORRELATED_SUBQUERIES_SUPPORTED SqlInfo = 539 + // // Retrieves the supported SQL UNIONs. // // Returns an int32 bitmask value representing the supported SQL UNIONs. @@ -393,6 +435,7 @@ const ( SqlInfo_SQL_MAX_CONNECTIONS SqlInfo = 549 // Retrieves an int64 value representing the maximum number of characters allowed in a cursor name.
SqlInfo_SQL_MAX_CURSOR_NAME_LENGTH SqlInfo = 550 + // // Retrieves a int64 value representing the maximum number of bytes allowed for an index, // including all of the parts of the index. SqlInfo_SQL_MAX_INDEX_LENGTH SqlInfo = 551 @@ -404,15 +447,17 @@ const ( SqlInfo_SQL_MAX_CATALOG_NAME_LENGTH SqlInfo = 554 // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. SqlInfo_SQL_MAX_ROW_SIZE SqlInfo = 555 + // // Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL // data types LONGVARCHAR and LONGVARBINARY. // // Returns: - // - false: if return value for the JDBC method getMaxRowSize does - // not include the SQL data types LONGVARCHAR and LONGVARBINARY; - // - true: if return value for the JDBC method getMaxRowSize includes - // the SQL data types LONGVARCHAR and LONGVARBINARY. + // - false: if return value for the JDBC method getMaxRowSize does + // not include the SQL data types LONGVARCHAR and LONGVARBINARY; + // - true: if return value for the JDBC method getMaxRowSize includes + // the SQL data types LONGVARCHAR and LONGVARBINARY. SqlInfo_SQL_MAX_ROW_SIZE_INCLUDES_BLOBS SqlInfo = 556 + // // Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; // a result of 0 (zero) means that there is no limit or the limit is not known. SqlInfo_SQL_MAX_STATEMENT_LENGTH SqlInfo = 557 @@ -424,11 +469,13 @@ const ( SqlInfo_SQL_MAX_TABLES_IN_SELECT SqlInfo = 560 // Retrieves a int64 value representing the maximum number of characters allowed in a user name. SqlInfo_SQL_MAX_USERNAME_LENGTH SqlInfo = 561 + // // Retrieves this database's default transaction isolation level as described in // `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. // // Returns a int32 ordinal for the SQL transaction isolation level. SqlInfo_SQL_DEFAULT_TRANSACTION_ISOLATION SqlInfo = 562 + // // Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a // noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. // @@ -436,6 +483,7 @@ const ( // - false: if transactions are unsupported; // - true: if transactions are supported. SqlInfo_SQL_TRANSACTIONS_SUPPORTED SqlInfo = 563 + // // Retrieves the supported transactions isolation levels. // // Returns an int32 bitmask value representing the supported transactions isolation levels. @@ -462,6 +510,7 @@ const ( // - ... // Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. SqlInfo_SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS SqlInfo = 564 + // // Retrieves a boolean value indicating whether a data definition statement within a transaction forces // the transaction to commit. // @@ -469,12 +518,14 @@ const ( // - false: if a data definition statement within a transaction does not force the transaction to commit; // - true: if a data definition statement within a transaction forces the transaction to commit. SqlInfo_SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT SqlInfo = 565 + // // Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. // // Returns: // - false: if a data definition statement within a transaction is taken into account; // - true: a data definition statement within a transaction is ignored. 
SqlInfo_SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED SqlInfo = 566 + // // Retrieves an int32 bitmask value representing the supported result set types. // The returned bitmask should be parsed in order to retrieve the supported result set types. // @@ -491,6 +542,7 @@ const ( // - ... // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. SqlInfo_SQL_SUPPORTED_RESULT_SET_TYPES SqlInfo = 567 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. // @@ -505,6 +557,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED SqlInfo = 568 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. // @@ -519,6 +572,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY SqlInfo = 569 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. // @@ -533,6 +587,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE SqlInfo = 570 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. // @@ -547,29 +602,34 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE SqlInfo = 571 + // // Retrieves a boolean value indicating whether this database supports batch updates. // // - false: if this database does not support batch updates; // - true: if this database supports batch updates. SqlInfo_SQL_BATCH_UPDATES_SUPPORTED SqlInfo = 572 + // // Retrieves a boolean value indicating whether this database supports savepoints. // // Returns: // - false: if this database does not support savepoints; // - true: if this database supports savepoints. SqlInfo_SQL_SAVEPOINTS_SUPPORTED SqlInfo = 573 + // // Retrieves a boolean value indicating whether named parameters are supported in callable statements. // // Returns: // - false: if named parameters in callable statements are unsupported; // - true: if named parameters in callable statements are supported. SqlInfo_SQL_NAMED_PARAMETERS_SUPPORTED SqlInfo = 574 + // // Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. 
// // Returns: // - false: if updates made to a LOB are made directly to the LOB; // - true: if updates made to a LOB are made on a copy. SqlInfo_SQL_LOCATORS_UPDATE_COPY SqlInfo = 575 + // // Retrieves a boolean value indicating whether invoking user-defined or vendor functions // using the stored procedure escape syntax is supported. // @@ -1642,7 +1702,7 @@ func (SqlSupportsConvert) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{16} } -// * +//* // The JDBC/ODBC-defined type of any object. // All the values here are the same as in the JDBC and ODBC specs. type XdbcDataType int32 @@ -1757,7 +1817,7 @@ func (XdbcDataType) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{17} } -// * +//* // Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. type XdbcDatetimeSubcode int32 @@ -1898,13 +1958,13 @@ func (XdbcDatetimeSubcode) EnumDescriptor() ([]byte, []int) { type Nullable int32 const ( - // * + //* // Indicates that the field does not allow the use of null values. Nullable_NULLABILITY_NO_NULLS Nullable = 0 - // * + //* // Indicates that the field allows the use of null values. Nullable_NULLABILITY_NULLABLE Nullable = 1 - // * + //* // Indicates that the nullability of the field cannot be determined. Nullable_NULLABILITY_UNKNOWN Nullable = 2 ) @@ -1953,21 +2013,21 @@ func (Nullable) EnumDescriptor() ([]byte, []int) { type Searchable int32 const ( - // * + //* // Indicates that the column cannot be used in a WHERE clause. Searchable_SEARCHABLE_NONE Searchable = 0 - // * + //* // Indicates that the column can be used in a WHERE clause if it is using a // LIKE operator. Searchable_SEARCHABLE_CHAR Searchable = 1 - // * + //* // Indicates that the column can be used in a WHERE clause with any // operator other than LIKE. // - // - Allowed operators: comparison, quantified comparison, BETWEEN, - // DISTINCT, IN, MATCH, and UNIQUE. + // - Allowed operators: comparison, quantified comparison, BETWEEN, + // DISTINCT, IN, MATCH, and UNIQUE. Searchable_SEARCHABLE_BASIC Searchable = 2 - // * + //* // Indicates that the column can be used in a WHERE clause using any operator. Searchable_SEARCHABLE_FULL Searchable = 3 ) @@ -2233,23 +2293,22 @@ func (ActionCancelQueryResult_CancelResult) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{29, 0} } +// // Represents a metadata request. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the metadata request. // // The returned Arrow schema will be: // < -// -// info_name: uint32 not null, -// value: dense_union< -// string_value: utf8, -// bool_value: bool, -// bigint_value: int64, -// int32_bitmask: int32, -// string_list: list<utf8> -// int32_to_int32_list_map: map<int32, list<int32>> -// +// info_name: uint32 not null, +// value: dense_union< +// string_value: utf8, +// bool_value: bool, +// bigint_value: int64, +// int32_bitmask: int32, +// string_list: list<utf8> +// int32_to_int32_list_map: map<int32, list<int32>> // > // where there is one row per requested piece of metadata information. type CommandGetSqlInfo struct { @@ -2257,6 +2316,7 @@ type CommandGetSqlInfo struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Values are modelled after ODBC's SQLGetInfo() function.
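A sketch of issuing this metadata request through the flightsql client wrapper that ships alongside this generated code: GetSqlInfo plans the request via GetFlightInfo, and DoGet on each endpoint ticket streams the info_name/value rows. The module version in the import paths and the SqlInfoFlightSqlServerName constant are assumptions; adjust them to your checkout:

package main

import (
	"context"
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/flight/flightsql"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	ctx := context.Background()
	// Auth handler and middleware omitted for brevity.
	cl, err := flightsql.NewClient("localhost:32010", nil, nil,
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		panic(err)
	}
	defer cl.Close()

	info, err := cl.GetSqlInfo(ctx, []flightsql.SqlInfo{flightsql.SqlInfoFlightSqlServerName})
	if err != nil {
		panic(err)
	}
	rdr, err := cl.DoGet(ctx, info.Endpoint[0].Ticket)
	if err != nil {
		panic(err)
	}
	defer rdr.Release()
	for rdr.Next() {
		// One row per requested info; the value column is the dense_union
		// described in the schema comment above.
		fmt.Println(rdr.Record())
	}
}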
This information is intended to provide // Flight SQL clients with basic, SQL syntax and SQL functions related information. // More information types can be added in future releases. @@ -2316,62 +2376,61 @@ func (x *CommandGetSqlInfo) GetInfo() []uint32 { return nil } +// // Represents a request to retrieve information about data type supported on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned schema will be: // < -// -// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), -// data_type: int32 not null (The SQL data type), -// column_size: int32 (The maximum size supported by that column. -// In case of exact numeric types, this represents the maximum precision. -// In case of string types, this represents the character length. -// In case of datetime data types, this represents the length in characters of the string representation. -// NULL is returned for data types where column size is not applicable.), -// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for -// data types where a literal prefix is not applicable.), -// literal_suffix: utf8 (Character or characters used to terminate a literal, -// NULL is returned for data types where a literal suffix is not applicable.), -// create_params: list -// (A list of keywords corresponding to which parameters can be used when creating -// a column for that specific type. -// NULL is returned if there are no parameters for the data type definition.), -// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the -// Nullable enum.), -// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), -// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the -// Searchable enum.), -// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is -// not applicable to the data type or the data type is not numeric.), -// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), -// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute -// is not applicable to the data type or the data type is not numeric.), -// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL -// is returned if a localized name is not supported by the data source), -// minimum_scale: int32 (The minimum scale of the data type on the data source. -// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE -// columns both contain this value. NULL is returned if scale is not applicable.), -// maximum_scale: int32 (The maximum scale of the data type on the data source. -// NULL is returned if scale is not applicable.), -// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values -// as data_type value. Except for interval and datetime, which -// uses generic values. More info about those types can be -// obtained through datetime_subcode. 
The possible values can be seen -// in the XdbcDataType enum.), -// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains -// its sub types. For type different from interval and datetime, this value -// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), -// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains -// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For -// exact numeric types, this column contains the value 10 to indicate that -// column size specifies a number of decimal digits. Otherwise, this column is NULL.), -// interval_precision: int32 (If the data type is an interval data type, then this column contains the value -// of the interval leading precision. Otherwise, this column is NULL. This fields -// is only relevant to be used by ODBC). -// +// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), +// data_type: int32 not null (The SQL data type), +// column_size: int32 (The maximum size supported by that column. +// In case of exact numeric types, this represents the maximum precision. +// In case of string types, this represents the character length. +// In case of datetime data types, this represents the length in characters of the string representation. +// NULL is returned for data types where column size is not applicable.), +// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for +// data types where a literal prefix is not applicable.), +// literal_suffix: utf8 (Character or characters used to terminate a literal, +// NULL is returned for data types where a literal suffix is not applicable.), +// create_params: list +// (A list of keywords corresponding to which parameters can be used when creating +// a column for that specific type. +// NULL is returned if there are no parameters for the data type definition.), +// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the +// Nullable enum.), +// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), +// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the +// Searchable enum.), +// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is +// not applicable to the data type or the data type is not numeric.), +// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), +// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute +// is not applicable to the data type or the data type is not numeric.), +// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL +// is returned if a localized name is not supported by the data source), +// minimum_scale: int32 (The minimum scale of the data type on the data source. +// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE +// columns both contain this value. NULL is returned if scale is not applicable.), +// maximum_scale: int32 (The maximum scale of the data type on the data source. +// NULL is returned if scale is not applicable.), +// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values +// as data_type value. Except for interval and datetime, which +// uses generic values. 
More info about those types can be +// obtained through datetime_subcode. The possible values can be seen +// in the XdbcDataType enum.), +// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains +// its sub types. For type different from interval and datetime, this value +// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), +// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains +// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For +// exact numeric types, this column contains the value 10 to indicate that +// column size specifies a number of decimal digits. Otherwise, this column is NULL.), +// interval_precision: int32 (If the data type is an interval data type, then this column contains the value +// of the interval leading precision. Otherwise, this column is NULL. This fields +// is only relevant to be used by ODBC). // > // The returned data should be ordered by data_type and then by type_name. type CommandGetXdbcTypeInfo struct { @@ -2379,6 +2438,7 @@ type CommandGetXdbcTypeInfo struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the data type to search for the info. DataType *int32 `protobuf:"varint,1,opt,name=data_type,json=dataType,proto3,oneof" json:"data_type,omitempty"` } @@ -2422,17 +2482,16 @@ func (x *CommandGetXdbcTypeInfo) GetDataType() int32 { return 0 } +// // Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. // The definition of a catalog depends on vendor/implementation. It is usually the database itself // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8 not null -// +// catalog_name: utf8 not null // > // The returned data should be ordered by catalog_name. type CommandGetCatalogs struct { @@ -2473,18 +2532,17 @@ func (*CommandGetCatalogs) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{2} } +// // Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. // The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8 not null -// +// catalog_name: utf8, +// db_schema_name: utf8 not null // > // The returned data should be ordered by catalog_name, then db_schema_name. type CommandGetDbSchemas struct { @@ -2492,15 +2550,17 @@ type CommandGetDbSchemas struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the Catalog to search for the tables. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. 
Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies a filter pattern for schemas to search for. // When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. DbSchemaFilterPattern *string `protobuf:"bytes,2,opt,name=db_schema_filter_pattern,json=dbSchemaFilterPattern,proto3,oneof" json:"db_schema_filter_pattern,omitempty"` } @@ -2550,56 +2610,58 @@ func (x *CommandGetDbSchemas) GetDbSchemaFilterPattern() string { return "" } +// // Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8, -// table_name: utf8 not null, -// table_type: utf8 not null, -// [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, -// it is serialized as an IPC message.) -// +// catalog_name: utf8, +// db_schema_name: utf8, +// table_name: utf8 not null, +// table_type: utf8 not null, +// [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, +// it is serialized as an IPC message.) // > // Fields on table_schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. 
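The "%" and "_" wildcard semantics described for db_schema_filter_pattern and table_name_filter_pattern can be made concrete with a dependency-free sketch that compiles such a pattern into an anchored regexp. Escape handling (see SQL_SEARCH_STRING_ESCAPE earlier in this file) is deliberately left out:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// patternToRegexp converts a Flight SQL filter pattern, where '%' matches
// any sequence of zero or more characters and '_' matches exactly one,
// into an anchored regular expression.
func patternToRegexp(pattern string) *regexp.Regexp {
	var b strings.Builder
	b.WriteString("(?s)^")
	for _, r := range pattern {
		switch r {
		case '%':
			b.WriteString(".*")
		case '_':
			b.WriteString(".")
		default:
			b.WriteString(regexp.QuoteMeta(string(r)))
		}
	}
	b.WriteString("$")
	return regexp.MustCompile(b.String())
}

func main() {
	re := patternToRegexp("pub%")
	fmt.Println(re.MatchString("public"))  // true
	fmt.Println(re.MatchString("private")) // false
}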
// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. type CommandGetTables struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the Catalog to search for the tables. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies a filter pattern for schemas to search for. // When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. DbSchemaFilterPattern *string `protobuf:"bytes,2,opt,name=db_schema_filter_pattern,json=dbSchemaFilterPattern,proto3,oneof" json:"db_schema_filter_pattern,omitempty"` + // // Specifies a filter pattern for tables to search for. // When no table_name_filter_pattern is provided, all tables matching other filters are searched. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. TableNameFilterPattern *string `protobuf:"bytes,3,opt,name=table_name_filter_pattern,json=tableNameFilterPattern,proto3,oneof" json:"table_name_filter_pattern,omitempty"` + // // Specifies a filter of table types which must match. // The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. // TABLE, VIEW, and SYSTEM TABLE are commonly supported. @@ -2675,18 +2737,17 @@ func (x *CommandGetTables) GetIncludeSchema() bool { return false } +// // Represents a request to retrieve the list of table types on a Flight SQL enabled backend. // The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. // TABLE, VIEW, and SYSTEM TABLE are commonly supported. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// table_type: utf8 not null -// +// table_type: utf8 not null // > // The returned data should be ordered by table_type. type CommandGetTableTypes struct { @@ -2727,21 +2788,20 @@ func (*CommandGetTableTypes) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{5} } +// // Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. 
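A sketch of driving CommandGetTables with include_schema=true and decoding the IPC-serialized table_schema column back into Arrow schemas. The GetTablesOpts field names and the flight.DeserializeSchema helper are assumed from the Go flight/flightsql wrappers and may differ between versions:

package main

import (
	"context"
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/array"
	"github.com/apache/arrow/go/v14/arrow/flight"
	"github.com/apache/arrow/go/v14/arrow/flight/flightsql"
	"github.com/apache/arrow/go/v14/arrow/memory"
)

// listTables lists TABLEs and VIEWs, printing each table's name and schema.
func listTables(ctx context.Context, cl *flightsql.Client) error {
	info, err := cl.GetTables(ctx, &flightsql.GetTablesOpts{
		TableTypes:    []string{"TABLE", "VIEW"},
		IncludeSchema: true,
	})
	if err != nil {
		return err
	}
	rdr, err := cl.DoGet(ctx, info.Endpoint[0].Ticket)
	if err != nil {
		return err
	}
	defer rdr.Release()
	for rdr.Next() {
		rec := rdr.Record()
		names := rec.Column(2).(*array.String)   // table_name: utf8 not null
		schemas := rec.Column(4).(*array.Binary) // table_schema: serialized IPC message
		for i := 0; i < int(rec.NumRows()); i++ {
			sc, err := flight.DeserializeSchema(schemas.Value(i), memory.DefaultAllocator)
			if err != nil {
				return err
			}
			fmt.Println(names.Value(i), sc)
		}
	}
	return rdr.Err()
}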
// // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8, -// table_name: utf8 not null, -// column_name: utf8 not null, -// key_name: utf8, -// key_sequence: int32 not null -// +// catalog_name: utf8, +// db_schema_name: utf8, +// table_name: utf8 not null, +// column_name: utf8 not null, +// key_name: utf8, +// key_sequence: int32 not null // > // The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. type CommandGetPrimaryKeys struct { @@ -2749,10 +2809,12 @@ type CommandGetPrimaryKeys struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. @@ -2814,29 +2876,28 @@ func (x *CommandGetPrimaryKeys) GetTable() string { return "" } +// // Represents a request to retrieve a description of the foreign key columns that reference the given table's // primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. @@ -2845,10 +2906,12 @@ type CommandGetExportedKeys struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the foreign key table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the foreign key table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. 
@@ -2910,45 +2973,46 @@ func (x *CommandGetExportedKeys) GetTable() string { return "" } +// // Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions: -// - 0 = CASCADE -// - 1 = RESTRICT -// - 2 = SET NULL -// - 3 = NO ACTION -// - 4 = SET DEFAULT +// - 0 = CASCADE +// - 1 = RESTRICT +// - 2 = SET NULL +// - 3 = NO ACTION +// - 4 = SET DEFAULT type CommandGetImportedKeys struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the primary key table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the primary key table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. @@ -3010,67 +3074,66 @@ func (x *CommandGetImportedKeys) GetTable() string { return "" } +// // Represents a request to retrieve a description of the foreign key columns in the given foreign key table that // reference the primary key or the columns representing a unique constraint of the parent table (could be the same // or a different table) on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. 
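The update_rule/delete_rule bytes enumerated above map directly onto referential actions; a small lookup table (values copied verbatim from the comment) makes reading those columns explicit:

package main

import "fmt"

// Referential actions for the update_rule and delete_rule columns of
// CommandGetImportedKeys / CommandGetExportedKeys results.
var refActions = map[uint8]string{
	0: "CASCADE",
	1: "RESTRICT",
	2: "SET NULL",
	3: "NO ACTION",
	4: "SET DEFAULT",
}

func main() {
	var deleteRule uint8 = 3 // as read from the delete_rule column
	fmt.Println(refActions[deleteRule]) // NO ACTION
}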
// // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions: -// - 0 = CASCADE -// - 1 = RESTRICT -// - 2 = SET NULL -// - 3 = NO ACTION -// - 4 = SET DEFAULT +// - 0 = CASCADE +// - 1 = RESTRICT +// - 2 = SET NULL +// - 3 = NO ACTION +// - 4 = SET DEFAULT type CommandGetCrossReference struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - // * + //* // The catalog name where the parent table is. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. PkCatalog *string `protobuf:"bytes,1,opt,name=pk_catalog,json=pkCatalog,proto3,oneof" json:"pk_catalog,omitempty"` - // * + //* // The Schema name where the parent table is. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. PkDbSchema *string `protobuf:"bytes,2,opt,name=pk_db_schema,json=pkDbSchema,proto3,oneof" json:"pk_db_schema,omitempty"` - // * + //* // The parent table name. It cannot be null. PkTable string `protobuf:"bytes,3,opt,name=pk_table,json=pkTable,proto3" json:"pk_table,omitempty"` - // * + //* // The catalog name where the foreign table is. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. FkCatalog *string `protobuf:"bytes,4,opt,name=fk_catalog,json=fkCatalog,proto3,oneof" json:"fk_catalog,omitempty"` - // * + //* // The schema name where the foreign table is. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. FkDbSchema *string `protobuf:"bytes,5,opt,name=fk_db_schema,json=fkDbSchema,proto3,oneof" json:"fk_db_schema,omitempty"` - // * + //* // The foreign table name. It cannot be null. FkTable string `protobuf:"bytes,6,opt,name=fk_table,json=fkTable,proto3" json:"fk_table,omitempty"` } @@ -3149,6 +3212,7 @@ func (x *CommandGetCrossReference) GetFkTable() string { return "" } +// // Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. type ActionCreatePreparedStatementRequest struct { state protoimpl.MessageState @@ -3208,6 +3272,7 @@ func (x *ActionCreatePreparedStatementRequest) GetTransactionId() []byte { return nil } +// // An embedded message describing a Substrait plan to execute. 
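The CreatePreparedStatement action introduced here pairs with ClosePreparedStatement into a create/execute/close lifecycle. A hedged sketch against the flightsql wrapper; the Prepare, Execute, and Close helpers are assumed to carry these names in recent versions of the wrapper, and the query text is illustrative:

package main

import (
	"context"

	"github.com/apache/arrow/go/v14/arrow/flight/flightsql"
)

// runPrepared sketches the CreatePreparedStatement -> execute ->
// ClosePreparedStatement round trip defined by the actions above.
func runPrepared(ctx context.Context, cl *flightsql.Client) error {
	stmt, err := cl.Prepare(ctx, "SELECT * FROM intTable WHERE id = 1")
	if err != nil {
		return err
	}
	// Close sends ActionClosePreparedStatementRequest so the server can
	// release the resources tied to the opaque prepared statement handle.
	defer stmt.Close(ctx)

	// Execute runs GetFlightInfo with CommandPreparedStatementQuery;
	// DoGet on the returned endpoint tickets streams the result set.
	info, err := stmt.Execute(ctx)
	if err != nil {
		return err
	}
	_ = info
	return nil
}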
type SubstraitPlan struct { state protoimpl.MessageState @@ -3271,6 +3336,7 @@ func (x *SubstraitPlan) GetVersion() string { return "" } +// // Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. type ActionCreatePreparedSubstraitPlanRequest struct { state protoimpl.MessageState @@ -3330,6 +3396,7 @@ func (x *ActionCreatePreparedSubstraitPlanRequest) GetTransactionId() []byte { return nil } +// // Wrap the result of a "CreatePreparedStatement" or "CreatePreparedSubstraitPlan" action. // // The resultant PreparedStatement can be closed either: @@ -3405,6 +3472,7 @@ func (x *ActionCreatePreparedStatementResult) GetParameterSchema() []byte { return nil } +// // Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. // Closes server resources associated with the prepared statement handle. type ActionClosePreparedStatementRequest struct { @@ -3455,6 +3523,7 @@ func (x *ActionClosePreparedStatementRequest) GetPreparedStatementHandle() []byt return nil } +// // Request message for the "BeginTransaction" action. // Begins a transaction. type ActionBeginTransactionRequest struct { @@ -3495,6 +3564,7 @@ func (*ActionBeginTransactionRequest) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{15} } +// // Request message for the "BeginSavepoint" action. // Creates a savepoint within a transaction. // @@ -3557,6 +3627,7 @@ func (x *ActionBeginSavepointRequest) GetName() string { return "" } +// // The result of a "BeginTransaction" action. // // The transaction can be manipulated with the "EndTransaction" action, or @@ -3612,6 +3683,7 @@ func (x *ActionBeginTransactionResult) GetTransactionId() []byte { return nil } +// // The result of a "BeginSavepoint" action. // // The transaction can be manipulated with the "EndSavepoint" action. @@ -3667,6 +3739,7 @@ func (x *ActionBeginSavepointResult) GetSavepointId() []byte { return nil } +// // Request message for the "EndTransaction" action. // // Commit (COMMIT) or rollback (ROLLBACK) the transaction. @@ -3730,6 +3803,7 @@ func (x *ActionEndTransactionRequest) GetAction() ActionEndTransactionRequest_En return ActionEndTransactionRequest_END_TRANSACTION_UNSPECIFIED } +// // Request message for the "EndSavepoint" action. // // Release (RELEASE) the savepoint or rollback (ROLLBACK) to the @@ -3795,21 +3869,22 @@ func (x *ActionEndSavepointRequest) GetAction() ActionEndSavepointRequest_EndSav return ActionEndSavepointRequest_END_SAVEPOINT_UNSPECIFIED } +// // Represents a SQL query. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. 
-// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - GetFlightInfo: execute the query. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - GetFlightInfo: execute the query. type CommandStatementQuery struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -3867,22 +3942,23 @@ func (x *CommandStatementQuery) GetTransactionId() []byte { return nil } +// // Represents a Substrait plan. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - GetFlightInfo: execute the query. -// - DoPut: execute the query. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - GetFlightInfo: execute the query. +// - DoPut: execute the query. 
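Putting CommandStatementQuery together, a sketch of the plan-then-fetch flow: the wrapper's Execute helper (an assumption, like the import path) performs GetFlightInfo, and each endpoint's opaque ticket is handed to DoGet exactly as the comments above prescribe:

package main

import (
	"context"
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/flight/flightsql"
)

// query executes an ad-hoc statement and prints every record batch from
// every endpoint in the returned FlightInfo.
func query(ctx context.Context, cl *flightsql.Client) error {
	info, err := cl.Execute(ctx, "SELECT id, name FROM intTable")
	if err != nil {
		return err
	}
	for _, ep := range info.Endpoint {
		rdr, err := cl.DoGet(ctx, ep.Ticket)
		if err != nil {
			return err
		}
		for rdr.Next() {
			fmt.Println(rdr.Record())
		}
		rdr.Release()
	}
	return nil
}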
type CommandStatementSubstraitPlan struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -3940,7 +4016,7 @@ func (x *CommandStatementSubstraitPlan) GetTransactionId() []byte { return nil } -// * +//* // Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. // This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. type TicketStatementQuery struct { @@ -3991,22 +4067,23 @@ func (x *TicketStatementQuery) GetStatementHandle() []byte { return nil } +// // Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for // the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. -// - GetFlightInfo: execute the prepared statement instance. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. +// - GetFlightInfo: execute the prepared statement instance. type CommandPreparedStatementQuery struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -4055,6 +4132,7 @@ func (x *CommandPreparedStatementQuery) GetPreparedStatementHandle() []byte { return nil } +// // Represents a SQL update query. Used in the command member of FlightDescriptor // for the the RPC call DoPut to cause the server to execute the included SQL update. type CommandStatementUpdate struct { @@ -4114,6 +4192,7 @@ func (x *CommandStatementUpdate) GetTransactionId() []byte { return nil } +// // Represents a SQL update query. 
Used in the command member of FlightDescriptor // for the the RPC call DoPut to cause the server to execute the included // prepared statement handle as an update. @@ -4165,6 +4244,7 @@ func (x *CommandPreparedStatementUpdate) GetPreparedStatementHandle() []byte { return nil } +// // Returned from the RPC call DoPut when a CommandStatementUpdate // CommandPreparedStatementUpdate was in the request, containing // results from the update. @@ -4217,6 +4297,7 @@ func (x *DoPutUpdateResult) GetRecordCount() int64 { return 0 } +// // Request message for the "CancelQuery" action. // // Explicitly cancel a running query. @@ -4285,6 +4366,7 @@ func (x *ActionCancelQueryRequest) GetInfo() []byte { return nil } +// // The result of cancelling a query. // // The result should be wrapped in a google.protobuf.Any message. diff --git a/go/arrow/flight/gen/flight/Flight_grpc.pb.go b/go/arrow/flight/gen/flight/Flight_grpc.pb.go index 150de71a1113b..87d9abc5926eb 100644 --- a/go/arrow/flight/gen/flight/Flight_grpc.pb.go +++ b/go/arrow/flight/gen/flight/Flight_grpc.pb.go @@ -1,4 +1,8 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v4.23.4 +// source: Flight.proto package flight @@ -11,17 +15,20 @@ import ( // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // FlightServiceClient is the client API for FlightService service. // // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. type FlightServiceClient interface { + // // Handshake between client and server. Depending on the server, the // handshake may be required to determine the token that should be used for // future operations. Both request and response are streams to allow multiple // round-trips depending on auth mechanism. Handshake(ctx context.Context, opts ...grpc.CallOption) (FlightService_HandshakeClient, error) + // // Get a list of available streams given a particular criteria. Most flight // services will expose one or more streams that are readily available for // retrieval. This api allows listing the streams available for @@ -29,6 +36,7 @@ type FlightServiceClient interface { // the subset of streams that can be listed via this interface. Each flight // service allows its own definition of how to consume criteria. ListFlights(ctx context.Context, in *Criteria, opts ...grpc.CallOption) (FlightService_ListFlightsClient, error) + // // For a given FlightDescriptor, get information about how the flight can be // consumed. This is a useful interface if the consumer of the interface // already can identify the specific flight to consume. This interface can @@ -40,6 +48,7 @@ type FlightServiceClient interface { // available for consumption for the duration defined by the specific flight // service. GetFlightInfo(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*FlightInfo, error) + // // For a given FlightDescriptor, start a query and get information // to poll its execution status. This is a useful interface if the // query may be a long-running query. The first PollFlightInfo call @@ -63,16 +72,19 @@ type FlightServiceClient interface { // A client may use the CancelFlightInfo action with // PollInfo.info to cancel the running query. 
PollFlightInfo(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*PollInfo, error) + // // For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema // This is used when a consumer needs the Schema of flight stream. Similar to // GetFlightInfo this interface may generate a new flight that was not previously // available in ListFlights. GetSchema(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*SchemaResult, error) + // // Retrieve a single stream associated with a particular descriptor // associated with the referenced ticket. A Flight can be composed of one or // more streams where each stream can be retrieved using a separate opaque // ticket that the flight service uses for managing a collection of streams. DoGet(ctx context.Context, in *Ticket, opts ...grpc.CallOption) (FlightService_DoGetClient, error) + // // Push a stream to the flight service associated with a particular // flight stream. This allows a client of a flight service to upload a stream // of data. Depending on the particular flight service, a client consumer @@ -80,12 +92,14 @@ type FlightServiceClient interface { // number. In the latter, the service might implement a 'seal' action that // can be applied to a descriptor once all streams are uploaded. DoPut(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoPutClient, error) + // // Open a bidirectional data channel for a given descriptor. This // allows clients to send and receive arbitrary Arrow data and // application-specific metadata in a single logical stream. In // contrast to DoGet/DoPut, this is more suited for clients // offloading computation (rather than storage) to a Flight service. DoExchange(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoExchangeClient, error) + // // Flight services can support an arbitrary number of simple actions in // addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut // operations that are potentially available. DoAction allows a flight client @@ -93,6 +107,7 @@ type FlightServiceClient interface { // opaque request and response objects that are specific to the type action // being undertaken. DoAction(ctx context.Context, in *Action, opts ...grpc.CallOption) (FlightService_DoActionClient, error) + // // A flight service exposes all of the available action types that it has // along with descriptions. This allows different flight consumers to // understand the capabilities of the flight service. @@ -108,7 +123,7 @@ func NewFlightServiceClient(cc grpc.ClientConnInterface) FlightServiceClient { } func (c *flightServiceClient) Handshake(ctx context.Context, opts ...grpc.CallOption) (FlightService_HandshakeClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[0], "/arrow.flight.protocol.FlightService/Handshake", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[0], "/arrow.flight.protocol.FlightService/Handshake", opts...) if err != nil { return nil, err } @@ -139,7 +154,7 @@ func (x *flightServiceHandshakeClient) Recv() (*HandshakeResponse, error) { } func (c *flightServiceClient) ListFlights(ctx context.Context, in *Criteria, opts ...grpc.CallOption) (FlightService_ListFlightsClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[1], "/arrow.flight.protocol.FlightService/ListFlights", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[1], "/arrow.flight.protocol.FlightService/ListFlights", opts...) 
if err != nil { return nil, err } @@ -198,7 +213,7 @@ func (c *flightServiceClient) GetSchema(ctx context.Context, in *FlightDescripto } func (c *flightServiceClient) DoGet(ctx context.Context, in *Ticket, opts ...grpc.CallOption) (FlightService_DoGetClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[2], "/arrow.flight.protocol.FlightService/DoGet", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[2], "/arrow.flight.protocol.FlightService/DoGet", opts...) if err != nil { return nil, err } @@ -230,7 +245,7 @@ func (x *flightServiceDoGetClient) Recv() (*FlightData, error) { } func (c *flightServiceClient) DoPut(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoPutClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[3], "/arrow.flight.protocol.FlightService/DoPut", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[3], "/arrow.flight.protocol.FlightService/DoPut", opts...) if err != nil { return nil, err } @@ -261,7 +276,7 @@ func (x *flightServiceDoPutClient) Recv() (*PutResult, error) { } func (c *flightServiceClient) DoExchange(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoExchangeClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[4], "/arrow.flight.protocol.FlightService/DoExchange", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[4], "/arrow.flight.protocol.FlightService/DoExchange", opts...) if err != nil { return nil, err } @@ -292,7 +307,7 @@ func (x *flightServiceDoExchangeClient) Recv() (*FlightData, error) { } func (c *flightServiceClient) DoAction(ctx context.Context, in *Action, opts ...grpc.CallOption) (FlightService_DoActionClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[5], "/arrow.flight.protocol.FlightService/DoAction", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[5], "/arrow.flight.protocol.FlightService/DoAction", opts...) if err != nil { return nil, err } @@ -324,7 +339,7 @@ func (x *flightServiceDoActionClient) Recv() (*Result, error) { } func (c *flightServiceClient) ListActions(ctx context.Context, in *Empty, opts ...grpc.CallOption) (FlightService_ListActionsClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[6], "/arrow.flight.protocol.FlightService/ListActions", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[6], "/arrow.flight.protocol.FlightService/ListActions", opts...) if err != nil { return nil, err } @@ -359,11 +374,13 @@ func (x *flightServiceListActionsClient) Recv() (*ActionType, error) { // All implementations must embed UnimplementedFlightServiceServer // for forward compatibility type FlightServiceServer interface { + // // Handshake between client and server. Depending on the server, the // handshake may be required to determine the token that should be used for // future operations. Both request and response are streams to allow multiple // round-trips depending on auth mechanism. Handshake(FlightService_HandshakeServer) error + // // Get a list of available streams given a particular criteria. Most flight // services will expose one or more streams that are readily available for // retrieval. This api allows listing the streams available for @@ -371,6 +388,7 @@ type FlightServiceServer interface { // the subset of streams that can be listed via this interface. 
Each flight // service allows its own definition of how to consume criteria. ListFlights(*Criteria, FlightService_ListFlightsServer) error + // // For a given FlightDescriptor, get information about how the flight can be // consumed. This is a useful interface if the consumer of the interface // already can identify the specific flight to consume. This interface can @@ -382,6 +400,7 @@ type FlightServiceServer interface { // available for consumption for the duration defined by the specific flight // service. GetFlightInfo(context.Context, *FlightDescriptor) (*FlightInfo, error) + // // For a given FlightDescriptor, start a query and get information // to poll its execution status. This is a useful interface if the // query may be a long-running query. The first PollFlightInfo call @@ -405,16 +424,19 @@ type FlightServiceServer interface { // A client may use the CancelFlightInfo action with // PollInfo.info to cancel the running query. PollFlightInfo(context.Context, *FlightDescriptor) (*PollInfo, error) + // // For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema // This is used when a consumer needs the Schema of flight stream. Similar to // GetFlightInfo this interface may generate a new flight that was not previously // available in ListFlights. GetSchema(context.Context, *FlightDescriptor) (*SchemaResult, error) + // // Retrieve a single stream associated with a particular descriptor // associated with the referenced ticket. A Flight can be composed of one or // more streams where each stream can be retrieved using a separate opaque // ticket that the flight service uses for managing a collection of streams. DoGet(*Ticket, FlightService_DoGetServer) error + // // Push a stream to the flight service associated with a particular // flight stream. This allows a client of a flight service to upload a stream // of data. Depending on the particular flight service, a client consumer @@ -422,12 +444,14 @@ type FlightServiceServer interface { // number. In the latter, the service might implement a 'seal' action that // can be applied to a descriptor once all streams are uploaded. DoPut(FlightService_DoPutServer) error + // // Open a bidirectional data channel for a given descriptor. This // allows clients to send and receive arbitrary Arrow data and // application-specific metadata in a single logical stream. In // contrast to DoGet/DoPut, this is more suited for clients // offloading computation (rather than storage) to a Flight service. DoExchange(FlightService_DoExchangeServer) error + // // Flight services can support an arbitrary number of simple actions in // addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut // operations that are potentially available. DoAction allows a flight client @@ -435,6 +459,7 @@ type FlightServiceServer interface { // opaque request and response objects that are specific to the type action // being undertaken. DoAction(*Action, FlightService_DoActionServer) error + // // A flight service exposes all of the available action types that it has // along with descriptions. This allows different flight consumers to // understand the capabilities of the flight service. 
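The registration hunk that follows widens RegisterFlightServiceServer from *grpc.Server to the grpc.ServiceRegistrar interface, so any registrar (a *grpc.Server included) can be passed. A sketch of the intended call pattern, where `myFlightServer` is a hypothetical implementation embedding UnimplementedFlightServiceServer for the forward compatibility required above:

    type myFlightServer struct {
        UnimplementedFlightServiceServer // required embedding, see note above
    }

    func serve(lis net.Listener) error {
        s := grpc.NewServer() // *grpc.Server satisfies grpc.ServiceRegistrar
        RegisterFlightServiceServer(s, &myFlightServer{})
        return s.Serve(lis)
    }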
@@ -485,8 +510,8 @@ type UnsafeFlightServiceServer interface { mustEmbedUnimplementedFlightServiceServer() } -func RegisterFlightServiceServer(s *grpc.Server, srv FlightServiceServer) { - s.RegisterService(&_FlightService_serviceDesc, srv) +func RegisterFlightServiceServer(s grpc.ServiceRegistrar, srv FlightServiceServer) { + s.RegisterService(&FlightService_ServiceDesc, srv) } func _FlightService_Handshake_Handler(srv interface{}, stream grpc.ServerStream) error { @@ -705,7 +730,10 @@ func (x *flightServiceListActionsServer) Send(m *ActionType) error { return x.ServerStream.SendMsg(m) } -var _FlightService_serviceDesc = grpc.ServiceDesc{ +// FlightService_ServiceDesc is the grpc.ServiceDesc for FlightService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var FlightService_ServiceDesc = grpc.ServiceDesc{ ServiceName: "arrow.flight.protocol.FlightService", HandlerType: (*FlightServiceServer)(nil), Methods: []grpc.MethodDesc{ diff --git a/go/arrow/internal/arrjson/reader.go b/go/arrow/internal/arrjson/reader.go index 34b9b6e10ec4a..c8056ef1dc744 100644 --- a/go/arrow/internal/arrjson/reader.go +++ b/go/arrow/internal/arrjson/reader.go @@ -82,6 +82,8 @@ func (r *Reader) Release() { r.recs[i] = nil } } + r.memo.Clear() + r.memo = nil } } func (r *Reader) Schema() *arrow.Schema { return r.schema } @@ -96,6 +98,14 @@ func (r *Reader) Read() (arrow.Record, error) { return rec, nil } +func (r *Reader) ReadAt(index int) (arrow.Record, error) { + if index >= r.NumRecords() { + return nil, io.EOF + } + rec := r.recs[index] + return rec, nil +} + var ( _ arrio.Reader = (*Reader)(nil) ) diff --git a/go/arrow/internal/cdata_integration/entrypoints.go b/go/arrow/internal/cdata_integration/entrypoints.go new file mode 100644 index 0000000000000..629b8a762a689 --- /dev/null +++ b/go/arrow/internal/cdata_integration/entrypoints.go @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
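The arrjson hunk above adds an index-based ReadAt next to the sequential Read, and teaches Release to also drop the reader's memo table; ReadAt is what lets the C Data integration entrypoints introduced below export one record batch per call. A minimal usage sketch, assuming a reader `r` obtained from arrjson.NewReader and an int `batchIndex`:

    // Out-of-range indexes report io.EOF, mirroring Read at end of input.
    rec, err := r.ReadAt(batchIndex)
    if err == io.EOF {
        // batchIndex >= r.NumRecords(): no such batch in the JSON file
    }
    _ = rec // the record remains owned by the reader; Release frees it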
+ +//go:build cdata_integration +// +build cdata_integration + +package main + +import ( + "fmt" + "os" + "runtime" + "unsafe" + + "github.com/apache/arrow/go/v14/arrow/array" + "github.com/apache/arrow/go/v14/arrow/cdata" + "github.com/apache/arrow/go/v14/arrow/internal/arrjson" + "github.com/apache/arrow/go/v14/arrow/memory" +) + +// #include +// #include +import "C" + +var alloc = memory.NewCheckedAllocator(memory.NewGoAllocator()) + +//export ArrowGo_BytesAllocated +func ArrowGo_BytesAllocated() int64 { + return int64(alloc.CurrentAlloc()) +} + +//export ArrowGo_RunGC +func ArrowGo_RunGC() { + runtime.GC() +} + +//export ArrowGo_FreeError +func ArrowGo_FreeError(cError *C.char) { + C.free(unsafe.Pointer(cError)) +} + +// When used in a defer() statement, this functions catches an incoming +// panic and converts it into a regular error. This avoids crashing the +// archery integration process and lets other tests proceed. +// Not all panics may be caught and some will still crash the process, though. +func handlePanic(err *error) { + if e := recover(); e != nil { + // Add a prefix while wrapping the panic-error + *err = fmt.Errorf("panic: %w", e.(error)) + } +} + +func newJsonReader(cJsonPath *C.char) (*arrjson.Reader, error) { + jsonPath := C.GoString(cJsonPath) + + f, err := os.Open(jsonPath) + if err != nil { + return nil, fmt.Errorf("could not open JSON file %q: %w", jsonPath, err) + } + defer f.Close() + + jsonReader, err := arrjson.NewReader(f, arrjson.WithAllocator(alloc)) + if err != nil { + return nil, fmt.Errorf("could not open JSON file reader from file %q: %w", jsonPath, err) + } + return jsonReader, nil +} + +func exportSchemaFromJson(cJsonPath *C.char, out *cdata.CArrowSchema) error { + jsonReader, err := newJsonReader(cJsonPath) + if err != nil { + return err + } + defer jsonReader.Release() + schema := jsonReader.Schema() + defer handlePanic(&err) + cdata.ExportArrowSchema(schema, out) + return err +} + +func importSchemaAndCompareToJson(cJsonPath *C.char, cSchema *cdata.CArrowSchema) error { + jsonReader, err := newJsonReader(cJsonPath) + if err != nil { + return err + } + defer jsonReader.Release() + schema := jsonReader.Schema() + importedSchema, err := cdata.ImportCArrowSchema(cSchema) + if err != nil { + return err + } + if !schema.Equal(importedSchema) || !schema.Metadata().Equal(importedSchema.Metadata()) { + return fmt.Errorf( + "Schemas are different:\n- Json Schema: %s\n- Imported Schema: %s", + schema.String(), + importedSchema.String()) + } + return nil +} + +func exportBatchFromJson(cJsonPath *C.char, num_batch int, out *cdata.CArrowArray) error { + // XXX this function exports a single batch at a time, but the JSON reader + // reads all batches at construction. 
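One caveat on the handlePanic helper defined above: recover() yields a plain interface{} that is not guaranteed to be an error, so the unconditional `e.(error)` assertion can itself panic on, for example, panic("some message"). A more defensive variant, offered as a sketch rather than what this patch ships:

    func handlePanic(err *error) {
        if e := recover(); e != nil {
            if perr, ok := e.(error); ok {
                *err = fmt.Errorf("panic: %w", perr)
            } else {
                // Non-error panic values (strings, ints, ...) still become a
                // regular error instead of crashing the integration process.
                *err = fmt.Errorf("panic: %v", e)
            }
        }
    }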
+ jsonReader, err := newJsonReader(cJsonPath) + if err != nil { + return err + } + defer jsonReader.Release() + batch, err := jsonReader.ReadAt(num_batch) + if err != nil { + return err + } + defer handlePanic(&err) + cdata.ExportArrowRecordBatch(batch, out, nil) + return err +} + +func importBatchAndCompareToJson(cJsonPath *C.char, num_batch int, cArray *cdata.CArrowArray) error { + jsonReader, err := newJsonReader(cJsonPath) + if err != nil { + return err + } + defer jsonReader.Release() + schema := jsonReader.Schema() + batch, err := jsonReader.ReadAt(num_batch) + if err != nil { + return err + } + + importedBatch, err := cdata.ImportCRecordBatchWithSchema(cArray, schema) + if err != nil { + return err + } + defer importedBatch.Release() + if !array.RecordEqual(batch, importedBatch) { + return fmt.Errorf( + "Batches are different:\n- Json Batch: %v\n- Imported Batch: %v", + batch, importedBatch) + } + return nil +} + +//export ArrowGo_ExportSchemaFromJson +func ArrowGo_ExportSchemaFromJson(cJsonPath *C.char, out uintptr) *C.char { + err := exportSchemaFromJson(cJsonPath, cdata.SchemaFromPtr(out)) + if err != nil { + return C.CString(err.Error()) + } + return nil +} + +//export ArrowGo_ExportBatchFromJson +func ArrowGo_ExportBatchFromJson(cJsonPath *C.char, num_batch int, out uintptr) *C.char { + err := exportBatchFromJson(cJsonPath, num_batch, cdata.ArrayFromPtr(out)) + if err != nil { + return C.CString(err.Error()) + } + return nil +} + +//export ArrowGo_ImportSchemaAndCompareToJson +func ArrowGo_ImportSchemaAndCompareToJson(cJsonPath *C.char, cSchema uintptr) *C.char { + err := importSchemaAndCompareToJson(cJsonPath, cdata.SchemaFromPtr(cSchema)) + if err != nil { + return C.CString(err.Error()) + } + return nil +} + +//export ArrowGo_ImportBatchAndCompareToJson +func ArrowGo_ImportBatchAndCompareToJson(cJsonPath *C.char, num_batch int, cArray uintptr) *C.char { + err := importBatchAndCompareToJson(cJsonPath, num_batch, cdata.ArrayFromPtr(cArray)) + if err != nil { + return C.CString(err.Error()) + } + return nil +} + +func main() {} diff --git a/go/arrow/internal/flight_integration/scenario.go b/go/arrow/internal/flight_integration/scenario.go index 4108cf8124245..4f47d7fd506ee 100644 --- a/go/arrow/internal/flight_integration/scenario.go +++ b/go/arrow/internal/flight_integration/scenario.go @@ -71,6 +71,8 @@ func GetScenario(name string, args ...string) Scenario { return &expirationTimeRenewFlightEndpointScenarioTester{} case "poll_flight_info": return &pollFlightInfoScenarioTester{} + case "app_metadata_flight_info_endpoint": + return &appMetadataFlightInfoEndpointScenarioTester{} case "flight_sql": return &flightSqlScenarioTester{} case "flight_sql:extension": @@ -1153,7 +1155,7 @@ func (tester *pollFlightInfoScenarioTester) PollFlightInfo(ctx context.Context, nil, ) endpoints := []*flight.FlightEndpoint{ - &flight.FlightEndpoint{ + { Ticket: &flight.Ticket{Ticket: []byte("long-running query")}, Location: []*flight.Location{}, }, @@ -1236,6 +1238,66 @@ func (tester *pollFlightInfoScenarioTester) RunClient(addr string, opts ...grpc. 
return nil } +type appMetadataFlightInfoEndpointScenarioTester struct { + flight.BaseFlightServer +} + +func (tester *appMetadataFlightInfoEndpointScenarioTester) MakeServer(port int) flight.Server { + srv := flight.NewServerWithMiddleware(nil) + srv.RegisterFlightService(tester) + initServer(port, srv) + return srv +} + +func (tester *appMetadataFlightInfoEndpointScenarioTester) GetFlightInfo(ctx context.Context, desc *flight.FlightDescriptor) (*flight.FlightInfo, error) { + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "number", Type: arrow.PrimitiveTypes.Uint32}, + }, + nil, + ) + + if desc.Type != flight.DescriptorCMD { + return nil, fmt.Errorf("%w: should have received CMD descriptor", arrow.ErrInvalid) + } + endpoints := []*flight.FlightEndpoint{{AppMetadata: desc.Cmd}} + return &flight.FlightInfo{ + Schema: flight.SerializeSchema(schema, memory.DefaultAllocator), + FlightDescriptor: desc, + Endpoint: endpoints, + TotalRecords: -1, + TotalBytes: -1, + AppMetadata: desc.Cmd, + }, nil +} + +func (tester *appMetadataFlightInfoEndpointScenarioTester) RunClient(addr string, opts ...grpc.DialOption) error { + client, err := flight.NewClientWithMiddleware(addr, nil, nil, opts...) + if err != nil { + return err + } + defer client.Close() + + ctx := context.Background() + desc := flight.FlightDescriptor{ + Type: flight.DescriptorCMD, + Cmd: []byte("foobar"), + } + info, err := client.GetFlightInfo(ctx, &desc) + if err != nil { + return err + } + switch { + case !bytes.Equal(desc.Cmd, info.AppMetadata): + return fmt.Errorf("invalid flight info app_metadata: %s, expected: %s", info.AppMetadata, desc.Cmd) + case len(info.Endpoint) != 1: + return fmt.Errorf("expected exactly 1 flight endpoint, got: %d", len(info.Endpoint)) + case !bytes.Equal(desc.Cmd, info.Endpoint[0].AppMetadata): + return fmt.Errorf("invalid flight endpoint app_metadata: %s, expected: %s", info.Endpoint[0].AppMetadata, desc.Cmd) + } + return nil +} + const ( updateStatementExpectedRows int64 = 10000 updateStatementWithTransactionExpectedRows int64 = 15000 diff --git a/go/parquet/pqarrow/file_reader.go b/go/parquet/pqarrow/file_reader.go index d54e365b55e0c..d91010c62c19d 100755 --- a/go/parquet/pqarrow/file_reader.go +++ b/go/parquet/pqarrow/file_reader.go @@ -394,8 +394,8 @@ func (fr *FileReader) ReadRowGroups(ctx context.Context, indices, rowGroups []in } func (fr *FileReader) getColumnReader(ctx context.Context, i int, colFactory itrFactory) (*ColumnReader, error) { - if i < 0 || i >= fr.rdr.MetaData().Schema.NumColumns() { - return nil, fmt.Errorf("invalid column index chosen %d, there are only %d columns", i, fr.rdr.MetaData().Schema.NumColumns()) + if i < 0 || i >= len(fr.Manifest.Fields) { + return nil, fmt.Errorf("invalid column index chosen %d, there are only %d columns", i, len(fr.Manifest.Fields)) } ctx = context.WithValue(ctx, rdrCtxKey{}, readerCtx{ diff --git a/go/parquet/pqarrow/file_reader_test.go b/go/parquet/pqarrow/file_reader_test.go index 2b4aa8ab78dbe..d1f3ae1c984a2 100644 --- a/go/parquet/pqarrow/file_reader_test.go +++ b/go/parquet/pqarrow/file_reader_test.go @@ -19,9 +19,11 @@ package pqarrow_test import ( "bytes" "context" + "fmt" "io" "os" "path/filepath" + "strings" "testing" "github.com/apache/arrow/go/v14/arrow" @@ -216,3 +218,68 @@ func TestFileReaderWriterMetadata(t *testing.T) { assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys()) assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values()) } + +func TestFileReaderColumnChunkBoundsErrors(t *testing.T) { + schema := 
arrow.NewSchema([]arrow.Field{ + {Name: "zero", Type: arrow.PrimitiveTypes.Float64}, + {Name: "g", Type: arrow.StructOf( + arrow.Field{Name: "one", Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: "two", Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: "three", Type: arrow.PrimitiveTypes.Float64}, + )}, + }, nil) + + // generate Parquet data with four columns + // that are represented by two logical fields + data := `[ + { + "zero": 1, + "g": { + "one": 1, + "two": 1, + "three": 1 + } + }, + { + "zero": 2, + "g": { + "one": 2, + "two": 2, + "three": 2 + } + } + ]` + + record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data)) + require.NoError(t, err) + + output := &bytes.Buffer{} + writer, err := pqarrow.NewFileWriter(schema, output, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps()) + require.NoError(t, err) + + require.NoError(t, writer.Write(record)) + require.NoError(t, writer.Close()) + + fileReader, err := file.NewParquetReader(bytes.NewReader(output.Bytes())) + require.NoError(t, err) + + arrowReader, err := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator) + require.NoError(t, err) + + // assert that errors are returned for indexes outside the bounds of the logical fields (instead of the physical columns) + ctx := pqarrow.NewArrowWriteContext(context.Background(), nil) + assert.Greater(t, fileReader.NumRowGroups(), 0) + for rowGroupIndex := 0; rowGroupIndex < fileReader.NumRowGroups(); rowGroupIndex += 1 { + rowGroupReader := arrowReader.RowGroup(rowGroupIndex) + for fieldNum := 0; fieldNum < schema.NumFields(); fieldNum += 1 { + _, err := rowGroupReader.Column(fieldNum).Read(ctx) + assert.NoError(t, err, "reading field num: %d", fieldNum) + } + + _, subZeroErr := rowGroupReader.Column(-1).Read(ctx) + assert.Error(t, subZeroErr) + + _, tooHighErr := rowGroupReader.Column(schema.NumFields()).Read(ctx) + assert.ErrorContains(t, tooHighErr, fmt.Sprintf("there are only %d columns", schema.NumFields())) + } +} diff --git a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcReader.java b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcReader.java index b42ddb48433b5..648e17e9c374c 100644 --- a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcReader.java +++ b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcReader.java @@ -84,7 +84,7 @@ public int getNumberOfStripes() throws IllegalArgumentException { } @Override - public void close() throws Exception { + public void close() { jniWrapper.close(nativeInstanceId); } } diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java index 99d66f94261ee..4f9c8b7d71bab 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java @@ -25,7 +25,6 @@ import org.apache.arrow.memory.util.ArrowBufPointer; import org.apache.arrow.memory.util.ByteFunctionHelpers; import org.apache.arrow.vector.BaseFixedWidthVector; -import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; @@ -50,6 +49,7 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; import 
org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VariableWidthVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; /** @@ -112,7 +112,7 @@ public static VectorValueComparator createDefaultComp } else if (vector instanceof TimeStampVector) { return (VectorValueComparator) new TimeStampComparator(); } - } else if (vector instanceof BaseVariableWidthVector) { + } else if (vector instanceof VariableWidthVector) { return (VectorValueComparator) new VariableWidthComparator(); } else if (vector instanceof BaseRepeatedValueVector) { VectorValueComparator innerComparator = @@ -675,14 +675,14 @@ public VectorValueComparator createNew() { } /** - * Default comparator for {@link org.apache.arrow.vector.BaseVariableWidthVector}. + * Default comparator for {@link org.apache.arrow.vector.VariableWidthVector}. * The comparison is in lexicographic order, with null comes first. */ - public static class VariableWidthComparator extends VectorValueComparator { + public static class VariableWidthComparator extends VectorValueComparator { - private ArrowBufPointer reusablePointer1 = new ArrowBufPointer(); + private final ArrowBufPointer reusablePointer1 = new ArrowBufPointer(); - private ArrowBufPointer reusablePointer2 = new ArrowBufPointer(); + private final ArrowBufPointer reusablePointer2 = new ArrowBufPointer(); @Override public int compare(int index1, int index2) { @@ -699,7 +699,7 @@ public int compareNotNull(int index1, int index2) { } @Override - public VectorValueComparator createNew() { + public VectorValueComparator createNew() { return new VariableWidthComparator(); } } @@ -743,7 +743,7 @@ public int compareNotNull(int index1, int index2) { @Override public VectorValueComparator createNew() { VectorValueComparator newInnerComparator = innerComparator.createNew(); - return new RepeatedValueComparator(newInnerComparator); + return new RepeatedValueComparator<>(newInnerComparator); } @Override diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java index 62051197740d8..bdae85110aa62 100644 --- a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java +++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java @@ -35,6 +35,8 @@ import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeMilliVector; @@ -47,6 +49,9 @@ import org.apache.arrow.vector.UInt2Vector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.testing.ValueVectorDataPopulator; import org.apache.arrow.vector.types.TimeUnit; @@ -911,4 +916,25 @@ public void testCheckNullsOnCompareIsTrueWithEmptyVectors() { assertTrue(comparator.checkNullsOnCompare()); } } + + @Test + public void testVariableWidthDefaultComparators() { + try (VarCharVector vec = new VarCharVector("test", allocator)) { + 
verifyVariableWidthComparatorReturned(vec); + } + try (VarBinaryVector vec = new VarBinaryVector("test", allocator)) { + verifyVariableWidthComparatorReturned(vec); + } + try (LargeVarCharVector vec = new LargeVarCharVector("test", allocator)) { + verifyVariableWidthComparatorReturned(vec); + } + try (LargeVarBinaryVector vec = new LargeVarBinaryVector("test", allocator)) { + verifyVariableWidthComparatorReturned(vec); + } + } + + private static void verifyVariableWidthComparatorReturned(V vec) { + VectorValueComparator comparator = DefaultVectorComparators.createDefaultComparator(vec); + assertEquals(DefaultVectorComparators.VariableWidthComparator.class, comparator.getClass()); + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ClientAuthHandler.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ClientAuthHandler.java index 985e10aa4dd4b..af7da86e009e6 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ClientAuthHandler.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ClientAuthHandler.java @@ -19,9 +19,16 @@ import java.util.Iterator; +import org.apache.arrow.flight.FlightClient; + /** * Implement authentication for Flight on the client side. + * + * @deprecated As of 14.0.0. This implements a stateful "login" flow that does not play well with + * distributed or stateless systems. It will not be removed, but should not be used. Instead + * see {@link FlightClient#authenticateBasicToken(String, String)}. */ +@Deprecated public interface ClientAuthHandler { /** * Handle the initial handshake with the server. diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthHandler.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthHandler.java index 3a978b131f26c..378027c9287fe 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthHandler.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthHandler.java @@ -20,9 +20,18 @@ import java.util.Iterator; import java.util.Optional; +import org.apache.arrow.flight.FlightServer; +import org.apache.arrow.flight.auth2.CallHeaderAuthenticator; + /** * Interface for Server side authentication handlers. + * + * @deprecated As of 14.0.0. This implements a stateful "login" flow that does not play well with + * distributed or stateless systems. It will not be removed, but should not be used. Instead, + * see {@link FlightServer.Builder#headerAuthenticator(CallHeaderAuthenticator)} + * and {@link CallHeaderAuthenticator}. */ +@Deprecated public interface ServerAuthHandler { /** diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/BasicFlightSqlProducer.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/BasicFlightSqlProducer.java new file mode 100644 index 0000000000000..ea99191f28e13 --- /dev/null +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/BasicFlightSqlProducer.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.sql; + +import java.util.List; + +import org.apache.arrow.flight.FlightDescriptor; +import org.apache.arrow.flight.FlightEndpoint; +import org.apache.arrow.flight.FlightInfo; +import org.apache.arrow.flight.sql.impl.FlightSql; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.google.protobuf.Message; + +/** + * A {@link FlightSqlProducer} that implements getting FlightInfo for each metadata request. + */ +public abstract class BasicFlightSqlProducer extends NoOpFlightSqlProducer { + + @Override + public FlightInfo getFlightInfoSqlInfo(FlightSql.CommandGetSqlInfo request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_SQL_INFO_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoTypeInfo(FlightSql.CommandGetXdbcTypeInfo request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_TYPE_INFO_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoCatalogs(FlightSql.CommandGetCatalogs request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_CATALOGS_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoSchemas(FlightSql.CommandGetDbSchemas request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_SCHEMAS_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoTables(FlightSql.CommandGetTables request, CallContext context, + FlightDescriptor descriptor) { + if (request.getIncludeSchema()) { + return generateFlightInfo(request, descriptor, Schemas.GET_TABLES_SCHEMA); + } + return generateFlightInfo(request, descriptor, Schemas.GET_TABLES_SCHEMA_NO_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoTableTypes(FlightSql.CommandGetTableTypes request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_TABLE_TYPES_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoPrimaryKeys(FlightSql.CommandGetPrimaryKeys request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_PRIMARY_KEYS_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoExportedKeys(FlightSql.CommandGetExportedKeys request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_EXPORTED_KEYS_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoImportedKeys(FlightSql.CommandGetImportedKeys request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_IMPORTED_KEYS_SCHEMA); + } + + @Override + public FlightInfo getFlightInfoCrossReference(FlightSql.CommandGetCrossReference request, CallContext context, + FlightDescriptor descriptor) { + return generateFlightInfo(request, descriptor, Schemas.GET_CROSS_REFERENCE_SCHEMA); + } + + /** + * Return a list of FlightEndpoints for the given request and FlightDescriptor. 
This method should validate that + * the request is supported by this FlightSqlProducer. + */ + protected abstract <T extends Message> + List<FlightEndpoint> determineEndpoints(T request, FlightDescriptor flightDescriptor, Schema schema); + + protected <T extends Message> FlightInfo generateFlightInfo(T request, FlightDescriptor descriptor, Schema schema) { + final List<FlightEndpoint> endpoints = determineEndpoints(request, descriptor, schema); + return new FlightInfo(schema, descriptor, endpoints, -1, -1); + } +} diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/NoOpFlightSqlProducer.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/NoOpFlightSqlProducer.java new file mode 100644 index 0000000000000..a02cee64bd855 --- /dev/null +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/NoOpFlightSqlProducer.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.sql; + +import org.apache.arrow.flight.CallStatus; +import org.apache.arrow.flight.Criteria; +import org.apache.arrow.flight.FlightDescriptor; +import org.apache.arrow.flight.FlightInfo; +import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.PutResult; +import org.apache.arrow.flight.Result; +import org.apache.arrow.flight.SchemaResult; +import org.apache.arrow.flight.sql.impl.FlightSql; + +/** + * A {@link FlightSqlProducer} that throws on all FlightSql-specific operations.
+ */ +public class NoOpFlightSqlProducer implements FlightSqlProducer { + @Override + public void createPreparedStatement(FlightSql.ActionCreatePreparedStatementRequest request, + CallContext context, StreamListener listener) { + listener.onError(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public void closePreparedStatement(FlightSql.ActionClosePreparedStatementRequest request, + CallContext context, StreamListener listener) { + listener.onError(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoStatement(FlightSql.CommandStatementQuery command, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public FlightInfo getFlightInfoPreparedStatement(FlightSql.CommandPreparedStatementQuery command, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public SchemaResult getSchemaStatement(FlightSql.CommandStatementQuery command, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamStatement(FlightSql.TicketStatementQuery ticket, + CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public void getStreamPreparedStatement(FlightSql.CommandPreparedStatementQuery command, + CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public Runnable acceptPutStatement(FlightSql.CommandStatementUpdate command, CallContext context, + FlightStream flightStream, StreamListener ackStream) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public Runnable acceptPutPreparedStatementUpdate(FlightSql.CommandPreparedStatementUpdate command, + CallContext context, FlightStream flightStream, + StreamListener ackStream) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public Runnable acceptPutPreparedStatementQuery(FlightSql.CommandPreparedStatementQuery command, CallContext context, + FlightStream flightStream, StreamListener ackStream) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public FlightInfo getFlightInfoSqlInfo(FlightSql.CommandGetSqlInfo request, CallContext context, + FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamSqlInfo(FlightSql.CommandGetSqlInfo command, CallContext context, + ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoTypeInfo(FlightSql.CommandGetXdbcTypeInfo request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamTypeInfo(FlightSql.CommandGetXdbcTypeInfo request, + CallContext context, 
ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoCatalogs(FlightSql.CommandGetCatalogs request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamCatalogs(CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoSchemas(FlightSql.CommandGetDbSchemas request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamSchemas(FlightSql.CommandGetDbSchemas command, + CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoTables(FlightSql.CommandGetTables request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamTables(FlightSql.CommandGetTables command, CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoTableTypes(FlightSql.CommandGetTableTypes request, CallContext context, + FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamTableTypes(CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoPrimaryKeys(FlightSql.CommandGetPrimaryKeys request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamPrimaryKeys(FlightSql.CommandGetPrimaryKeys command, + CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public FlightInfo getFlightInfoExportedKeys(FlightSql.CommandGetExportedKeys request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public FlightInfo getFlightInfoImportedKeys(FlightSql.CommandGetImportedKeys request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public FlightInfo getFlightInfoCrossReference(FlightSql.CommandGetCrossReference request, + CallContext context, FlightDescriptor descriptor) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public void getStreamExportedKeys(FlightSql.CommandGetExportedKeys command, + CallContext context, ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public void 
getStreamImportedKeys(FlightSql.CommandGetImportedKeys command, CallContext context, + ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public void getStreamCrossReference(FlightSql.CommandGetCrossReference command, CallContext context, + ServerStreamListener listener) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + @Override + public void close() throws Exception { + + } + + @Override + public void listFlights(CallContext context, Criteria criteria, StreamListener listener) { + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } +} diff --git a/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSql.java b/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSql.java index 6da915a8ffb14..7635b80ecd0fd 100644 --- a/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSql.java +++ b/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSql.java @@ -20,7 +20,7 @@ import static java.util.Arrays.asList; import static java.util.Collections.emptyList; import static java.util.Collections.singletonList; -import static java.util.Objects.isNull; +import static org.apache.arrow.flight.sql.util.FlightStreamUtils.getResults; import static org.apache.arrow.util.AutoCloseables.close; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.is; @@ -29,16 +29,12 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.channels.Channels; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Optional; import java.util.stream.IntStream; @@ -52,18 +48,9 @@ import org.apache.arrow.flight.sql.util.TableRef; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.UInt1Vector; -import org.apache.arrow.vector.UInt4Vector; -import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.complex.DenseUnionVector; -import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.ipc.ReadChannel; -import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -657,197 +644,202 @@ public void testGetSqlInfoResultsWithThreeArgs() throws Exception { } @Test - public void testGetCommandExportedKeys() { - final FlightStream stream = + public void testGetCommandExportedKeys() throws Exception { + try (final FlightStream stream = sqlClient.getStream( sqlClient.getExportedKeys(TableRef.of(null, null, "FOREIGNTABLE")) - .getEndpoints().get(0).getTicket()); - - final List> results = getResults(stream); - - final List> matchers = asList( - nullValue(String.class), // pk_catalog_name - is("APP"), // pk_schema_name - is("FOREIGNTABLE"), // 
pk_table_name - is("ID"), // pk_column_name - nullValue(String.class), // fk_catalog_name - is("APP"), // fk_schema_name - is("INTTABLE"), // fk_table_name - is("FOREIGNID"), // fk_column_name - is("1"), // key_sequence - containsString("SQL"), // fk_key_name - containsString("SQL"), // pk_key_name - is("3"), // update_rule - is("3")); // delete_rule - - final List assertions = new ArrayList<>(); - Assertions.assertEquals(1, results.size()); - for (int i = 0; i < matchers.size(); i++) { - final String actual = results.get(0).get(i); - final Matcher expected = matchers.get(i); - assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + .getEndpoints().get(0).getTicket())) { + + final List> results = getResults(stream); + + final List> matchers = asList( + nullValue(String.class), // pk_catalog_name + is("APP"), // pk_schema_name + is("FOREIGNTABLE"), // pk_table_name + is("ID"), // pk_column_name + nullValue(String.class), // fk_catalog_name + is("APP"), // fk_schema_name + is("INTTABLE"), // fk_table_name + is("FOREIGNID"), // fk_column_name + is("1"), // key_sequence + containsString("SQL"), // fk_key_name + containsString("SQL"), // pk_key_name + is("3"), // update_rule + is("3")); // delete_rule + + final List assertions = new ArrayList<>(); + Assertions.assertEquals(1, results.size()); + for (int i = 0; i < matchers.size(); i++) { + final String actual = results.get(0).get(i); + final Matcher expected = matchers.get(i); + assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + } + Assertions.assertAll(assertions); } - Assertions.assertAll(assertions); } @Test - public void testGetCommandImportedKeys() { - final FlightStream stream = + public void testGetCommandImportedKeys() throws Exception { + try (final FlightStream stream = sqlClient.getStream( sqlClient.getImportedKeys(TableRef.of(null, null, "INTTABLE")) - .getEndpoints().get(0).getTicket()); - - final List> results = getResults(stream); - - final List> matchers = asList( - nullValue(String.class), // pk_catalog_name - is("APP"), // pk_schema_name - is("FOREIGNTABLE"), // pk_table_name - is("ID"), // pk_column_name - nullValue(String.class), // fk_catalog_name - is("APP"), // fk_schema_name - is("INTTABLE"), // fk_table_name - is("FOREIGNID"), // fk_column_name - is("1"), // key_sequence - containsString("SQL"), // fk_key_name - containsString("SQL"), // pk_key_name - is("3"), // update_rule - is("3")); // delete_rule - - Assertions.assertEquals(1, results.size()); - final List assertions = new ArrayList<>(); - for (int i = 0; i < matchers.size(); i++) { - final String actual = results.get(0).get(i); - final Matcher expected = matchers.get(i); - assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + .getEndpoints().get(0).getTicket())) { + + final List> results = getResults(stream); + + final List> matchers = asList( + nullValue(String.class), // pk_catalog_name + is("APP"), // pk_schema_name + is("FOREIGNTABLE"), // pk_table_name + is("ID"), // pk_column_name + nullValue(String.class), // fk_catalog_name + is("APP"), // fk_schema_name + is("INTTABLE"), // fk_table_name + is("FOREIGNID"), // fk_column_name + is("1"), // key_sequence + containsString("SQL"), // fk_key_name + containsString("SQL"), // pk_key_name + is("3"), // update_rule + is("3")); // delete_rule + + Assertions.assertEquals(1, results.size()); + final List assertions = new ArrayList<>(); + for (int i = 0; i < matchers.size(); i++) { + final String actual = results.get(0).get(i); + final Matcher expected = matchers.get(i); + 
assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + } + Assertions.assertAll(assertions); } - Assertions.assertAll(assertions); } @Test - public void testGetTypeInfo() { + public void testGetTypeInfo() throws Exception { FlightInfo flightInfo = sqlClient.getXdbcTypeInfo(); - FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket()); - - final List> results = getResults(stream); - - final List> matchers = ImmutableList.of( - asList("BIGINT", "-5", "19", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", - "BIGINT", "0", "0", - null, null, "10", null), - asList("LONG VARCHAR FOR BIT DATA", "-4", "32700", "X'", "'", emptyList().toString(), "1", "false", "0", "true", - "false", "false", - "LONG VARCHAR FOR BIT DATA", null, null, null, null, null, null), - asList("VARCHAR () FOR BIT DATA", "-3", "32672", "X'", "'", singletonList("length").toString(), "1", "false", - "2", "true", "false", - "false", "VARCHAR () FOR BIT DATA", null, null, null, null, null, null), - asList("CHAR () FOR BIT DATA", "-2", "254", "X'", "'", singletonList("length").toString(), "1", "false", "2", - "true", "false", "false", - "CHAR () FOR BIT DATA", null, null, null, null, null, null), - asList("LONG VARCHAR", "-1", "32700", "'", "'", emptyList().toString(), "1", "true", "1", "true", "false", - "false", - "LONG VARCHAR", null, null, null, null, null, null), - asList("CHAR", "1", "254", "'", "'", singletonList("length").toString(), "1", "true", "3", "true", "false", - "false", "CHAR", null, null, - null, null, null, null), - asList("NUMERIC", "2", "31", null, null, Arrays.asList("precision", "scale").toString(), "1", "false", "2", - "false", "true", "false", - "NUMERIC", "0", "31", null, null, "10", null), - asList("DECIMAL", "3", "31", null, null, Arrays.asList("precision", "scale").toString(), "1", "false", "2", - "false", "true", "false", - "DECIMAL", "0", "31", null, null, "10", null), - asList("INTEGER", "4", "10", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", - "INTEGER", "0", "0", - null, null, "10", null), - asList("SMALLINT", "5", "5", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", - "SMALLINT", "0", - "0", null, null, "10", null), - asList("FLOAT", "6", "52", null, null, singletonList("precision").toString(), "1", "false", "2", "false", - "false", "false", "FLOAT", null, - null, null, null, "2", null), - asList("REAL", "7", "23", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "false", - "REAL", null, null, - null, null, "2", null), - asList("DOUBLE", "8", "52", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "false", - "DOUBLE", null, - null, null, null, "2", null), - asList("VARCHAR", "12", "32672", "'", "'", singletonList("length").toString(), "1", "true", "3", "true", - "false", "false", "VARCHAR", - null, null, null, null, null, null), - asList("BOOLEAN", "16", "1", null, null, emptyList().toString(), "1", "false", "2", "true", "false", "false", - "BOOLEAN", null, - null, null, null, null, null), - asList("DATE", "91", "10", "DATE'", "'", emptyList().toString(), "1", "false", "2", "true", "false", "false", - "DATE", "0", "0", - null, null, "10", null), - asList("TIME", "92", "8", "TIME'", "'", emptyList().toString(), "1", "false", "2", "true", "false", "false", - "TIME", "0", "0", - null, null, "10", null), - asList("TIMESTAMP", "93", "29", "TIMESTAMP'", "'", emptyList().toString(), "1", "false", "2", "true", 
"false", - "false", - "TIMESTAMP", "0", "9", null, null, "10", null), - asList("OBJECT", "2000", null, null, null, emptyList().toString(), "1", "false", "2", "true", "false", "false", - "OBJECT", null, - null, null, null, null, null), - asList("BLOB", "2004", "2147483647", null, null, singletonList("length").toString(), "1", "false", "0", null, - "false", null, "BLOB", null, - null, null, null, null, null), - asList("CLOB", "2005", "2147483647", "'", "'", singletonList("length").toString(), "1", "true", "1", null, - "false", null, "CLOB", null, - null, null, null, null, null), - asList("XML", "2009", null, null, null, emptyList().toString(), "1", "true", "0", "false", "false", "false", - "XML", null, null, - null, null, null, null)); - MatcherAssert.assertThat(results, is(matchers)); + try (FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket())) { + + final List> results = getResults(stream); + + final List> matchers = ImmutableList.of( + asList("BIGINT", "-5", "19", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", + "BIGINT", "0", "0", + null, null, "10", null), + asList("LONG VARCHAR FOR BIT DATA", "-4", "32700", "X'", "'", emptyList().toString(), "1", "false", "0", + "true", "false", "false", + "LONG VARCHAR FOR BIT DATA", null, null, null, null, null, null), + asList("VARCHAR () FOR BIT DATA", "-3", "32672", "X'", "'", singletonList("length").toString(), "1", "false", + "2", "true", "false", + "false", "VARCHAR () FOR BIT DATA", null, null, null, null, null, null), + asList("CHAR () FOR BIT DATA", "-2", "254", "X'", "'", singletonList("length").toString(), "1", "false", "2", + "true", "false", "false", + "CHAR () FOR BIT DATA", null, null, null, null, null, null), + asList("LONG VARCHAR", "-1", "32700", "'", "'", emptyList().toString(), "1", "true", "1", "true", "false", + "false", + "LONG VARCHAR", null, null, null, null, null, null), + asList("CHAR", "1", "254", "'", "'", singletonList("length").toString(), "1", "true", "3", "true", "false", + "false", "CHAR", null, null, + null, null, null, null), + asList("NUMERIC", "2", "31", null, null, Arrays.asList("precision", "scale").toString(), "1", "false", "2", + "false", "true", "false", + "NUMERIC", "0", "31", null, null, "10", null), + asList("DECIMAL", "3", "31", null, null, Arrays.asList("precision", "scale").toString(), "1", "false", "2", + "false", "true", "false", + "DECIMAL", "0", "31", null, null, "10", null), + asList("INTEGER", "4", "10", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", + "INTEGER", "0", "0", + null, null, "10", null), + asList("SMALLINT", "5", "5", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", + "SMALLINT", "0", + "0", null, null, "10", null), + asList("FLOAT", "6", "52", null, null, singletonList("precision").toString(), "1", "false", "2", "false", + "false", "false", "FLOAT", null, + null, null, null, "2", null), + asList("REAL", "7", "23", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "false", + "REAL", null, null, + null, null, "2", null), + asList("DOUBLE", "8", "52", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "false", + "DOUBLE", null, + null, null, null, "2", null), + asList("VARCHAR", "12", "32672", "'", "'", singletonList("length").toString(), "1", "true", "3", "true", + "false", "false", "VARCHAR", + null, null, null, null, null, null), + asList("BOOLEAN", "16", "1", null, null, emptyList().toString(), "1", 
"false", "2", "true", "false", "false", + "BOOLEAN", null, + null, null, null, null, null), + asList("DATE", "91", "10", "DATE'", "'", emptyList().toString(), "1", "false", "2", "true", "false", "false", + "DATE", "0", "0", + null, null, "10", null), + asList("TIME", "92", "8", "TIME'", "'", emptyList().toString(), "1", "false", "2", "true", "false", "false", + "TIME", "0", "0", + null, null, "10", null), + asList("TIMESTAMP", "93", "29", "TIMESTAMP'", "'", emptyList().toString(), "1", "false", "2", "true", "false", + "false", + "TIMESTAMP", "0", "9", null, null, "10", null), + asList("OBJECT", "2000", null, null, null, emptyList().toString(), "1", "false", "2", "true", "false", + "false", "OBJECT", null, + null, null, null, null, null), + asList("BLOB", "2004", "2147483647", null, null, singletonList("length").toString(), "1", "false", "0", null, + "false", null, "BLOB", null, + null, null, null, null, null), + asList("CLOB", "2005", "2147483647", "'", "'", singletonList("length").toString(), "1", "true", "1", null, + "false", null, "CLOB", null, + null, null, null, null, null), + asList("XML", "2009", null, null, null, emptyList().toString(), "1", "true", "0", "false", "false", "false", + "XML", null, null, + null, null, null, null)); + MatcherAssert.assertThat(results, is(matchers)); + } } @Test - public void testGetTypeInfoWithFiltering() { + public void testGetTypeInfoWithFiltering() throws Exception { FlightInfo flightInfo = sqlClient.getXdbcTypeInfo(-5); - FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket()); + try (FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket())) { - final List> results = getResults(stream); + final List> results = getResults(stream); - final List> matchers = ImmutableList.of( - asList("BIGINT", "-5", "19", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", - "BIGINT", "0", "0", - null, null, "10", null)); - MatcherAssert.assertThat(results, is(matchers)); + final List> matchers = ImmutableList.of( + asList("BIGINT", "-5", "19", null, null, emptyList().toString(), "1", "false", "2", "false", "false", "true", + "BIGINT", "0", "0", + null, null, "10", null)); + MatcherAssert.assertThat(results, is(matchers)); + } } @Test - public void testGetCommandCrossReference() { + public void testGetCommandCrossReference() throws Exception { final FlightInfo flightInfo = sqlClient.getCrossReference(TableRef.of(null, null, "FOREIGNTABLE"), TableRef.of(null, null, "INTTABLE")); - final FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket()); - - final List> results = getResults(stream); - - final List> matchers = asList( - nullValue(String.class), // pk_catalog_name - is("APP"), // pk_schema_name - is("FOREIGNTABLE"), // pk_table_name - is("ID"), // pk_column_name - nullValue(String.class), // fk_catalog_name - is("APP"), // fk_schema_name - is("INTTABLE"), // fk_table_name - is("FOREIGNID"), // fk_column_name - is("1"), // key_sequence - containsString("SQL"), // fk_key_name - containsString("SQL"), // pk_key_name - is("3"), // update_rule - is("3")); // delete_rule - - Assertions.assertEquals(1, results.size()); - final List assertions = new ArrayList<>(); - for (int i = 0; i < matchers.size(); i++) { - final String actual = results.get(0).get(i); - final Matcher expected = matchers.get(i); - assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + try (final FlightStream stream = 
@Test - public void testGetCommandCrossReference() { + public void testGetCommandCrossReference() throws Exception { final FlightInfo flightInfo = sqlClient.getCrossReference(TableRef.of(null, null, "FOREIGNTABLE"), TableRef.of(null, null, "INTTABLE")); - final FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket()); - - final List<List<String>> results = getResults(stream); - - final List<Matcher<String>> matchers = asList( - nullValue(String.class), // pk_catalog_name - is("APP"), // pk_schema_name - is("FOREIGNTABLE"), // pk_table_name - is("ID"), // pk_column_name - nullValue(String.class), // fk_catalog_name - is("APP"), // fk_schema_name - is("INTTABLE"), // fk_table_name - is("FOREIGNID"), // fk_column_name - is("1"), // key_sequence - containsString("SQL"), // fk_key_name - containsString("SQL"), // pk_key_name - is("3"), // update_rule - is("3")); // delete_rule - - Assertions.assertEquals(1, results.size()); - final List<Executable> assertions = new ArrayList<>(); - for (int i = 0; i < matchers.size(); i++) { - final String actual = results.get(0).get(i); - final Matcher<String> expected = matchers.get(i); - assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + try (final FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket())) { + + final List<List<String>> results = getResults(stream); + + final List<Matcher<String>> matchers = asList( + nullValue(String.class), // pk_catalog_name + is("APP"), // pk_schema_name + is("FOREIGNTABLE"), // pk_table_name + is("ID"), // pk_column_name + nullValue(String.class), // fk_catalog_name + is("APP"), // fk_schema_name + is("INTTABLE"), // fk_table_name + is("FOREIGNID"), // fk_column_name + is("1"), // key_sequence + containsString("SQL"), // fk_key_name + containsString("SQL"), // pk_key_name + is("3"), // update_rule + is("3")); // delete_rule + + Assertions.assertEquals(1, results.size()); + final List<Executable> assertions = new ArrayList<>(); + for (int i = 0; i < matchers.size(); i++) { + final String actual = results.get(0).get(i); + final Matcher<String> expected = matchers.get(i); + assertions.add(() -> MatcherAssert.assertThat(actual, expected)); + } + Assertions.assertAll(assertions); } - Assertions.assertAll(assertions); } @Test @@ -878,90 +870,6 @@ public void testCreateStatementResults() throws Exception { } } - List<List<String>> getResults(FlightStream stream) { - final List<List<String>> results = new ArrayList<>(); - while (stream.next()) { - try (final VectorSchemaRoot root = stream.getRoot()) { - final long rowCount = root.getRowCount(); - for (int i = 0; i < rowCount; ++i) { - results.add(new ArrayList<>()); - } - - root.getSchema().getFields().forEach(field -> { - try (final FieldVector fieldVector = root.getVector(field.getName())) { - if (fieldVector instanceof VarCharVector) { - final VarCharVector varcharVector = (VarCharVector) fieldVector; - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - final Text data = varcharVector.getObject(rowIndex); - results.get(rowIndex).add(isNull(data) ? null : data.toString()); - } - } else if (fieldVector instanceof IntVector) { - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - Object data = fieldVector.getObject(rowIndex); - results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); - } - } else if (fieldVector instanceof VarBinaryVector) { - final VarBinaryVector varbinaryVector = (VarBinaryVector) fieldVector; - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - final byte[] data = varbinaryVector.getObject(rowIndex); - final String output; - try { - output = isNull(data) ? - null : - MessageSerializer.deserializeSchema( - new ReadChannel(Channels.newChannel(new ByteArrayInputStream(data)))).toJson(); - } catch (final IOException e) { - throw new RuntimeException("Failed to deserialize schema", e); - } - results.get(rowIndex).add(output); - } - } else if (fieldVector instanceof DenseUnionVector) { - final DenseUnionVector denseUnionVector = (DenseUnionVector) fieldVector; - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - final Object data = denseUnionVector.getObject(rowIndex); - results.get(rowIndex).add(isNull(data) ?
null : Objects.toString(data)); - } - } else if (fieldVector instanceof ListVector) { - for (int i = 0; i < fieldVector.getValueCount(); i++) { - if (!fieldVector.isNull(i)) { - List<Text> elements = (List<Text>) ((ListVector) fieldVector).getObject(i); - List<String> values = new ArrayList<>(); - - for (Text element : elements) { - values.add(element.toString()); - } - results.get(i).add(values.toString()); - } - } - - } else if (fieldVector instanceof UInt4Vector) { - final UInt4Vector uInt4Vector = (UInt4Vector) fieldVector; - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - final Object data = uInt4Vector.getObject(rowIndex); - results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); - } - } else if (fieldVector instanceof UInt1Vector) { - final UInt1Vector uInt1Vector = (UInt1Vector) fieldVector; - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - final Object data = uInt1Vector.getObject(rowIndex); - results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); - } - } else if (fieldVector instanceof BitVector) { - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { - Object data = fieldVector.getObject(rowIndex); - results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); - } - } else { - throw new UnsupportedOperationException("Not yet implemented"); - } - } - }); - } - } - - return results; - } - @Test public void testExecuteUpdate() { Assertions.assertAll( diff --git a/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSqlStreams.java b/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSqlStreams.java new file mode 100644 index 0000000000000..4672e0a141832 --- /dev/null +++ b/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/TestFlightSqlStreams.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.arrow.flight; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static java.util.Collections.singletonList; +import static org.apache.arrow.flight.sql.util.FlightStreamUtils.getResults; +import static org.apache.arrow.util.AutoCloseables.close; +import static org.apache.arrow.vector.types.Types.MinorType.INT; +import static org.hamcrest.CoreMatchers.is; + +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.flight.sql.BasicFlightSqlProducer; +import org.apache.arrow.flight.sql.FlightSqlClient; +import org.apache.arrow.flight.sql.FlightSqlProducer; +import org.apache.arrow.flight.sql.impl.FlightSql; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.Text; +import org.hamcrest.MatcherAssert; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import com.google.common.collect.ImmutableList; +import com.google.protobuf.Any; +import com.google.protobuf.Message; + +public class TestFlightSqlStreams { + + /** + * A limited {@link FlightSqlProducer} for testing GetTables, GetTableTypes, GetSqlInfo, and limited SQL commands. + */ + private static class FlightSqlTestProducer extends BasicFlightSqlProducer { + + // Note that for simplicity the getStream* implementations are blocking, but a proper FlightSqlProducer should + // have non-blocking implementations of getStream*. + + private static final String FIXED_QUERY = "SELECT 1 AS c1 FROM test_table"; + private static final Schema FIXED_SCHEMA = new Schema(asList( + Field.nullable("c1", Types.MinorType.INT.getType()))); + + private BufferAllocator allocator; + + FlightSqlTestProducer(BufferAllocator allocator) { + this.allocator = allocator; + } + + @Override + protected List determineEndpoints(T request, FlightDescriptor flightDescriptor, + Schema schema) { + if (request instanceof FlightSql.CommandGetTables || + request instanceof FlightSql.CommandGetTableTypes || + request instanceof FlightSql.CommandGetXdbcTypeInfo || + request instanceof FlightSql.CommandGetSqlInfo) { + return Collections.singletonList(new FlightEndpoint(new Ticket(Any.pack(request).toByteArray()))); + } else if (request instanceof FlightSql.CommandStatementQuery && + ((FlightSql.CommandStatementQuery) request).getQuery().equals(FIXED_QUERY)) { + + // Tickets from CommandStatementQuery requests should be built using TicketStatementQuery then packed() into + // a ticket. The content of the statement handle is specific to the FlightSqlProducer. It does not need to + // be the query. It can be a query ID for example. 
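// Illustrative sketch (hypothetical handle value, same APIs as used below): a producer
// that keys statements by an opaque ID would build the ticket once and decode it later:
//
//   FlightSql.TicketStatementQuery t = FlightSql.TicketStatementQuery.newBuilder()
//       .setStatementHandle(ByteString.copyFromUtf8("query-id-42"))
//       .build();
//   Ticket ticket = new Ticket(Any.pack(t).toByteArray());
//   // ...and later, in getStreamStatement:
//   // t.getStatementHandle().toStringUtf8()  -> "query-id-42"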
+ FlightSql.TicketStatementQuery ticketStatementQuery = FlightSql.TicketStatementQuery.newBuilder() + .setStatementHandle(((FlightSql.CommandStatementQuery) request).getQueryBytes()) + .build(); + return Collections.singletonList(new FlightEndpoint(new Ticket(Any.pack(ticketStatementQuery).toByteArray()))); + } + throw CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException(); + } + + @Override + public FlightInfo getFlightInfoStatement(FlightSql.CommandStatementQuery command, + CallContext context, FlightDescriptor descriptor) { + return generateFlightInfo(command, descriptor, FIXED_SCHEMA); + } + + @Override + public void getStreamStatement(FlightSql.TicketStatementQuery ticket, + CallContext context, ServerStreamListener listener) { + final String query = ticket.getStatementHandle().toStringUtf8(); + if (!query.equals(FIXED_QUERY)) { + listener.error(CallStatus.UNIMPLEMENTED.withDescription("Not implemented.").toRuntimeException()); + } + + try (VectorSchemaRoot root = VectorSchemaRoot.create(FIXED_SCHEMA, allocator)) { + root.setRowCount(1); + ((IntVector) root.getVector("c1")).setSafe(0, 1); + listener.start(root); + listener.putNext(); + listener.completed(); + } + } + + @Override + public void getStreamSqlInfo(FlightSql.CommandGetSqlInfo command, CallContext context, + ServerStreamListener listener) { + try (VectorSchemaRoot root = VectorSchemaRoot.create(Schemas.GET_SQL_INFO_SCHEMA, allocator)) { + root.setRowCount(0); + listener.start(root); + listener.putNext(); + listener.completed(); + } + } + + @Override + public void getStreamTypeInfo(FlightSql.CommandGetXdbcTypeInfo request, + CallContext context, ServerStreamListener listener) { + try (VectorSchemaRoot root = VectorSchemaRoot.create(Schemas.GET_TYPE_INFO_SCHEMA, allocator)) { + root.setRowCount(1); + ((VarCharVector) root.getVector("type_name")).setSafe(0, new Text("Integer")); + ((IntVector) root.getVector("data_type")).setSafe(0, INT.ordinal()); + ((IntVector) root.getVector("column_size")).setSafe(0, 400); + root.getVector("literal_prefix").setNull(0); + root.getVector("literal_suffix").setNull(0); + root.getVector("create_params").setNull(0); + ((IntVector) root.getVector("nullable")).setSafe(0, FlightSql.Nullable.NULLABILITY_NULLABLE.getNumber()); + ((BitVector) root.getVector("case_sensitive")).setSafe(0, 1); + ((IntVector) root.getVector("nullable")).setSafe(0, FlightSql.Searchable.SEARCHABLE_FULL.getNumber()); + ((BitVector) root.getVector("unsigned_attribute")).setSafe(0, 1); + root.getVector("fixed_prec_scale").setNull(0); + ((BitVector) root.getVector("auto_increment")).setSafe(0, 1); + ((VarCharVector) root.getVector("local_type_name")).setSafe(0, new Text("Integer")); + root.getVector("minimum_scale").setNull(0); + root.getVector("maximum_scale").setNull(0); + ((IntVector) root.getVector("sql_data_type")).setSafe(0, INT.ordinal()); + root.getVector("datetime_subcode").setNull(0); + ((IntVector) root.getVector("num_prec_radix")).setSafe(0, 10); + root.getVector("interval_precision").setNull(0); + + listener.start(root); + listener.putNext(); + listener.completed(); + } + } + + @Override + public void getStreamTables(FlightSql.CommandGetTables command, CallContext context, + ServerStreamListener listener) { + try (VectorSchemaRoot root = VectorSchemaRoot.create(Schemas.GET_TABLES_SCHEMA_NO_SCHEMA, allocator)) { + root.setRowCount(1); + root.getVector("catalog_name").setNull(0); + root.getVector("db_schema_name").setNull(0); + ((VarCharVector) root.getVector("table_name")).setSafe(0, new 
Text("test_table")); + ((VarCharVector) root.getVector("table_type")).setSafe(0, new Text("TABLE")); + + listener.start(root); + listener.putNext(); + listener.completed(); + } + } + + @Override + public void getStreamTableTypes(CallContext context, ServerStreamListener listener) { + try (VectorSchemaRoot root = VectorSchemaRoot.create(Schemas.GET_TABLE_TYPES_SCHEMA, allocator)) { + root.setRowCount(1); + ((VarCharVector) root.getVector("table_type")).setSafe(0, new Text("TABLE")); + + listener.start(root); + listener.putNext(); + listener.completed(); + } + } + } + + private static BufferAllocator allocator; + + private static FlightServer server; + private static FlightSqlClient sqlClient; + + @BeforeAll + public static void setUp() throws Exception { + allocator = new RootAllocator(Integer.MAX_VALUE); + + final Location serverLocation = Location.forGrpcInsecure("localhost", 0); + server = FlightServer.builder(allocator, serverLocation, new FlightSqlTestProducer(allocator)) + .build() + .start(); + + final Location clientLocation = Location.forGrpcInsecure("localhost", server.getPort()); + sqlClient = new FlightSqlClient(FlightClient.builder(allocator, clientLocation).build()); + } + + @AfterAll + public static void tearDown() throws Exception { + close(sqlClient, server, allocator); + } + + @Test + public void testGetTablesResultNoSchema() throws Exception { + try (final FlightStream stream = + sqlClient.getStream( + sqlClient.getTables(null, null, null, null, false) + .getEndpoints().get(0).getTicket())) { + Assertions.assertAll( + () -> MatcherAssert.assertThat(stream.getSchema(), is(FlightSqlProducer.Schemas.GET_TABLES_SCHEMA_NO_SCHEMA)), + () -> { + final List> results = getResults(stream); + final List> expectedResults = ImmutableList.of( + // catalog_name | schema_name | table_name | table_type | table_schema + asList(null, null, "test_table", "TABLE")); + MatcherAssert.assertThat(results, is(expectedResults)); + } + ); + } + } + + @Test + public void testGetTableTypesResult() throws Exception { + try (final FlightStream stream = + sqlClient.getStream(sqlClient.getTableTypes().getEndpoints().get(0).getTicket())) { + Assertions.assertAll( + () -> MatcherAssert.assertThat(stream.getSchema(), is(FlightSqlProducer.Schemas.GET_TABLE_TYPES_SCHEMA)), + () -> { + final List> tableTypes = getResults(stream); + final List> expectedTableTypes = ImmutableList.of( + // table_type + singletonList("TABLE") + ); + MatcherAssert.assertThat(tableTypes, is(expectedTableTypes)); + } + ); + } + } + + @Test + public void testGetSqlInfoResults() throws Exception { + final FlightInfo info = sqlClient.getSqlInfo(); + try (final FlightStream stream = sqlClient.getStream(info.getEndpoints().get(0).getTicket())) { + Assertions.assertAll( + () -> MatcherAssert.assertThat(stream.getSchema(), is(FlightSqlProducer.Schemas.GET_SQL_INFO_SCHEMA)), + () -> MatcherAssert.assertThat(getResults(stream), is(emptyList())) + ); + } + } + + @Test + public void testGetTypeInfo() throws Exception { + FlightInfo flightInfo = sqlClient.getXdbcTypeInfo(); + + try (FlightStream stream = sqlClient.getStream(flightInfo.getEndpoints().get(0).getTicket())) { + + final List> results = getResults(stream); + + final List> matchers = ImmutableList.of( + asList("Integer", "4", "400", null, null, "3", "true", null, "true", null, "true", + "Integer", null, null, "4", null, "10", null)); + + MatcherAssert.assertThat(results, is(matchers)); + } + } + + @Test + public void testExecuteQuery() throws Exception { + try (final FlightStream stream 
+ + @Test + public void testExecuteQuery() throws Exception { + try (final FlightStream stream = sqlClient + .getStream(sqlClient.execute(FlightSqlTestProducer.FIXED_QUERY).getEndpoints().get(0).getTicket())) { + Assertions.assertAll( + () -> MatcherAssert.assertThat(stream.getSchema(), is(FlightSqlTestProducer.FIXED_SCHEMA)), + () -> MatcherAssert.assertThat(getResults(stream), is(singletonList(singletonList("1")))) + ); + } + } +} diff --git a/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/sql/util/FlightStreamUtils.java b/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/sql/util/FlightStreamUtils.java new file mode 100644 index 0000000000000..fbbe9ef01816e --- /dev/null +++ b/java/flight/flight-sql/src/test/java/org/apache/arrow/flight/sql/util/FlightStreamUtils.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.sql.util; + +import static java.util.Objects.isNull; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.util.Text; + +public class FlightStreamUtils { + + public static List<List<String>> getResults(FlightStream stream) { + final List<List<String>> results = new ArrayList<>(); + while (stream.next()) { + try (final VectorSchemaRoot root = stream.getRoot()) { + final long rowCount = root.getRowCount(); + for (int i = 0; i < rowCount; ++i) { + results.add(new ArrayList<>()); + } + + root.getSchema().getFields().forEach(field -> { + try (final FieldVector fieldVector = root.getVector(field.getName())) { + if (fieldVector instanceof VarCharVector) { + final VarCharVector varcharVector = (VarCharVector) fieldVector; + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + final Text data = varcharVector.getObject(rowIndex); + results.get(rowIndex).add(isNull(data) ? null : data.toString()); + } + } else if (fieldVector instanceof IntVector) { + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + Object data = fieldVector.getObject(rowIndex); + results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); + } + } else if (fieldVector instanceof VarBinaryVector) {
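// The VARBINARY branch below assumes the bytes are an IPC-serialized Arrow Schema (as in
// the table_schema column returned by CommandGetTables) and renders it as JSON so that
// results can be compared as plain strings.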
+ final VarBinaryVector varbinaryVector = (VarBinaryVector) fieldVector; + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + final byte[] data = varbinaryVector.getObject(rowIndex); + final String output; + try { + output = isNull(data) ? + null : + MessageSerializer.deserializeSchema( + new ReadChannel(Channels.newChannel(new ByteArrayInputStream(data)))).toJson(); + } catch (final IOException e) { + throw new RuntimeException("Failed to deserialize schema", e); + } + results.get(rowIndex).add(output); + } + } else if (fieldVector instanceof DenseUnionVector) { + final DenseUnionVector denseUnionVector = (DenseUnionVector) fieldVector; + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + final Object data = denseUnionVector.getObject(rowIndex); + results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); + } + } else if (fieldVector instanceof ListVector) { + for (int i = 0; i < fieldVector.getValueCount(); i++) { + if (!fieldVector.isNull(i)) { + List<Text> elements = (List<Text>) ((ListVector) fieldVector).getObject(i); + List<String> values = new ArrayList<>(); + + for (Text element : elements) { + values.add(element.toString()); + } + results.get(i).add(values.toString()); + } + } + + } else if (fieldVector instanceof UInt4Vector) { + final UInt4Vector uInt4Vector = (UInt4Vector) fieldVector; + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + final Object data = uInt4Vector.getObject(rowIndex); + results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); + } + } else if (fieldVector instanceof UInt1Vector) { + final UInt1Vector uInt1Vector = (UInt1Vector) fieldVector; + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + final Object data = uInt1Vector.getObject(rowIndex); + results.get(rowIndex).add(isNull(data) ? null : Objects.toString(data)); + } + } else if (fieldVector instanceof BitVector) { + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + Object data = fieldVector.getObject(rowIndex); + results.get(rowIndex).add(isNull(data) ?
null : Objects.toString(data)); + } + } else { + throw new UnsupportedOperationException("Not yet implemented"); + } + } + }); + } + } + + return results; + } +} diff --git a/java/gandiva/CMakeLists.txt b/java/gandiva/CMakeLists.txt index 629ab2fb347d8..2aa8d92959e42 100644 --- a/java/gandiva/CMakeLists.txt +++ b/java/gandiva/CMakeLists.txt @@ -29,21 +29,21 @@ add_jar(arrow_java_jni_gandiva_jar arrow_java_jni_gandiva_headers) set(GANDIVA_PROTO_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) -set(GANDIVA_PROTO_OUTPUT_FILES "${GANDIVA_PROTO_OUTPUT_DIR}/Types.pb.cc" - "${GANDIVA_PROTO_OUTPUT_DIR}/Types.pb.h") +set(GANDIVA_PROTO_OUTPUT_FILES "${GANDIVA_PROTO_OUTPUT_DIR}/gandiva/types.pb.cc" + "${GANDIVA_PROTO_OUTPUT_DIR}/gandiva/types.pb.h") set_source_files_properties(${GANDIVA_PROTO_OUTPUT_FILES} PROPERTIES GENERATED TRUE) set(GANDIVA_PROTO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/proto) -get_filename_component(GANDIVA_PROTO_FILE_ABSOLUTE ${GANDIVA_PROTO_DIR}/Types.proto - ABSOLUTE) +get_filename_component(GANDIVA_PROTO_FILE_ABSOLUTE + ${GANDIVA_PROTO_DIR}/gandiva/types.proto ABSOLUTE) find_package(Protobuf REQUIRED) add_custom_command(OUTPUT ${GANDIVA_PROTO_OUTPUT_FILES} COMMAND protobuf::protoc --proto_path ${GANDIVA_PROTO_DIR} --cpp_out ${GANDIVA_PROTO_OUTPUT_DIR} ${GANDIVA_PROTO_FILE_ABSOLUTE} DEPENDS ${GANDIVA_PROTO_FILE_ABSOLUTE} - COMMENT "Running Protobuf compiler on Types.proto" + COMMENT "Running Protobuf compiler on gandiva/types.proto" VERBATIM) add_custom_target(garrow_java_jni_gandiva_proto ALL DEPENDS ${GANDIVA_PROTO_OUTPUT_FILES}) diff --git a/java/gandiva/proto/Types.proto b/java/gandiva/proto/gandiva/types.proto similarity index 99% rename from java/gandiva/proto/Types.proto rename to java/gandiva/proto/gandiva/types.proto index eb0d996b92e63..4ce342681d614 100644 --- a/java/gandiva/proto/Types.proto +++ b/java/gandiva/proto/gandiva/types.proto @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -syntax = "proto2"; -package types; +syntax = "proto3"; +package gandiva.types; option java_package = "org.apache.arrow.gandiva.ipc"; option java_outer_classname = "GandivaTypes"; diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index 6765df3b9727f..66b97c8b9ef44 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -20,121 +20,120 @@ #include #include #include - -#include "Types.pb.h" -#include "org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper.h" +#include +#include using gandiva::DataTypePtr; using gandiva::ExpressionRegistry; -types::TimeUnit MapTimeUnit(arrow::TimeUnit::type& unit) { +gandiva::types::TimeUnit MapTimeUnit(arrow::TimeUnit::type& unit) { switch (unit) { case arrow::TimeUnit::MILLI: - return types::TimeUnit::MILLISEC; + return gandiva::types::TimeUnit::MILLISEC; case arrow::TimeUnit::SECOND: - return types::TimeUnit::SEC; + return gandiva::types::TimeUnit::SEC; case arrow::TimeUnit::MICRO: - return types::TimeUnit::MICROSEC; + return gandiva::types::TimeUnit::MICROSEC; case arrow::TimeUnit::NANO: - return types::TimeUnit::NANOSEC; + return gandiva::types::TimeUnit::NANOSEC; } // satisfy gcc. should be unreachable. 
- return types::TimeUnit::SEC; + return gandiva::types::TimeUnit::SEC; } -void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) { +void ArrowToProtobuf(DataTypePtr type, gandiva::types::ExtGandivaType* gandiva_data_type) { switch (type->id()) { case arrow::Type::BOOL: - gandiva_data_type->set_type(types::GandivaType::BOOL); + gandiva_data_type->set_type(gandiva::types::GandivaType::BOOL); break; case arrow::Type::UINT8: - gandiva_data_type->set_type(types::GandivaType::UINT8); + gandiva_data_type->set_type(gandiva::types::GandivaType::UINT8); break; case arrow::Type::INT8: - gandiva_data_type->set_type(types::GandivaType::INT8); + gandiva_data_type->set_type(gandiva::types::GandivaType::INT8); break; case arrow::Type::UINT16: - gandiva_data_type->set_type(types::GandivaType::UINT16); + gandiva_data_type->set_type(gandiva::types::GandivaType::UINT16); break; case arrow::Type::INT16: - gandiva_data_type->set_type(types::GandivaType::INT16); + gandiva_data_type->set_type(gandiva::types::GandivaType::INT16); break; case arrow::Type::UINT32: - gandiva_data_type->set_type(types::GandivaType::UINT32); + gandiva_data_type->set_type(gandiva::types::GandivaType::UINT32); break; case arrow::Type::INT32: - gandiva_data_type->set_type(types::GandivaType::INT32); + gandiva_data_type->set_type(gandiva::types::GandivaType::INT32); break; case arrow::Type::UINT64: - gandiva_data_type->set_type(types::GandivaType::UINT64); + gandiva_data_type->set_type(gandiva::types::GandivaType::UINT64); break; case arrow::Type::INT64: - gandiva_data_type->set_type(types::GandivaType::INT64); + gandiva_data_type->set_type(gandiva::types::GandivaType::INT64); break; case arrow::Type::HALF_FLOAT: - gandiva_data_type->set_type(types::GandivaType::HALF_FLOAT); + gandiva_data_type->set_type(gandiva::types::GandivaType::HALF_FLOAT); break; case arrow::Type::FLOAT: - gandiva_data_type->set_type(types::GandivaType::FLOAT); + gandiva_data_type->set_type(gandiva::types::GandivaType::FLOAT); break; case arrow::Type::DOUBLE: - gandiva_data_type->set_type(types::GandivaType::DOUBLE); + gandiva_data_type->set_type(gandiva::types::GandivaType::DOUBLE); break; case arrow::Type::STRING: - gandiva_data_type->set_type(types::GandivaType::UTF8); + gandiva_data_type->set_type(gandiva::types::GandivaType::UTF8); break; case arrow::Type::BINARY: - gandiva_data_type->set_type(types::GandivaType::BINARY); + gandiva_data_type->set_type(gandiva::types::GandivaType::BINARY); break; case arrow::Type::DATE32: - gandiva_data_type->set_type(types::GandivaType::DATE32); + gandiva_data_type->set_type(gandiva::types::GandivaType::DATE32); break; case arrow::Type::DATE64: - gandiva_data_type->set_type(types::GandivaType::DATE64); + gandiva_data_type->set_type(gandiva::types::GandivaType::DATE64); break; case arrow::Type::TIMESTAMP: { - gandiva_data_type->set_type(types::GandivaType::TIMESTAMP); + gandiva_data_type->set_type(gandiva::types::GandivaType::TIMESTAMP); std::shared_ptr cast_time_stamp_type = std::dynamic_pointer_cast(type); arrow::TimeUnit::type unit = cast_time_stamp_type->unit(); - types::TimeUnit time_unit = MapTimeUnit(unit); + gandiva::types::TimeUnit time_unit = MapTimeUnit(unit); gandiva_data_type->set_timeunit(time_unit); break; } case arrow::Type::TIME32: { - gandiva_data_type->set_type(types::GandivaType::TIME32); + gandiva_data_type->set_type(gandiva::types::GandivaType::TIME32); std::shared_ptr cast_time_32_type = std::dynamic_pointer_cast(type); arrow::TimeUnit::type unit = cast_time_32_type->unit(); - 
types::TimeUnit time_unit = MapTimeUnit(unit); + gandiva::types::TimeUnit time_unit = MapTimeUnit(unit); gandiva_data_type->set_timeunit(time_unit); break; } case arrow::Type::TIME64: { - gandiva_data_type->set_type(types::GandivaType::TIME32); + gandiva_data_type->set_type(gandiva::types::GandivaType::TIME32); std::shared_ptr cast_time_64_type = std::dynamic_pointer_cast(type); arrow::TimeUnit::type unit = cast_time_64_type->unit(); - types::TimeUnit time_unit = MapTimeUnit(unit); + gandiva::types::TimeUnit time_unit = MapTimeUnit(unit); gandiva_data_type->set_timeunit(time_unit); break; } case arrow::Type::NA: - gandiva_data_type->set_type(types::GandivaType::NONE); + gandiva_data_type->set_type(gandiva::types::GandivaType::NONE); break; case arrow::Type::DECIMAL: { - gandiva_data_type->set_type(types::GandivaType::DECIMAL); + gandiva_data_type->set_type(gandiva::types::GandivaType::DECIMAL); gandiva_data_type->set_precision(0); gandiva_data_type->set_scale(0); break; } case arrow::Type::INTERVAL_MONTHS: - gandiva_data_type->set_type(types::GandivaType::INTERVAL); - gandiva_data_type->set_intervaltype(types::IntervalType::YEAR_MONTH); + gandiva_data_type->set_type(gandiva::types::GandivaType::INTERVAL); + gandiva_data_type->set_intervaltype(gandiva::types::IntervalType::YEAR_MONTH); break; case arrow::Type::INTERVAL_DAY_TIME: - gandiva_data_type->set_type(types::GandivaType::INTERVAL); - gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME); + gandiva_data_type->set_type(gandiva::types::GandivaType::INTERVAL); + gandiva_data_type->set_intervaltype(gandiva::types::IntervalType::DAY_TIME); break; default: // un-supported types. test ensures that @@ -146,10 +145,10 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) JNIEXPORT jbyteArray JNICALL Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedDataTypes( // NOLINT JNIEnv* env, jobject types_helper) { - types::GandivaDataTypes gandiva_data_types; + gandiva::types::GandivaDataTypes gandiva_data_types; auto supported_types = ExpressionRegistry::supported_types(); for (auto const& type : supported_types) { - types::ExtGandivaType* gandiva_data_type = gandiva_data_types.add_datatype(); + gandiva::types::ExtGandivaType* gandiva_data_type = gandiva_data_types.add_datatype(); ArrowToProtobuf(type, gandiva_data_type); } auto size = static_cast(gandiva_data_types.ByteSizeLong()); @@ -169,15 +168,15 @@ JNIEXPORT jbyteArray JNICALL Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedFunctions( // NOLINT JNIEnv* env, jobject types_helper) { ExpressionRegistry expr_registry; - types::GandivaFunctions gandiva_functions; + gandiva::types::GandivaFunctions gandiva_functions; for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { - types::FunctionSignature* function_signature = gandiva_functions.add_function(); + gandiva::types::FunctionSignature* function_signature = gandiva_functions.add_function(); function_signature->set_name((*function).base_name()); - types::ExtGandivaType* return_type = function_signature->mutable_returntype(); + gandiva::types::ExtGandivaType* return_type = function_signature->mutable_returntype(); ArrowToProtobuf((*function).ret_type(), return_type); for (auto& param_type : (*function).param_types()) { - types::ExtGandivaType* proto_param_type = function_signature->add_paramtypes(); + gandiva::types::ExtGandivaType* proto_param_type = 
function_signature->add_paramtypes(); ArrowToProtobuf(param_type, proto_param_type); } } diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 43db266ff56f5..a5dff9981ce89 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -35,13 +35,13 @@ #include #include #include +#include +#include -#include "Types.pb.h" #include "config_holder.h" #include "env_helper.h" #include "id_to_module_map.h" #include "module_holder.h" -#include "org_apache_arrow_gandiva_evaluator_JniWrapper.h" using gandiva::ConditionPtr; using gandiva::DataTypePtr; @@ -65,7 +65,7 @@ using gandiva::FilterHolder; using gandiva::ProjectorHolder; // forward declarations -NodePtr ProtoTypeToNode(const types::TreeNode& node); +NodePtr ProtoTypeToNode(const gandiva::types::TreeNode& node); static jint JNI_VERSION = JNI_VERSION_1_6; @@ -131,11 +131,11 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { env->DeleteGlobalRef(vector_expander_ret_class_); } -DataTypePtr ProtoTypeToTime32(const types::ExtGandivaType& ext_type) { +DataTypePtr ProtoTypeToTime32(const gandiva::types::ExtGandivaType& ext_type) { switch (ext_type.timeunit()) { - case types::SEC: + case gandiva::types::SEC: return arrow::time32(arrow::TimeUnit::SECOND); - case types::MILLISEC: + case gandiva::types::MILLISEC: return arrow::time32(arrow::TimeUnit::MILLI); default: std::cerr << "Unknown time unit: " << ext_type.timeunit() << " for time32\n"; @@ -143,11 +143,11 @@ DataTypePtr ProtoTypeToTime32(const types::ExtGandivaType& ext_type) { } } -DataTypePtr ProtoTypeToTime64(const types::ExtGandivaType& ext_type) { +DataTypePtr ProtoTypeToTime64(const gandiva::types::ExtGandivaType& ext_type) { switch (ext_type.timeunit()) { - case types::MICROSEC: + case gandiva::types::MICROSEC: return arrow::time64(arrow::TimeUnit::MICRO); - case types::NANOSEC: + case gandiva::types::NANOSEC: return arrow::time64(arrow::TimeUnit::NANO); default: std::cerr << "Unknown time unit: " << ext_type.timeunit() << " for time64\n"; @@ -155,15 +155,15 @@ DataTypePtr ProtoTypeToTime64(const types::ExtGandivaType& ext_type) { } } -DataTypePtr ProtoTypeToTimestamp(const types::ExtGandivaType& ext_type) { +DataTypePtr ProtoTypeToTimestamp(const gandiva::types::ExtGandivaType& ext_type) { switch (ext_type.timeunit()) { - case types::SEC: + case gandiva::types::SEC: return arrow::timestamp(arrow::TimeUnit::SECOND); - case types::MILLISEC: + case gandiva::types::MILLISEC: return arrow::timestamp(arrow::TimeUnit::MILLI); - case types::MICROSEC: + case gandiva::types::MICROSEC: return arrow::timestamp(arrow::TimeUnit::MICRO); - case types::NANOSEC: + case gandiva::types::NANOSEC: return arrow::timestamp(arrow::TimeUnit::NANO); default: std::cerr << "Unknown time unit: " << ext_type.timeunit() << " for timestamp\n"; @@ -171,11 +171,11 @@ DataTypePtr ProtoTypeToTimestamp(const types::ExtGandivaType& ext_type) { } } -DataTypePtr ProtoTypeToInterval(const types::ExtGandivaType& ext_type) { +DataTypePtr ProtoTypeToInterval(const gandiva::types::ExtGandivaType& ext_type) { switch (ext_type.intervaltype()) { - case types::YEAR_MONTH: + case gandiva::types::YEAR_MONTH: return arrow::month_interval(); - case types::DAY_TIME: + case gandiva::types::DAY_TIME: return arrow::day_time_interval(); default: std::cerr << "Unknown interval type: " << ext_type.intervaltype() << "\n"; @@ -183,59 +183,59 @@ DataTypePtr ProtoTypeToInterval(const types::ExtGandivaType& ext_type) { } } -DataTypePtr ProtoTypeToDataType(const 
types::ExtGandivaType& ext_type) { +DataTypePtr ProtoTypeToDataType(const gandiva::types::ExtGandivaType& ext_type) { switch (ext_type.type()) { - case types::NONE: + case gandiva::types::NONE: return arrow::null(); - case types::BOOL: + case gandiva::types::BOOL: return arrow::boolean(); - case types::UINT8: + case gandiva::types::UINT8: return arrow::uint8(); - case types::INT8: + case gandiva::types::INT8: return arrow::int8(); - case types::UINT16: + case gandiva::types::UINT16: return arrow::uint16(); - case types::INT16: + case gandiva::types::INT16: return arrow::int16(); - case types::UINT32: + case gandiva::types::UINT32: return arrow::uint32(); - case types::INT32: + case gandiva::types::INT32: return arrow::int32(); - case types::UINT64: + case gandiva::types::UINT64: return arrow::uint64(); - case types::INT64: + case gandiva::types::INT64: return arrow::int64(); - case types::HALF_FLOAT: + case gandiva::types::HALF_FLOAT: return arrow::float16(); - case types::FLOAT: + case gandiva::types::FLOAT: return arrow::float32(); - case types::DOUBLE: + case gandiva::types::DOUBLE: return arrow::float64(); - case types::UTF8: + case gandiva::types::UTF8: return arrow::utf8(); - case types::BINARY: + case gandiva::types::BINARY: return arrow::binary(); - case types::DATE32: + case gandiva::types::DATE32: return arrow::date32(); - case types::DATE64: + case gandiva::types::DATE64: return arrow::date64(); - case types::DECIMAL: + case gandiva::types::DECIMAL: // TODO: error handling return arrow::decimal(ext_type.precision(), ext_type.scale()); - case types::TIME32: + case gandiva::types::TIME32: return ProtoTypeToTime32(ext_type); - case types::TIME64: + case gandiva::types::TIME64: return ProtoTypeToTime64(ext_type); - case types::TIMESTAMP: + case gandiva::types::TIMESTAMP: return ProtoTypeToTimestamp(ext_type); - case types::INTERVAL: + case gandiva::types::INTERVAL: return ProtoTypeToInterval(ext_type); - case types::FIXED_SIZE_BINARY: - case types::LIST: - case types::STRUCT: - case types::UNION: - case types::DICTIONARY: - case types::MAP: + case gandiva::types::FIXED_SIZE_BINARY: + case gandiva::types::LIST: + case gandiva::types::STRUCT: + case gandiva::types::UNION: + case gandiva::types::DICTIONARY: + case gandiva::types::MAP: std::cerr << "Unhandled data type: " << ext_type.type() << "\n"; return nullptr; @@ -245,7 +245,7 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { } } -FieldPtr ProtoTypeToField(const types::Field& f) { +FieldPtr ProtoTypeToField(const gandiva::types::Field& f) { const std::string& name = f.name(); DataTypePtr type = ProtoTypeToDataType(f.type()); bool nullable = true; @@ -256,7 +256,7 @@ FieldPtr ProtoTypeToField(const types::Field& f) { return field(name, type, nullable); } -NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) { +NodePtr ProtoTypeToFieldNode(const gandiva::types::FieldNode& node) { FieldPtr field_ptr = ProtoTypeToField(node.field()); if (field_ptr == nullptr) { std::cerr << "Unable to create field node from protobuf\n"; @@ -266,12 +266,12 @@ NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) { return TreeExprBuilder::MakeField(field_ptr); } -NodePtr ProtoTypeToFnNode(const types::FunctionNode& node) { +NodePtr ProtoTypeToFnNode(const gandiva::types::FunctionNode& node) { const std::string& name = node.functionname(); NodeVector children; for (int i = 0; i < node.inargs_size(); i++) { - const types::TreeNode& arg = node.inargs(i); + const gandiva::types::TreeNode& arg = node.inargs(i); NodePtr n = 
ProtoTypeToNode(arg); if (n == nullptr) { @@ -291,7 +291,7 @@ NodePtr ProtoTypeToFnNode(const types::FunctionNode& node) { return TreeExprBuilder::MakeFunction(name, children, return_type); } -NodePtr ProtoTypeToIfNode(const types::IfNode& node) { +NodePtr ProtoTypeToIfNode(const gandiva::types::IfNode& node) { NodePtr cond = ProtoTypeToNode(node.cond()); if (cond == nullptr) { std::cerr << "Unable to create cond node for if node\n"; @@ -319,11 +319,11 @@ NodePtr ProtoTypeToIfNode(const types::IfNode& node) { return TreeExprBuilder::MakeIf(cond, then_node, else_node, return_type); } -NodePtr ProtoTypeToAndNode(const types::AndNode& node) { +NodePtr ProtoTypeToAndNode(const gandiva::types::AndNode& node) { NodeVector children; for (int i = 0; i < node.args_size(); i++) { - const types::TreeNode& arg = node.args(i); + const gandiva::types::TreeNode& arg = node.args(i); NodePtr n = ProtoTypeToNode(arg); if (n == nullptr) { @@ -335,11 +335,11 @@ NodePtr ProtoTypeToAndNode(const types::AndNode& node) { return TreeExprBuilder::MakeAnd(children); } -NodePtr ProtoTypeToOrNode(const types::OrNode& node) { +NodePtr ProtoTypeToOrNode(const gandiva::types::OrNode& node) { NodeVector children; for (int i = 0; i < node.args_size(); i++) { - const types::TreeNode& arg = node.args(i); + const gandiva::types::TreeNode& arg = node.args(i); NodePtr n = ProtoTypeToNode(arg); if (n == nullptr) { @@ -351,7 +351,7 @@ NodePtr ProtoTypeToOrNode(const types::OrNode& node) { return TreeExprBuilder::MakeOr(children); } -NodePtr ProtoTypeToInNode(const types::InNode& node) { +NodePtr ProtoTypeToInNode(const gandiva::types::InNode& node) { NodePtr field = ProtoTypeToNode(node.node()); if (node.has_intvalues()) { @@ -417,7 +417,7 @@ NodePtr ProtoTypeToInNode(const types::InNode& node) { return nullptr; } -NodePtr ProtoTypeToNullNode(const types::NullNode& node) { +NodePtr ProtoTypeToNullNode(const gandiva::types::NullNode& node) { DataTypePtr data_type = ProtoTypeToDataType(node.type()); if (data_type == nullptr) { std::cerr << "Unknown type " << data_type->ToString() << " for null node\n"; @@ -427,7 +427,7 @@ NodePtr ProtoTypeToNullNode(const types::NullNode& node) { return TreeExprBuilder::MakeNull(data_type); } -NodePtr ProtoTypeToNode(const types::TreeNode& node) { +NodePtr ProtoTypeToNode(const gandiva::types::TreeNode& node) { if (node.has_fieldnode()) { return ProtoTypeToFieldNode(node.fieldnode()); } @@ -494,7 +494,7 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { return nullptr; } -ExpressionPtr ProtoTypeToExpression(const types::ExpressionRoot& root) { +ExpressionPtr ProtoTypeToExpression(const gandiva::types::ExpressionRoot& root) { NodePtr root_node = ProtoTypeToNode(root.root()); if (root_node == nullptr) { std::cerr << "Unable to create expression node from expression protobuf\n"; @@ -510,7 +510,7 @@ ExpressionPtr ProtoTypeToExpression(const types::ExpressionRoot& root) { return TreeExprBuilder::MakeExpression(root_node, field); } -ConditionPtr ProtoTypeToCondition(const types::Condition& condition) { +ConditionPtr ProtoTypeToCondition(const gandiva::types::Condition& condition) { NodePtr root_node = ProtoTypeToNode(condition.root()); if (root_node == nullptr) { return nullptr; @@ -519,7 +519,7 @@ ConditionPtr ProtoTypeToCondition(const types::Condition& condition) { return TreeExprBuilder::MakeCondition(root_node); } -SchemaPtr ProtoTypeToSchema(const types::Schema& schema) { +SchemaPtr ProtoTypeToSchema(const gandiva::types::Schema& schema) { std::vector fields; for (int i = 0; i < 
schema.columns_size(); i++) { @@ -608,11 +608,11 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build std::shared_ptr projector; std::shared_ptr holder; - types::Schema schema; + gandiva::types::Schema schema; jsize schema_len = env->GetArrayLength(schema_arr); jbyte* schema_bytes = env->GetByteArrayElements(schema_arr, 0); - types::ExpressionList exprs; + gandiva::types::ExpressionList exprs; jsize exprs_len = env->GetArrayLength(exprs_arr); jbyte* exprs_bytes = env->GetByteArrayElements(exprs_arr, 0); @@ -643,7 +643,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build goto err_out; } - // convert types::Schema to arrow::Schema + // convert gandiva::types::Schema to arrow::Schema schema_ptr = ProtoTypeToSchema(schema); if (schema_ptr == nullptr) { ss << "Unable to construct arrow schema object from schema protobuf\n"; @@ -666,13 +666,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build } switch (selection_vector_type) { - case types::SV_NONE: + case gandiva::types::SV_NONE: mode = gandiva::SelectionVector::MODE_NONE; break; - case types::SV_INT16: + case gandiva::types::SV_INT16: mode = gandiva::SelectionVector::MODE_UINT16; break; - case types::SV_INT32: + case gandiva::types::SV_INT32: mode = gandiva::SelectionVector::MODE_UINT32; break; } @@ -809,17 +809,17 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( reinterpret_cast(sel_vec_addr), sel_vec_size); int output_row_count = 0; switch (sel_vec_type) { - case types::SV_NONE: { + case gandiva::types::SV_NONE: { output_row_count = num_rows; break; } - case types::SV_INT16: { + case gandiva::types::SV_INT16: { status = gandiva::SelectionVector::MakeImmutableInt16( sel_vec_rows, selection_buffer, &selection_vector); output_row_count = sel_vec_rows; break; } - case types::SV_INT32: { + case gandiva::types::SV_INT32: { status = gandiva::SelectionVector::MakeImmutableInt32( sel_vec_rows, selection_buffer, &selection_vector); output_row_count = sel_vec_rows; @@ -909,11 +909,11 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build std::shared_ptr filter; std::shared_ptr holder; - types::Schema schema; + gandiva::types::Schema schema; jsize schema_len = env->GetArrayLength(schema_arr); jbyte* schema_bytes = env->GetByteArrayElements(schema_arr, 0); - types::Condition condition; + gandiva::types::Condition condition; jsize condition_len = env->GetArrayLength(condition_arr); jbyte* condition_bytes = env->GetByteArrayElements(condition_arr, 0); @@ -943,7 +943,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build goto err_out; } - // convert types::Schema to arrow::Schema + // convert gandiva::types::Schema to arrow::Schema schema_ptr = ProtoTypeToSchema(schema); if (schema_ptr == nullptr) { ss << "Unable to construct arrow schema object from schema protobuf\n"; @@ -1008,15 +1008,15 @@ JNIEXPORT jint JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evalua } auto selection_vector_type = - static_cast(jselection_vector_type); + static_cast(jselection_vector_type); auto out_buffer = std::make_shared( reinterpret_cast(out_buf_addr), out_buf_size); switch (selection_vector_type) { - case types::SV_INT16: + case gandiva::types::SV_INT16: status = gandiva::SelectionVector::MakeInt16(num_rows, out_buffer, &selection_vector); break; - case types::SV_INT32: + case gandiva::types::SV_INT32: status = gandiva::SelectionVector::MakeInt32(num_rows, out_buffer, 
&selection_vector); break; diff --git a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java index 5e6580b6131c1..bb4ee45eaa073 100644 --- a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java @@ -142,6 +142,16 @@ public void write(${name}Holder holder) { } + <#if minor.class?ends_with("VarChar")> + public void write${minor.class}(${friendlyType} value) { + fail("${name}"); + } + + public void write${minor.class}(String value) { + fail("${name}"); + } + + public void writeNull() { diff --git a/java/vector/src/main/codegen/templates/ComplexWriters.java b/java/vector/src/main/codegen/templates/ComplexWriters.java index 4ae4c4f75f208..51a52a6e3070d 100644 --- a/java/vector/src/main/codegen/templates/ComplexWriters.java +++ b/java/vector/src/main/codegen/templates/ComplexWriters.java @@ -44,7 +44,11 @@ public class ${eName}WriterImpl extends AbstractFieldWriter { final ${name}Vector vector; - public ${eName}WriterImpl(${name}Vector vector) { +<#if minor.class?ends_with("VarChar")> + private final Text textBuffer = new Text(); + + +public ${eName}WriterImpl(${name}Vector vector) { this.vector = vector; } @@ -120,11 +124,19 @@ public void write(Nullable${minor.class}Holder h) { } - <#if minor.class == "VarChar"> + <#if minor.class?ends_with("VarChar")> + @Override public void write${minor.class}(${friendlyType} value) { vector.setSafe(idx(), value); vector.setValueCount(idx()+1); } + + @Override + public void write${minor.class}(String value) { + textBuffer.set(value); + vector.setSafe(idx(), textBuffer); + vector.setValueCount(idx()+1); + } <#if minor.class?starts_with("Decimal")> @@ -256,6 +268,11 @@ public interface ${eName}Writer extends BaseWriter { public void writeTo${minor.class}(ByteBuffer value, int offset, int length); +<#if minor.class?ends_with("VarChar")> + public void write${minor.class}(${friendlyType} value); + + public void write${minor.class}(String value); + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index 223ae9aa8cb1c..04a038a0b5dfd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -550,6 +550,13 @@ private void setReaderAndWriterIndex() { } } + /** + * Validate the scalar values held by this vector. + */ + public void validateScalars() { + // No validation by default. + } + /** * Construct a transfer pair of this vector and another vector of same type. * @param ref name of the target vector diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 90694db830cd6..4d5a8a5119c53 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -643,6 +643,13 @@ public ArrowBuf[] getBuffers(boolean clear) { return buffers; } + /** + * Validate the scalar values held by this vector. + */ + public void validateScalars() { + // No validation by default. + } + /** * Construct a transfer pair of this vector and another vector of same type. 
* @param ref name of the target vector diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 2a89590bf8440..d7f5ff05a935d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -685,6 +685,13 @@ public ArrowBuf[] getBuffers(boolean clear) { return buffers; } + /** + * Validate the scalar values held by this vector. + */ + public void validateScalars() { + // No validation by default. + } + /** * Construct a transfer pair of this vector and another vector of same type. * @param ref name of the target vector diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java index 70a895ff40496..79a9badc3955d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java @@ -35,6 +35,7 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.DecimalUtility; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; /** @@ -527,6 +528,18 @@ public void setSafe(int index, int isSet, long start, ArrowBuf buffer) { set(index, isSet, start, buffer); } + @Override + public void validateScalars() { + for (int i = 0; i < getValueCount(); ++i) { + BigDecimal value = getObject(i); + if (value != null) { + ValidateUtil.validateOrThrow(DecimalUtility.checkPrecisionAndScaleNoThrow(value, getPrecision(), getScale()), + "Invalid value for Decimal256Vector at position " + i + ". Value does not fit in precision " + + getPrecision() + " and scale " + getScale() + "."); + } + } + } + /*----------------------------------------------------------------* | | | vector transfer | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java index 6a3ec60afc52e..d1a3bfc3afb10 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java @@ -35,6 +35,7 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.DecimalUtility; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; /** * DecimalVector implements a fixed width vector (16 bytes) of @@ -526,6 +527,18 @@ public void setSafe(int index, int isSet, long start, ArrowBuf buffer) { set(index, isSet, start, buffer); } + @Override + public void validateScalars() { + for (int i = 0; i < getValueCount(); ++i) { + BigDecimal value = getObject(i); + if (value != null) { + ValidateUtil.validateOrThrow(DecimalUtility.checkPrecisionAndScaleNoThrow(value, getPrecision(), getScale()), + "Invalid value for DecimalVector at position " + i + ". 
Value does not fit in precision " + + getPrecision() + " and scale " + getScale() + "."); + } + } + } + /*----------------------------------------------------------------* | | | vector transfer | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java index 3ce2bb77ccc55..967d560d78dea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -31,6 +31,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; /** * FixedSizeBinaryVector implements a fixed width vector of @@ -320,6 +321,18 @@ public static byte[] get(final ArrowBuf buffer, final int index, final int byteW return dst; } + @Override + public void validateScalars() { + for (int i = 0; i < getValueCount(); ++i) { + byte[] value = get(i); + if (value != null) { + ValidateUtil.validateOrThrow(value.length == byteWidth, + "Invalid value for FixedSizeBinaryVector at position " + i + ". The length was " + + value.length + " but the length of each element should be " + byteWidth + "."); + } + } + } + /*----------------------------------------------------------------* | | | vector transfer | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java index 1f8d9b7d3a85c..e9472c9f2c71e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -27,6 +27,7 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; /** * LargeVarCharVector implements a variable width vector of VARCHAR @@ -261,6 +262,17 @@ public void setSafe(int index, Text text) { setSafe(index, text.getBytes(), 0, text.getLength()); } + @Override + public void validateScalars() { + for (int i = 0; i < getValueCount(); ++i) { + byte[] value = get(i); + if (value != null) { + ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value), + "Non-UTF-8 data in LargeVarCharVector at position " + i + "."); + } + } + } + /*----------------------------------------------------------------* | | | vector transfer | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index aa29c29314e33..462b512c65436 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; /** * An abstraction that is used to store a sequence of values in an individual column. @@ -282,4 +283,12 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> { * @return the name of the vector. */ String getName(); + + default void validate() { + ValueVectorUtility.validate(this); + } + + default void validateFull() { + ValueVectorUtility.validateFull(this); + } }
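// Caller-side sketch of the new hooks (assumes validateFull() reaches the per-vector
// validateScalars() overrides added in this change; allocator setup omitted):
//
//   try (VarCharVector v = new VarCharVector("name", allocator)) {
//     v.setSafe(0, "hi".getBytes(StandardCharsets.UTF_8));
//     v.setValueCount(1);
//     v.validate();     // cheap structural checks
//     v.validateFull(); // deep checks, e.g. per-value UTF-8 validation
//   }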
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index bc5c68b29f310..2c83893819a1e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; /** * VarCharVector implements a variable width vector of VARCHAR @@ -261,6 +262,17 @@ public void setSafe(int index, Text text) { setSafe(index, text.getBytes(), 0, text.getLength()); } + @Override + public void validateScalars() { + for (int i = 0; i < getValueCount(); ++i) { + byte[] value = get(i); + if (value != null) { + ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value), + "Non-UTF-8 data in VarCharVector at position " + i + "."); + } + } + } + /*----------------------------------------------------------------* | | | vector transfer | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 62d4a1299dead..95deceb4e75ca 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -205,6 +205,27 @@ public void setInitialCapacity(int numRecords, double density) { } } + /** + * Specialized version of setInitialTotalCapacity() for ListVector. This is + * used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. This is + * very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. In + * such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; + vector.setInitialCapacity(totalNumberOfElements); + } + @Override public int getValueCapacity() { final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0);
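// Minimal usage sketch (hypothetical sizes): reserve offsets for 1000 records but only
// 10000 inner elements overall, instead of the density-based default:
//
//   ListVector vector = ListVector.empty("items", allocator);
//   vector.setInitialTotalCapacity(1000, 10000);
//   vector.allocateNew();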
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index 6ef5f994fc6f4..acb058cda3cb8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -196,6 +196,27 @@ public void setInitialCapacity(int numRecords, double density) { } } + /** + * Specialized version of setInitialCapacity() for LargeListVector. This allows + * the caller to explicitly control and be conservative about the memory + * allocated for the inner data vector. This is very useful when we are + * working with memory constraints for a query and have a fixed amount of + * memory reserved for the record batch. In such cases, calling + * setInitialCapacity(x) for a record batch with value count x can still + * lead to OOM or related problems, because the capacity multiplier applied + * to the inner data vector forces the memory requirement beyond what is + * actually needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; + vector.setInitialCapacity(totalNumberOfElements); + } + /** * Get the density of this ListVector. * @return density diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 52e5307e13908..0d6ff11f8ccf3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -148,6 +148,28 @@ public void setInitialCapacity(int numRecords, double density) { super.setInitialCapacity(numRecords, density); } + /** + * Specialized version of setInitialCapacity() for ListVector. This allows + * the caller to explicitly control and be conservative about the memory + * allocated for the inner data vector. This is very useful when we are + * working with memory constraints for a query and have a fixed amount of + * memory reserved for the record batch. In such cases, calling + * setInitialCapacity(x) for a record batch with value count x can still + * lead to OOM or related problems, because the capacity multiplier applied + * to the inner data vector forces the memory requirement beyond what is + * actually needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + @Override + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialTotalCapacity(numRecords, totalNumberOfElements); + } + /** * Get the density of this ListVector. * @return density diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java index 4da2668121af6..7d724656cdab7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java @@ -374,6 +374,18 @@ public ValueVector getVectorById(int id) { return getChildByOrdinal(id); } + /** + * Gets a child vector by ordinal position and casts to the specified class. + */ + public <V extends ValueVector> V getVectorById(int id, Class<V> clazz) { + ValueVector untyped = getVectorById(id); + if (clazz.isInstance(untyped)) { + return clazz.cast(untyped); + } + throw new ClassCastException("Id " + id + " had the wrong type.
Expected " + clazz.getCanonicalName() + + " but was " + untyped.getClass().getCanonicalName()); + } + @Override public void setValueCount(int valueCount) { for (final ValueVector v : getChildren()) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 137ac746f4aee..a81169b8f7d73 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -95,11 +95,19 @@ public static boolean checkPrecisionAndScale(BigDecimal value, int vectorPrecisi } if (value.precision() > vectorPrecision) { throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " + - "vector: " + value.precision() + " > " + vectorPrecision); + "vector: " + value.precision() + " > " + vectorPrecision); } return true; } + /** + * Check that the BigDecimal scale equals the vectorScale and that the BigDecimal precision is + * less than or equal to the vectorPrecision. Return true if so, otherwise return false. + */ + public static boolean checkPrecisionAndScaleNoThrow(BigDecimal value, int vectorPrecision, int vectorScale) { + return value.scale() == vectorScale && value.precision() < vectorPrecision; + } + /** * Check that the decimal scale equals the vectorScale and that the decimal precision is * less than or equal to the vectorPrecision. If not, then an UnsupportedOperationException is diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index b479305c6e39b..778af0ca956df 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -30,6 +30,7 @@ import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.Arrays; +import java.util.Optional; import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.core.JsonGenerator; @@ -466,6 +467,16 @@ public static ByteBuffer encode(String string, boolean replace) private static final int TRAIL_BYTE = 2; + /** + * Check if a byte array contains valid utf-8. + * + * @param utf8 byte array + * @return true if the input is valid UTF-8. False otherwise. + */ + public static boolean validateUTF8NoThrow(byte[] utf8) { + return !validateUTF8Internal(utf8, 0, utf8.length).isPresent(); + } + /** * Check if a byte array contains valid utf-8. * @@ -484,8 +495,22 @@ public static void validateUTF8(byte[] utf8) throws MalformedInputException { * @param len the length of the byte sequence * @throws MalformedInputException if the byte array contains invalid bytes */ - public static void validateUTF8(byte[] utf8, int start, int len) - throws MalformedInputException { + public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { + Optional result = validateUTF8Internal(utf8, start, len); + if (result.isPresent()) { + throw new MalformedInputException(result.get()); + } + } + + /** + * Check to see if a byte array is valid utf-8. + * + * @param utf8 the array of bytes + * @param start the offset of the first byte in the array + * @param len the length of the byte sequence + * @return the position where a malformed byte occurred or Optional.empty() if the byte array was valid UTF-8. 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index b479305c6e39b..778af0ca956df 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -30,6 +30,7 @@ import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.Arrays; +import java.util.Optional; import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.core.JsonGenerator; @@ -466,6 +467,16 @@ public static ByteBuffer encode(String string, boolean replace) private static final int TRAIL_BYTE = 2; + /** + * Check if a byte array contains valid utf-8. + * + * @param utf8 byte array + * @return true if the input is valid UTF-8, false otherwise. + */ + public static boolean validateUTF8NoThrow(byte[] utf8) { + return !validateUTF8Internal(utf8, 0, utf8.length).isPresent(); + } + /** * Check if a byte array contains valid utf-8. * @@ -484,8 +495,22 @@ public static void validateUTF8(byte[] utf8) throws MalformedInputException { * @param len the length of the byte sequence * @throws MalformedInputException if the byte array contains invalid bytes */ - public static void validateUTF8(byte[] utf8, int start, int len) - throws MalformedInputException { + public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { + Optional<Integer> result = validateUTF8Internal(utf8, start, len); + if (result.isPresent()) { + throw new MalformedInputException(result.get()); + } + } + + /** + * Check to see if a byte array is valid utf-8. + * + * @param utf8 the array of bytes + * @param start the offset of the first byte in the array + * @param len the length of the byte sequence + * @return the position where a malformed byte occurred or Optional.empty() if the byte array was valid UTF-8. + */ + private static Optional<Integer> validateUTF8Internal(byte[] utf8, int start, int len) { int count = start; int leadByte = 0; int length = 0; @@ -501,51 +526,51 @@ public static void validateUTF8(byte[] utf8, int start, int len) switch (length) { case 0: // check for ASCII if (leadByte > 0x7F) { - throw new MalformedInputException(count); + return Optional.of(count); } break; case 1: if (leadByte < 0xC2 || leadByte > 0xDF) { - throw new MalformedInputException(count); + return Optional.of(count); } state = TRAIL_BYTE_1; break; case 2: if (leadByte < 0xE0 || leadByte > 0xEF) { - throw new MalformedInputException(count); + return Optional.of(count); } state = TRAIL_BYTE_1; break; case 3: if (leadByte < 0xF0 || leadByte > 0xF4) { - throw new MalformedInputException(count); + return Optional.of(count); } state = TRAIL_BYTE_1; break; default: // too long! Longest valid UTF-8 is 4 bytes (lead + three) // or if < 0 we got a trail byte in the lead byte position - throw new MalformedInputException(count); + return Optional.of(count); } // switch (length) break; case TRAIL_BYTE_1: if (leadByte == 0xF0 && aByte < 0x90) { - throw new MalformedInputException(count); + return Optional.of(count); } if (leadByte == 0xF4 && aByte > 0x8F) { - throw new MalformedInputException(count); + return Optional.of(count); } if (leadByte == 0xE0 && aByte < 0xA0) { - throw new MalformedInputException(count); + return Optional.of(count); } if (leadByte == 0xED && aByte > 0x9F) { - throw new MalformedInputException(count); + return Optional.of(count); } // falls through to regular trail-byte test!! case TRAIL_BYTE: if (aByte < 0x80 || aByte > 0xBF) { - throw new MalformedInputException(count); + return Optional.of(count); } if (--length == 0) { state = LEAD_BYTE; @@ -558,6 +583,7 @@ public static void validateUTF8(byte[] utf8, int start, int len) } // switch (state) count++; } + return Optional.empty(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 9f73732ccfdd3..c5de380f9c173 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -116,7 +116,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { // make sure there is enough capacity while (targetVector.getValueCapacity() < newValueCount) { - targetVector.reAlloc(); + ((BaseVariableWidthVector) targetVector).reallocValidityAndOffsetBuffers(); } while (targetVector.getDataBuffer().capacity() < newValueCapacity) { ((BaseVariableWidthVector) targetVector).reallocDataBuffer(); @@ -170,7 +170,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { // make sure there is enough capacity while (targetVector.getValueCapacity() < newValueCount) { - targetVector.reAlloc(); + ((BaseLargeVariableWidthVector) targetVector).reallocValidityAndOffsetBuffers(); } while (targetVector.getDataBuffer().capacity() < newValueCapacity) { ((BaseLargeVariableWidthVector) targetVector).reallocDataBuffer(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java index cdeb4f1eaa1ca..6d33be7a0dbac 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java @@ -85,18 +85,21 @@ private void validateTypeBuffer(ArrowBuf typeBuf, int valueCount) { @Override public Void visit(BaseFixedWidthVector vector, Void value) { + vector.validateScalars(); return null; } @Override public Void visit(BaseVariableWidthVector vector, Void value) { validateOffsetBuffer(vector, vector.getValueCount()); + vector.validateScalars(); return null; } @Override public Void visit(BaseLargeVariableWidthVector vector, Void value) { validateLargeOffsetBuffer(vector, vector.getValueCount()); + vector.validateScalars(); return null; } @@ -169,6 +172,8 @@ public Void visit(DenseUnionVector vector, Void value) { @Override public Void visit(NullVector vector, Void value) { + ValidateUtil.validateOrThrow(vector.getNullCount() == vector.getValueCount(), + "NullVector should have only null entries."); return null; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java index c1d60da4d5988..adf86183c0ada 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java @@ -972,6 +972,26 @@ public void testIsEmpty() { } } + @Test + public void testTotalCapacity() { + final FieldType type = FieldType.nullable(MinorType.INT.getType()); + try (final LargeListVector vector = new LargeListVector("list", allocator, type, null)) { + // Force the child vector to be allocated based on the type + // (this is a bad API: we have to track and repeat the type twice) + vector.addOrGetVector(type); + + // Specify the allocation size but do not actually allocate + vector.setInitialTotalCapacity(10, 100); + + // Finally actually do the allocation + vector.allocateNewSafe(); + + // Note: allocator rounds up and can be greater than the requested allocation. + assertTrue(vector.getValueCapacity() >= 10); + assertTrue(vector.getDataVector().getValueCapacity() >= 100); + } + } + private void writeIntValues(UnionLargeListWriter writer, int[] values) { writer.startList(); for (int v: values) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index f0f19058eef20..2a1228c2a38c2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -1115,6 +1115,26 @@ public void testIsEmpty() { } } + @Test + public void testTotalCapacity() { + final FieldType type = FieldType.nullable(MinorType.INT.getType()); + try (final ListVector vector = new ListVector("list", allocator, type, null)) { + // Force the child vector to be allocated based on the type + // (this is a bad API: we have to track and repeat the type twice) + vector.addOrGetVector(type); + + // Specify the allocation size but do not actually allocate + vector.setInitialTotalCapacity(10, 100); + + // Finally actually do the allocation + vector.allocateNewSafe(); + + // Note: allocator rounds up and can be greater than the requested allocation. 
+ assertTrue(vector.getValueCapacity() >= 10); + assertTrue(vector.getDataVector().getValueCapacity() >= 100); + } + } + private void writeIntValues(UnionListWriter writer, int[] values) { writer.startList(); for (int v: values) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java index b4c30480000c8..552d5752f236f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestStructVector.java @@ -282,4 +282,12 @@ public void testAddChildVectorsWithDuplicatedFieldNamesForConflictPolicyReplace( } } + @Test + public void testTypedGetters() { + try (final StructVector s1 = StructVector.empty("s1", allocator)) { + s1.addOrGet("struct_child", FieldType.nullable(MinorType.INT.getType()), IntVector.class); + assertEquals(IntVector.class, s1.getChild("struct_child", IntVector.class).getClass()); + assertEquals(IntVector.class, s1.getVectorById(0, IntVector.class).getClass()); + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestSimpleWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestSimpleWriter.java index 7c06509b23c87..ef918b13fb691 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestSimpleWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestSimpleWriter.java @@ -22,9 +22,14 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.impl.LargeVarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.LargeVarCharWriterImpl; import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; +import org.apache.arrow.vector.util.Text; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -45,9 +50,9 @@ public void terminate() throws Exception { } @Test - public void testWriteByteArrayToVarBinary() { + public void testWriteByteArrayToVarBinary() throws Exception { try (VarBinaryVector vector = new VarBinaryVector("test", allocator); - VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) { + VarBinaryWriter writer = new VarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; writer.writeToVarBinary(input); byte[] result = vector.get(0); @@ -56,9 +61,9 @@ public void testWriteByteArrayToVarBinary() { } @Test - public void testWriteByteArrayWithOffsetToVarBinary() { + public void testWriteByteArrayWithOffsetToVarBinary() throws Exception { try (VarBinaryVector vector = new VarBinaryVector("test", allocator); - VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) { + VarBinaryWriter writer = new VarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; writer.writeToVarBinary(input, 1, 1); byte[] result = vector.get(0); @@ -67,9 +72,9 @@ public void testWriteByteArrayWithOffsetToVarBinary() { } @Test - public void testWriteByteBufferToVarBinary() { + public void testWriteByteBufferToVarBinary() throws Exception { try (VarBinaryVector vector = new VarBinaryVector("test", allocator); - VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) { + VarBinaryWriter writer 
= new VarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; ByteBuffer buffer = ByteBuffer.wrap(input); writer.writeToVarBinary(buffer); @@ -79,9 +84,9 @@ public void testWriteByteBufferToVarBinary() { } @Test - public void testWriteByteBufferWithOffsetToVarBinary() { + public void testWriteByteBufferWithOffsetToVarBinary() throws Exception { try (VarBinaryVector vector = new VarBinaryVector("test", allocator); - VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) { + VarBinaryWriter writer = new VarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; ByteBuffer buffer = ByteBuffer.wrap(input); writer.writeToVarBinary(buffer, 1, 1); @@ -91,9 +96,9 @@ public void testWriteByteBufferWithOffsetToVarBinary() { } @Test - public void testWriteByteArrayToLargeVarBinary() { + public void testWriteByteArrayToLargeVarBinary() throws Exception { try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", allocator); - LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) { + LargeVarBinaryWriter writer = new LargeVarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; writer.writeToLargeVarBinary(input); byte[] result = vector.get(0); @@ -102,9 +107,9 @@ public void testWriteByteArrayToLargeVarBinary() { } @Test - public void testWriteByteArrayWithOffsetToLargeVarBinary() { + public void testWriteByteArrayWithOffsetToLargeVarBinary() throws Exception { try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", allocator); - LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) { + LargeVarBinaryWriter writer = new LargeVarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; writer.writeToLargeVarBinary(input, 1, 1); byte[] result = vector.get(0); @@ -113,9 +118,9 @@ public void testWriteByteArrayWithOffsetToLargeVarBinary() { } @Test - public void testWriteByteBufferToLargeVarBinary() { + public void testWriteByteBufferToLargeVarBinary() throws Exception { try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", allocator); - LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) { + LargeVarBinaryWriter writer = new LargeVarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; ByteBuffer buffer = ByteBuffer.wrap(input); writer.writeToLargeVarBinary(buffer); @@ -125,9 +130,9 @@ public void testWriteByteBufferToLargeVarBinary() { } @Test - public void testWriteByteBufferWithOffsetToLargeVarBinary() { + public void testWriteByteBufferWithOffsetToLargeVarBinary() throws Exception { try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", allocator); - LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) { + LargeVarBinaryWriter writer = new LargeVarBinaryWriterImpl(vector)) { byte[] input = new byte[] { 0x01, 0x02 }; ByteBuffer buffer = ByteBuffer.wrap(input); writer.writeToLargeVarBinary(buffer, 1, 1); @@ -135,4 +140,48 @@ public void testWriteByteBufferWithOffsetToLargeVarBinary() { Assert.assertArrayEquals(new byte[] { 0x02 }, result); } } + + @Test + public void testWriteStringToVarChar() throws Exception { + try (VarCharVector vector = new VarCharVector("test", allocator); + VarCharWriter writer = new VarCharWriterImpl(vector)) { + String input = "testInput"; + writer.writeVarChar(input); + String result = vector.getObject(0).toString(); + Assert.assertEquals(input, result); + } + } + + @Test + public void testWriteTextToVarChar() throws Exception { + try (VarCharVector vector = new 
VarCharVector("test", allocator); + VarCharWriter writer = new VarCharWriterImpl(vector)) { + String input = "testInput"; + writer.writeVarChar(new Text(input)); + String result = vector.getObject(0).toString(); + Assert.assertEquals(input, result); + } + } + + @Test + public void testWriteStringToLargeVarChar() throws Exception { + try (LargeVarCharVector vector = new LargeVarCharVector("test", allocator); + LargeVarCharWriter writer = new LargeVarCharWriterImpl(vector)) { + String input = "testInput"; + writer.writeLargeVarChar(input); + String result = vector.getObject(0).toString(); + Assert.assertEquals(input, result); + } + } + + @Test + public void testWriteTextToLargeVarChar() throws Exception { + try (LargeVarCharVector vector = new LargeVarCharVector("test", allocator); + LargeVarCharWriter writer = new LargeVarCharWriterImpl(vector)) { + String input = "testInput"; + writer.writeLargeVarChar(new Text(input)); + String result = vector.getObject(0).toString(); + Assert.assertEquals(input, result); + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index 25d26623d5c05..ab36ea2fd2129 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -21,11 +21,14 @@ import static junit.framework.TestCase.assertTrue; import static org.junit.jupiter.api.Assertions.assertThrows; +import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Collections; import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.Float4Vector; @@ -63,7 +66,8 @@ public class TestVectorAppender { @Before public void prepare() { - allocator = new RootAllocator(1024 * 1024); + // Permit allocating 4 vectors of max size. 
+ allocator = new RootAllocator(4 * BaseValueVector.MAX_ALLOCATION_SIZE); } @After @@ -185,6 +189,27 @@ public void testAppendEmptyVariableWidthVector() { } } + @Test + public void testAppendLargeAndSmallVariableVectorsWithinLimit() { + int sixteenthOfMaxAllocation = Math.toIntExact(BaseValueVector.MAX_ALLOCATION_SIZE / 16); + try (VarCharVector target = makeVarCharVec(1, sixteenthOfMaxAllocation); + VarCharVector delta = makeVarCharVec(sixteenthOfMaxAllocation, 1)) { + new VectorAppender(delta).visit(target, null); + new VectorAppender(target).visit(delta, null); + } + } + + private VarCharVector makeVarCharVec(int numElements, int bytesPerElement) { + VarCharVector v = new VarCharVector("text", allocator); + v.allocateNew((long) numElements * bytesPerElement, numElements); + for (int i = 0; i < numElements; i++) { + String s = String.join("", Collections.nCopies(bytesPerElement, "a")); + v.setSafe(i, s.getBytes(StandardCharsets.US_ASCII)); + } + v.setValueCount(numElements); + return v; + } + @Test public void testAppendLargeVariableWidthVector() { final int length1 = 5; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java index ab0ee3a2075a3..6309d385870c9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java @@ -50,7 +50,7 @@ public void shutdown() { } @Test - public void testVectorScehmaRootAppend() { + public void testVectorSchemaRootAppend() { final int length1 = 5; final int length2 = 3; final int length3 = 2; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java b/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java index 2354b281ed41d..20492036dab99 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java @@ -251,6 +251,20 @@ public void testDenseUnionVector() { } } + @Test + public void testBaseFixedWidthVectorInstanceMethod() { + try (final IntVector vector = new IntVector("v", allocator)) { + vector.validate(); + setVector(vector, 1, 2, 3); + vector.validate(); + + vector.getDataBuffer().capacity(0); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + () -> vector.validate()); + assertTrue(e.getMessage().contains("Not enough capacity for fixed width data buffer")); + } + } + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { writer.start(); writer.integer("f0").writeInt(value1); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java b/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java index 4241a0d9cff93..ca71a622bb8ea 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java @@ -23,11 +23,14 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; 
+import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.LargeVarCharVector; @@ -231,4 +234,75 @@ public void testDenseUnionVector() { assertTrue(e.getMessage().contains("Dense union vector offset exceeds sub-vector boundary")); } } + + @Test + public void testBaseVariableWidthVectorInstanceMethod() { + try (final VarCharVector vector = new VarCharVector("v", allocator)) { + vector.validateFull(); + setVector(vector, "aaa", "bbb", "ccc"); + vector.validateFull(); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(4, 50); + + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + vector::validateFull); + assertTrue(e.getMessage().contains("The values in positions 0 and 1 of the offset buffer are decreasing")); + } + } + + @Test + public void testValidateVarCharUTF8() { + try (final VarCharVector vector = new VarCharVector("v", allocator)) { + vector.validateFull(); + setVector(vector, "aaa".getBytes(StandardCharsets.UTF_8), "bbb".getBytes(StandardCharsets.UTF_8), + new byte[] {(byte) 0xFF, (byte) 0xFE}); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + vector::validateFull); + assertTrue(e.getMessage().contains("UTF")); + } + } + + @Test + public void testValidateLargeVarCharUTF8() { + try (final LargeVarCharVector vector = new LargeVarCharVector("v", allocator)) { + vector.validateFull(); + setVector(vector, "aaa".getBytes(StandardCharsets.UTF_8), "bbb".getBytes(StandardCharsets.UTF_8), + new byte[] {(byte) 0xFF, (byte) 0xFE}); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + vector::validateFull); + assertTrue(e.getMessage().contains("UTF")); + } + } + + @Test + public void testValidateDecimal() { + try (final DecimalVector vector = new DecimalVector(Field.nullable("v", + new ArrowType.Decimal(2, 0, DecimalVector.TYPE_WIDTH * 8)), allocator)) { + vector.validateFull(); + setVector(vector, 1L); + vector.validateFull(); + vector.clear(); + setVector(vector, Long.MAX_VALUE); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + vector::validateFull); + assertTrue(e.getMessage().contains("Decimal")); + } + } + + @Test + public void testValidateDecimal256() { + try (final Decimal256Vector vector = new Decimal256Vector(Field.nullable("v", + new ArrowType.Decimal(2, 0, Decimal256Vector.TYPE_WIDTH * 8)), allocator)) { + vector.validateFull(); + setVector(vector, 1L); + vector.validateFull(); + vector.clear(); + setVector(vector, Long.MAX_VALUE); + ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class, + vector::validateFull); + assertTrue(e.getMessage().contains("Decimal")); + } + } } diff --git a/js/package.json b/js/package.json index 1ee0e11bca5b9..14f26c74d29f3 100644 --- a/js/package.json +++ b/js/package.json @@ -35,7 +35,7 @@ "author": "Apache Software Foundation", "license": "Apache-2.0", "bugs": { - "url": "https://issues.apache.org/jira/projects/ARROW" + "url": "https://github.com/apache/arrow/issues" }, "homepage": "https://github.com/apache/arrow/blob/main/js/README.md", "files": [ @@ -77,16 +77,16 @@ "async-done": "2.0.0", "benny": "3.7.1", "cross-env": "7.0.3", - "del": "7.0.0", + "del": "7.1.0", "del-cli": "5.1.0", "esbuild": "0.19.2", "esbuild-plugin-alias": "0.2.1", "eslint": "8.42.0", -
"eslint-plugin-jest": "27.2.3", + "eslint-plugin-jest": "27.4.2", "eslint-plugin-unicorn": "47.0.0", "esm": "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz", "glob": "10.2.7", - "google-closure-compiler": "20230502.0.0", + "google-closure-compiler": "20230802.0.0", "gulp": "4.0.2", "gulp-esbuild": "0.11.1", "gulp-json-transform": "0.4.8", @@ -99,7 +99,7 @@ "ix": "5.0.0", "jest": "29.6.2", "jest-silent-reporter": "0.5.0", - "memfs": "4.2.1", + "memfs": "4.5.0", "mkdirp": "3.0.1", "multistream": "4.1.0", "randomatic": "3.1.1", diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts index 2fdef60c1fb55..451bf6acb6186 100644 --- a/js/src/Arrow.dom.ts +++ b/js/src/Arrow.dom.ts @@ -59,6 +59,7 @@ export { Union, DenseUnion, SparseUnion, Dictionary, Interval, IntervalDayTime, IntervalYearMonth, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, FixedSizeList, Map_, MapRow, Table, makeTable, tableFromArrays, @@ -86,6 +87,7 @@ export { FixedSizeListBuilder, FloatBuilder, Float16Builder, Float32Builder, Float64Builder, IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, + DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder, IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder, ListBuilder, MapBuilder, diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 4a6394c266b1b..714861e764ccb 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -48,6 +48,7 @@ export { Union, DenseUnion, SparseUnion, Dictionary, Interval, IntervalDayTime, IntervalYearMonth, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, FixedSizeList, Map_ } from './type.js'; @@ -75,6 +76,7 @@ export { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint export { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from './builder/time.js'; export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './builder/timestamp.js'; export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; +export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; export { BinaryBuilder } from './builder/binary.js'; export { ListBuilder } from './builder/list.js'; diff --git a/js/src/builder.ts b/js/src/builder.ts index 90fe3ddcc9477..93510eedf84ff 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -21,7 +21,7 @@ import { MapRow, kKeys } from './row/map.js'; import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, - Date_, Time, Timestamp, Interval, + Date_, Time, Timestamp, Interval, Duration, Utf8, Binary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; @@ -290,7 +290,7 @@ export abstract class Builder { } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists // Binary, Utf8 data = _values?.flush(_offsets.last()); - } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, and Interval) + } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval) data = _values?.flush(length); } @@ 
-342,7 +342,7 @@ export abstract class Builder { (Builder.prototype as any)._isValid = () => true; /** @ignore */ -export abstract class FixedWidthBuilder extends Builder { +export abstract class FixedWidthBuilder extends Builder { constructor(opts: BuilderOptions) { super(opts); this._values = new DataBufferBuilder(new this.ArrayType(0), this.stride); diff --git a/js/src/builder/duration.ts b/js/src/builder/duration.ts new file mode 100644 index 0000000000000..968899ea55b91 --- /dev/null +++ b/js/src/builder/duration.ts @@ -0,0 +1,46 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { FixedWidthBuilder } from '../builder.js'; +import { Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond } from '../type.js'; +import { setDuration, setDurationSecond, setDurationMillisecond, setDurationMicrosecond, setDurationNanosecond } from '../visitor/set.js'; + +/** @ignore */ +export class DurationBuilder extends FixedWidthBuilder { } + +(DurationBuilder.prototype as any)._setValue = setDuration; + +/** @ignore */ +export class DurationSecondBuilder extends DurationBuilder { } + +(DurationSecondBuilder.prototype as any)._setValue = setDurationSecond; + +/** @ignore */ +export class DurationMillisecondBuilder extends DurationBuilder { } + +(DurationMillisecondBuilder.prototype as any)._setValue = setDurationMillisecond; + +/** @ignore */ +export class DurationMicrosecondBuilder extends DurationBuilder { } + +(DurationMicrosecondBuilder.prototype as any)._setValue = setDurationMicrosecond; + +/** @ignore */ +export class DurationNanosecondBuilder extends DurationBuilder { } + +(DurationNanosecondBuilder.prototype as any)._setValue = setDurationNanosecond; diff --git a/js/src/data.ts b/js/src/data.ts index dc423cdb01e1c..1e9df71cff8a7 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -257,6 +257,7 @@ import { Int, Date_, Interval, + Duration, Time, Timestamp, Union, DenseUnion, SparseUnion, @@ -390,6 +391,13 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = data.length / strideForType(type), ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0, } = props; return new Data(type, offset, length, nullCount, [undefined, data, nullBitmap]); } + public visitDuration(props: DurationDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const data = toArrayBufferView(type.ArrayType, props['data']); + const { ['length']: length = data.length, ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0, } = props; + return new Data(type, offset, length, nullCount, [undefined, data, nullBitmap]); + } public visitFixedSizeList(props: FixedSizeListDataProps) { const { ['type']: type, ['offset']: offset = 0, ['child']: child = new MakeDataVisitor().visit({ type: type.valueType }) } = props; const nullBitmap = toUint8Array(props['nullBitmap']); @@ -424,6 +432,7 @@ interface Date_DataProps extends DataProps_ { data?: DataBuf interface TimeDataProps extends DataProps_ { data?: DataBuffer } interface TimestampDataProps extends DataProps_ { data?: DataBuffer } interface IntervalDataProps extends DataProps_ { data?: DataBuffer } +interface DurationDataProps extends DataProps_ { data?: DataBuffer } interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } @@ -446,6 +455,7 @@ export type DataProps = ( T extends Time /* */ ? TimeDataProps : T extends Timestamp /* */ ? TimestampDataProps : T extends Interval /* */ ? IntervalDataProps : + T extends Duration /* */ ? DurationDataProps : T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : @@ -471,6 +481,7 @@ export function makeData(props: Date_DataProps): Data; export function makeData(props: TimeDataProps): Data; export function makeData(props: TimestampDataProps): Data; export function makeData(props: IntervalDataProps): Data; +export function makeData(props: DurationDataProps): Data; export function makeData(props: FixedSizeBinaryDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; diff --git a/js/src/enum.ts b/js/src/enum.ts index f5856bc06afbe..4e207dd37cec1 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -137,7 +137,7 @@ export enum MessageHeader { * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64). * - * **Note**: Only enum values 0-17 (NONE through Map) are written to an Arrow + * **Note**: Only enum values 0-18 (NONE through Duration) are written to an Arrow * IPC payload. * * The rest of the values are specified here so TypeScript can narrow the type @@ -174,6 +174,7 @@ export enum Type { FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */ FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ + Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
*/ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -201,6 +202,10 @@ SparseUnion = -24, IntervalDayTime = -25, IntervalYearMonth = -26, + DurationSecond = -27, + DurationMillisecond = -28, + DurationMicrosecond = -29, + DurationNanosecond = -30 } export enum BufferType { diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 8d61295919046..95c5adbb2a25e 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -31,6 +31,7 @@ import type { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, import type { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from './builder/time.js'; import type { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './builder/timestamp.js'; import type { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; +import type { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; import type { Utf8Builder } from './builder/utf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { ListBuilder } from './builder/list.js'; @@ -222,6 +223,11 @@ export type TypeToDataType = { [Type.Interval]: type.Interval; [Type.IntervalDayTime]: type.IntervalDayTime; [Type.IntervalYearMonth]: type.IntervalYearMonth; + [Type.Duration]: type.Duration; + [Type.DurationSecond]: type.DurationSecond; + [Type.DurationMillisecond]: type.DurationMillisecond; + [Type.DurationMicrosecond]: type.DurationMicrosecond; + [Type.DurationNanosecond]: type.DurationNanosecond; [Type.Map]: type.Map_; [Type.List]: type.List; [Type.Struct]: type.Struct; @@ -270,6 +276,11 @@ type TypeToBuilder = { [Type.Interval]: IntervalBuilder; [Type.IntervalDayTime]: IntervalDayTimeBuilder; [Type.IntervalYearMonth]: IntervalYearMonthBuilder; + [Type.Duration]: DurationBuilder; + [Type.DurationSecond]: DurationSecondBuilder; + [Type.DurationMillisecond]: DurationMillisecondBuilder; + [Type.DurationMicrosecond]: DurationMicrosecondBuilder; + [Type.DurationNanosecond]: DurationNanosecondBuilder; [Type.Map]: MapBuilder; [Type.List]: ListBuilder; [Type.Struct]: StructBuilder; @@ -318,6 +329,11 @@ type DataTypeToBuilder = { [Type.Interval]: T extends type.Interval ? IntervalBuilder : never; [Type.IntervalDayTime]: T extends type.IntervalDayTime ? IntervalDayTimeBuilder : never; [Type.IntervalYearMonth]: T extends type.IntervalYearMonth ? IntervalYearMonthBuilder : never; + [Type.Duration]: T extends type.Duration ? DurationBuilder : never; + [Type.DurationSecond]: T extends type.DurationSecond ? DurationSecondBuilder : never; + [Type.DurationMillisecond]: T extends type.DurationMillisecond ? DurationMillisecondBuilder : never; + [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder : never; + [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; [Type.Struct]: T extends type.Struct ?
StructBuilder : never; diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index e5995110f084b..f1f306730ddba 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -22,7 +22,7 @@ import { DataType, Dictionary, TimeBitWidth, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, - Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; import { DictionaryBatch, RecordBatch, FieldNode, BufferRegion } from './message.js'; @@ -185,6 +185,10 @@ function typeFromJSON(f: any, children?: Field[]): DataType { const t = f['type']; return new Interval(IntervalUnit[t['unit']] as any); } + case 'duration': { + const t = f['type']; + return new Duration(TimeUnit[t['unit']] as any); + } case 'union': { const t = f['type']; const [m, ...ms] = (t['mode'] + '').toLowerCase(); diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index 6465d3d064720..27c9b92d6897b 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -36,6 +36,7 @@ import { Date as _Date } from '../../fb/date.js'; import { Time as _Time } from '../../fb/time.js'; import { Timestamp as _Timestamp } from '../../fb/timestamp.js'; import { Interval as _Interval } from '../../fb/interval.js'; +import { Duration as _Duration } from '../../fb/duration.js'; import { Union as _Union } from '../../fb/union.js'; import { FixedSizeBinary as _FixedSizeBinary } from '../../fb/fixed-size-binary.js'; import { FixedSizeList as _FixedSizeList } from '../../fb/fixed-size-list.js'; @@ -57,7 +58,7 @@ import { DataType, Dictionary, TimeBitWidth, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, - Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; /** @@ -466,6 +467,10 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { const t = f.type(new _Interval())!; return new Interval(t.unit()); } + case Type['Duration']: { + const t = f.type(new _Duration())!; + return new Duration(t.unit()); + } case Type['Union']: { const t = f.type(new _Union())!; return new Union(t.mode(), t.typeIdsArray() || [], children || []); diff --git a/js/src/type.ts b/js/src/type.ts index 1dc90c47cbd10..34bbf45bca728 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -63,6 +63,7 @@ export abstract class DataType { construct /** @ignore */ export class IntervalYearMonth extends Interval_ { constructor() { super(IntervalUnit.YEAR_MONTH); } } +/** @ignore */ +type Durations = Type.Duration | Type.DurationSecond | Type.DurationMillisecond | Type.DurationMicrosecond | Type.DurationNanosecond; +/** @ignore */ +export interface Duration extends DataType { + TArray: BigInt64Array; + TValue: bigint; + ArrayType: BigInt64Array; +} + +/** @ignore */ +export class Duration extends DataType { + constructor(public readonly unit: TimeUnit) { + super(); + } + public get typeId() { return Type.Duration as T; } + public toString() { return `Duration<${TimeUnit[this.unit]}>`; } + protected static [Symbol.toStringTag] = ((proto: Duration) => { + (proto).unit = null; + (proto).ArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'Duration'; + })(Duration.prototype); +} + +/** @ignore */ +export class DurationSecond extends Duration { 
constructor() { super(TimeUnit.SECOND); }} +/** @ignore */ +export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); }} +/** @ignore */ +export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); }} +/** @ignore */ +export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); }} + + /** @ignore */ export interface List extends DataType { TArray: Array; diff --git a/js/src/visitor.ts b/js/src/visitor.ts index 3be50a6d3eacf..c63640b038e47 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -16,7 +16,7 @@ // under the License. import { Type, Precision, DateUnit, TimeUnit, IntervalUnit, UnionMode } from './enum.js'; -import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, } from './type.js'; +import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration } from './type.js'; export abstract class Visitor { public visitMany(nodes: any[], ...args: any[][]) { @@ -47,6 +47,7 @@ export abstract class Visitor { public visitUnion(_node: any, ..._args: any[]): any { return null; } public visitDictionary(_node: any, ..._args: any[]): any { return null; } public visitInterval(_node: any, ..._args: any[]): any { return null; } + public visitDuration(_node: any, ... _args: any[]): any { return null; } public visitFixedSizeList(_node: any, ..._args: any[]): any { return null; } public visitMap(_node: any, ..._args: any[]): any { return null; } } @@ -113,6 +114,11 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Interval: fn = visitor.visitInterval; break; case Type.IntervalDayTime: fn = visitor.visitIntervalDayTime || visitor.visitInterval; break; case Type.IntervalYearMonth: fn = visitor.visitIntervalYearMonth || visitor.visitInterval; break; + case Type.Duration: fn = visitor.visitDuration; break; + case Type.DurationSecond: fn = visitor.visitDurationSecond || visitor.visitDuration; break; + case Type.DurationMillisecond: fn = visitor.visitDurationMillisecond || visitor.visitDuration; break; + case Type.DurationMicrosecond: fn = visitor.visitDurationMicrosecond || visitor.visitDuration; break; + case Type.DurationNanosecond: fn = visitor.visitDurationNanosecond || visitor.visitDuration; break; case Type.FixedSizeList: fn = visitor.visitFixedSizeList; break; case Type.Map: fn = visitor.visitMap; break; } @@ -180,6 +186,15 @@ function inferDType(type: T): Type { } // @ts-ignore return Type.Interval; + case Type.Duration: + switch ((type as any as Duration).unit) { + case TimeUnit.SECOND: return Type.DurationSecond; + case TimeUnit.MILLISECOND: return Type.DurationMillisecond; + case TimeUnit.MICROSECOND: return Type.DurationMicrosecond; + case TimeUnit.NANOSECOND: return Type.DurationNanosecond; + } + // @ts-ignore + return Type.Duration; case Type.Map: return Type.Map; case Type.List: return Type.List; case Type.Struct: return Type.Struct; @@ -239,6 +254,11 @@ export interface Visitor { visitInterval(node: any, ...args: any[]): any; visitIntervalDayTime?(node: any, ...args: any[]): any; visitIntervalYearMonth?(node: any, ...args: any[]): any; + visitDuration(node: any, ...args: any[]): any; + visitDurationSecond(node: any, ...args: any[]): any; + visitDurationMillisecond(node: any, ...args: any[]): any; + visitDurationMicrosecond(node: any, ...args: any[]): any; + visitDurationNanosecond(node: any, ...args: any[]): any; visitFixedSizeList(node: any, ...args: any[]): any; visitMap(node: any, ...args: any[]): any; 
} @@ -270,3 +290,8 @@ export interface Visitor { (Visitor.prototype as any).visitSparseUnion = null; (Visitor.prototype as any).visitIntervalDayTime = null; (Visitor.prototype as any).visitIntervalYearMonth = null; +(Visitor.prototype as any).visitDuration = null; +(Visitor.prototype as any).visitDurationSecond = null; +(Visitor.prototype as any).visitDurationMillisecond = null; +(Visitor.prototype as any).visitDurationMicrosecond = null; +(Visitor.prototype as any).visitDurationNanosecond = null; diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 9ce9ae4d4a797..2d20f2a8efd5c 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -30,6 +30,7 @@ import { FixedSizeBinaryBuilder } from '../builder/fixedsizebinary.js'; import { FixedSizeListBuilder } from '../builder/fixedsizelist.js'; import { FloatBuilder, Float16Builder, Float32Builder, Float64Builder } from '../builder/float.js'; import { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from '../builder/interval.js'; +import { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from '../builder/duration.js'; import { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from '../builder/int.js'; import { ListBuilder } from '../builder/list.js'; import { MapBuilder } from '../builder/map.js'; @@ -91,6 +92,11 @@ export class GetBuilderCtor extends Visitor { public visitInterval() { return IntervalBuilder; } public visitIntervalDayTime() { return IntervalDayTimeBuilder; } public visitIntervalYearMonth() { return IntervalYearMonthBuilder; } + public visitDuration() { return DurationBuilder; } + public visitDurationSecond() { return DurationSecondBuilder; } + public visitDurationMillisecond() { return DurationMillisecondBuilder; } + public visitDurationMicrosecond() { return DurationMicrosecondBuilder; } + public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } } diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts index 862808ad54ee9..72d6148a52fd8 100644 --- a/js/src/visitor/bytelength.ts +++ b/js/src/visitor/bytelength.ts @@ -25,7 +25,7 @@ import { TypeToDataType } from '../interfaces.js'; import { Type, TimeUnit, UnionMode } from '../enum.js'; import { DataType, Dictionary, - Float, Int, Date_, Interval, Time, Timestamp, + Float, Int, Date_, Interval, Time, Timestamp, Duration, Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -75,6 +75,9 @@ export class GetByteLengthVisitor extends Visitor { public visitInterval(data: Data, _: number) { return (data.type.unit + 1) * 4; } + public visitDuration(____: Data, _: number) { + return 8; + } public visitStruct(data: Data, i: number) { return data.children.reduce((total, child) => total + instance.visit(child, i), 0); } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 12f8325470bac..5aaaedf51a37e 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -34,6 +34,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Duration, DurationSecond, DurationMillisecond,
DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -84,6 +85,11 @@ export interface GetVisitor extends Visitor { visitInterval(data: Data, index: number): T['TValue'] | null; visitIntervalDayTime(data: Data, index: number): T['TValue'] | null; visitIntervalYearMonth(data: Data, index: number): T['TValue'] | null; + visitDuration(data: Data, index: number): T['TValue'] | null; + visitDurationSecond(data: Data, index: number): T['TValue'] | null; + visitDurationMillisecond(data: Data, index: number): T['TValue'] | null; + visitDurationMicrosecond(data: Data, index: number): T['TValue'] | null; + visitDurationNanosecond(data: Data, index: number): T['TValue'] | null; visitFixedSizeList(data: Data, index: number): T['TValue'] | null; visitMap(data: Data, index: number): T['TValue'] | null; } @@ -279,6 +285,25 @@ const getIntervalYearMonth = ({ values }: Data, return int32s; }; +/** @ignore */ +const getDurationSecond = ({ values }: Data, index: number): T['TValue'] => values[index]; +/** @ignore */ +const getDurationMillisecond = ({ values }: Data, index: number): T['TValue'] => values[index]; +/** @ignore */ +const getDurationMicrosecond = ({ values }: Data, index: number): T['TValue'] => values[index]; +/** @ignore */ +const getDurationNanosecond = ({ values }: Data, index: number): T['TValue'] => values[index]; +/* istanbul ignore next */ +/** @ignore */ +const getDuration = (data: Data, index: number): T['TValue'] => { + switch (data.type.unit) { + case TimeUnit.SECOND: return getDurationSecond(data as Data, index); + case TimeUnit.MILLISECOND: return getDurationMillisecond(data as Data, index); + case TimeUnit.MICROSECOND: return getDurationMicrosecond(data as Data, index); + case TimeUnit.NANOSECOND: return getDurationNanosecond(data as Data, index); + } +}; + /** @ignore */ const getFixedSizeList = (data: Data, index: number): T['TValue'] => { const { stride, children } = data; @@ -328,6 +353,11 @@ GetVisitor.prototype.visitDictionary = wrapGet(getDictionary); GetVisitor.prototype.visitInterval = wrapGet(getInterval); GetVisitor.prototype.visitIntervalDayTime = wrapGet(getIntervalDayTime); GetVisitor.prototype.visitIntervalYearMonth = wrapGet(getIntervalYearMonth); +GetVisitor.prototype.visitDuration = wrapGet(getDuration); +GetVisitor.prototype.visitDurationSecond = wrapGet(getDurationSecond); +GetVisitor.prototype.visitDurationMillisecond = wrapGet(getDurationMillisecond); +GetVisitor.prototype.visitDurationMicrosecond = wrapGet(getDurationMicrosecond); +GetVisitor.prototype.visitDurationNanosecond = wrapGet(getDurationNanosecond); GetVisitor.prototype.visitFixedSizeList = wrapGet(getFixedSizeList); GetVisitor.prototype.visitMap = wrapGet(getMap); diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 654134c6dff04..28dcff20d3bd3 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -31,6 +31,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -81,6 +82,11 @@ export interface IndexOfVisitor extends Visitor { visitInterval(data: Data, value: T['TValue'] | null, index?: number): number; visitIntervalDayTime(data: Data, value: T['TValue'] | null, index?: number): number; 
visitIntervalYearMonth(data: Data, value: T['TValue'] | null, index?: number): number; + visitDuration(data: Data, value: T['TValue'] | null, index?: number): number; + visitDurationSecond(data: Data, value: T['TValue'] | null, index?: number): number; + visitDurationMillisecond(data: Data, value: T['TValue'] | null, index?: number): number; + visitDurationMicrosecond(data: Data, value: T['TValue'] | null, index?: number): number; + visitDurationNanosecond(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeList(data: Data, value: T['TValue'] | null, index?: number): number; visitMap(data: Data, value: T['TValue'] | null, index?: number): number; } @@ -191,6 +197,11 @@ IndexOfVisitor.prototype.visitDictionary = indexOfValue; IndexOfVisitor.prototype.visitInterval = indexOfValue; IndexOfVisitor.prototype.visitIntervalDayTime = indexOfValue; IndexOfVisitor.prototype.visitIntervalYearMonth = indexOfValue; +IndexOfVisitor.prototype.visitDuration = indexOfValue; +IndexOfVisitor.prototype.visitDurationSecond = indexOfValue; +IndexOfVisitor.prototype.visitDurationMillisecond = indexOfValue; +IndexOfVisitor.prototype.visitDurationMicrosecond = indexOfValue; +IndexOfVisitor.prototype.visitDurationNanosecond = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeList = indexOfValue; IndexOfVisitor.prototype.visitMap = indexOfValue; diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index 48021a78e86f6..e38bb907695d0 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -28,6 +28,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, } from '../type.js'; import { ChunkedIterator } from '../util/chunk.js'; @@ -79,6 +80,11 @@ export interface IteratorVisitor extends Visitor { visitInterval(vector: Vector): IterableIterator; visitIntervalDayTime(vector: Vector): IterableIterator; visitIntervalYearMonth(vector: Vector): IterableIterator; + visitDuration(vector: Vector): IterableIterator; + visitDurationSecond(vector: Vector): IterableIterator; + visitDurationMillisecond(vector: Vector): IterableIterator; + visitDurationMicrosecond(vector: Vector): IterableIterator; + visitDurationNanosecond(vector: Vector): IterableIterator; visitFixedSizeList(vector: Vector): IterableIterator; visitMap(vector: Vector): IterableIterator; } @@ -177,6 +183,11 @@ IteratorVisitor.prototype.visitDictionary = vectorIterator; IteratorVisitor.prototype.visitInterval = vectorIterator; IteratorVisitor.prototype.visitIntervalDayTime = vectorIterator; IteratorVisitor.prototype.visitIntervalYearMonth = vectorIterator; +IteratorVisitor.prototype.visitDuration = vectorIterator; +IteratorVisitor.prototype.visitDurationSecond = vectorIterator; +IteratorVisitor.prototype.visitDurationMillisecond = vectorIterator; +IteratorVisitor.prototype.visitDurationMicrosecond = vectorIterator; +IteratorVisitor.prototype.visitDurationNanosecond = vectorIterator; IteratorVisitor.prototype.visitFixedSizeList = vectorIterator; IteratorVisitor.prototype.visitMap = vectorIterator; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index d83edfc24fbd8..6e6cfb07413c3 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -63,6 +63,9 @@ 
export class JSONTypeAssembler extends Visitor { public visitInterval({ typeId, unit }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'unit': IntervalUnit[unit] }; } + public visitDuration({ typeId, unit }: T) { + return { 'name': ArrowType[typeId].toLowerCase(), 'unit': TimeUnit[unit] }; + } public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 7a617f4afe2c4..55a6b4e2ea390 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -26,7 +26,7 @@ import { UnionMode, DateUnit, TimeUnit } from '../enum.js'; import { BitIterator, getBit, getBool } from '../util/bit.js'; import { DataType, - Float, Int, Date_, Interval, Time, Timestamp, Union, + Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, } from '../type.js'; @@ -52,6 +52,7 @@ export interface JSONVectorAssembler extends Visitor { visitStruct(data: Data): { children: any[] }; visitUnion(data: Data): { children: any[]; TYPE_ID: number[] }; visitInterval(data: Data): { DATA: number[] }; + visitDuration(data: Data): { DATA: string[] }; visitFixedSizeList(data: Data): { children: any[] }; visitMap(data: Data): { children: any[] }; } @@ -146,6 +147,9 @@ export class JSONVectorAssembler extends Visitor { public visitInterval(data: Data) { return { 'DATA': [...data.values] }; } + public visitDuration(data: Data) { + return { 'DATA': [...bigNumsToStrings(data.values, 2)] }; + } public visitFixedSizeList(data: Data) { return { 'children': this.visitMany(data.type.children, data.children) diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index c2d4319911afe..1a0eddc556899 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -32,6 +32,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -82,6 +83,11 @@ export interface SetVisitor extends Visitor { visitInterval(data: Data, index: number, value: T['TValue']): void; visitIntervalDayTime(data: Data, index: number, value: T['TValue']): void; visitIntervalYearMonth(data: Data, index: number, value: T['TValue']): void; + visitDuration(data: Data, index: number, value: T['TValue']): void; + visitDurationSecond(data: Data, index: number, value: T['TValue']): void; + visitDurationMillisecond(data: Data, index: number, value: T['TValue']): void; + visitDurationMicrosecond(data: Data, index: number, value: T['TValue']): void; + visitDurationNanosecond(data: Data, index: number, value: T['TValue']): void; visitFixedSizeList(data: Data, index: number, value: T['TValue']): void; visitMap(data: Data, index: number, value: T['TValue']): void; } @@ -308,6 +314,26 @@ export const setIntervalDayTime = ({ values }: Data({ values }: Data, index: number, value: T['TValue']): void => { values[index] = (value[0] * 12) + (value[1] % 12); }; +/** @ignore */ +export const setDurationSecond = ({ values }: Data, index: number, value: T['TValue']): void => { values[index] = value; }; +/** @ignore */ +export const setDurationMillisecond = ({ values }: Data, index: number, value: T['TValue']): void => {
values[index] = value; }; +/** @ignore */ +export const setDurationMicrosecond = ({ values }: Data, index: number, value: T['TValue']): void => { values[index] = value; }; +/** @ignore */ +export const setDurationNanosecond = ({ values }: Data, index: number, value: T['TValue']): void => { values[index] = value; }; +/* istanbul ignore next */ +/** @ignore */ +export const setDuration = (data: Data, index: number, value: T['TValue']): void => { + switch (data.type.unit) { + case TimeUnit.SECOND: return setDurationSecond(data as Data, index, value as DurationSecond['TValue']); + case TimeUnit.MILLISECOND: return setDurationMillisecond(data as Data, index, value as DurationMillisecond['TValue']); + case TimeUnit.MICROSECOND: return setDurationMicrosecond(data as Data, index, value as DurationMicrosecond['TValue']); + case TimeUnit.NANOSECOND: return setDurationNanosecond(data as Data, index, value as DurationNanosecond['TValue']); + } +}; + + /** @ignore */ const setFixedSizeList = (data: Data, index: number, value: T['TValue']): void => { const { stride } = data; @@ -364,6 +390,11 @@ SetVisitor.prototype.visitDictionary = wrapSet(setDictionary); SetVisitor.prototype.visitInterval = wrapSet(setIntervalValue); SetVisitor.prototype.visitIntervalDayTime = wrapSet(setIntervalDayTime); SetVisitor.prototype.visitIntervalYearMonth = wrapSet(setIntervalYearMonth); +SetVisitor.prototype.visitDuration = wrapSet(setDuration); +SetVisitor.prototype.visitDurationSecond = wrapSet(setDurationSecond); +SetVisitor.prototype.visitDurationMillisecond = wrapSet(setDurationMillisecond); +SetVisitor.prototype.visitDurationMicrosecond = wrapSet(setDurationMicrosecond); +SetVisitor.prototype.visitDurationNanosecond = wrapSet(setDurationNanosecond); SetVisitor.prototype.visitFixedSizeList = wrapSet(setFixedSizeList); SetVisitor.prototype.visitMap = wrapSet(setMap); diff --git a/js/src/visitor/typeassembler.ts b/js/src/visitor/typeassembler.ts index c84e3930f64f5..c2262d20531b9 100644 --- a/js/src/visitor/typeassembler.ts +++ b/js/src/visitor/typeassembler.ts @@ -32,6 +32,7 @@ import { Date } from '../fb/date.js'; import { Time } from '../fb/time.js'; import { Timestamp } from '../fb/timestamp.js'; import { Interval } from '../fb/interval.js'; +import { Duration } from '../fb/duration.js'; import { List } from '../fb/list.js'; import { Struct_ as Struct } from '../fb/struct-.js'; import { Union } from '../fb/union.js'; @@ -109,6 +110,11 @@ export class TypeAssembler extends Visitor { Interval.addUnit(b, node.unit); return Interval.endInterval(b); } + public visitDuration(node: T, b: Builder) { + Duration.startDuration(b); + Duration.addUnit(b, node.unit); + return Duration.endDuration(b); + } public visitList(_node: T, b: Builder) { List.startList(b); return List.endList(b); diff --git a/js/src/visitor/typecomparator.ts b/js/src/visitor/typecomparator.ts index a77c4020961ce..1de8e218dae4f 100644 --- a/js/src/visitor/typecomparator.ts +++ b/js/src/visitor/typecomparator.ts @@ -28,6 +28,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -77,6 +78,11 @@ export interface TypeComparator extends Visitor { visitInterval(type: T, other?: DataType | null): other is T; visitIntervalDayTime(type: T, other?: DataType | 
null): other is T; visitIntervalYearMonth(type: T, other?: DataType | null): other is T; + visitDuration(type: T, other?: DataType | null): other is T; + visitDurationSecond(type: T, other?: DataType | null): other is T; + visitDurationMillisecond(type: T, other?: DataType | null): other is T; + visitDurationMicrosecond(type: T, other?: DataType | null): other is T; + visitDurationNanosecond(type: T, other?: DataType | null): other is T; visitFixedSizeList(type: T, other?: DataType | null): other is T; visitMap(type: T, other?: DataType | null): other is T; } @@ -202,6 +208,13 @@ function compareInterval(type: T, other?: DataType | null): ); } +function compareDuration(type: T, other?: DataType | null): other is T { + return (type === other) || ( + compareConstructor(type, other) && + type.unit === other.unit + ); +} + function compareFixedSizeList(type: T, other?: DataType | null): other is T { return (type === other) || ( compareConstructor(type, other) && @@ -261,6 +274,11 @@ TypeComparator.prototype.visitDictionary = compareDictionary; TypeComparator.prototype.visitInterval = compareInterval; TypeComparator.prototype.visitIntervalDayTime = compareInterval; TypeComparator.prototype.visitIntervalYearMonth = compareInterval; +TypeComparator.prototype.visitDuration = compareDuration; +TypeComparator.prototype.visitDurationSecond = compareDuration; +TypeComparator.prototype.visitDurationMillisecond = compareDuration; +TypeComparator.prototype.visitDurationMicrosecond = compareDuration; +TypeComparator.prototype.visitDurationNanosecond = compareDuration; TypeComparator.prototype.visitFixedSizeList = compareFixedSizeList; TypeComparator.prototype.visitMap = compareMap; diff --git a/js/src/visitor/typector.ts b/js/src/visitor/typector.ts index c825a61dbadfb..077f66592fbfb 100644 --- a/js/src/visitor/typector.ts +++ b/js/src/visitor/typector.ts @@ -74,6 +74,11 @@ export class GetDataTypeConstructor extends Visitor { public visitInterval() { return type.Interval; } public visitIntervalDayTime() { return type.IntervalDayTime; } public visitIntervalYearMonth() { return type.IntervalYearMonth; } + public visitDuration() { return type.Duration; } + public visitDurationSecond() { return type.DurationSecond; } + public visitDurationMillisecond() { return type.DurationMillisecond; } + public visitDurationMicrosecond() { return type.DurationMicrosecond; } + public visitDurationNanosecond() { return type.DurationNanosecond; } public visitFixedSizeList() { return type.FixedSizeList; } public visitMap() { return type.Map_; } } diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index dbf778c4c3631..949463272e718 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -26,7 +26,7 @@ import { packBools, truncateBitmap } from '../util/bit.js'; import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, - Float, Int, Date_, Interval, Time, Timestamp, Union, + Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, } from '../type.js'; @@ -51,6 +51,7 @@ export interface VectorAssembler extends Visitor { visitStruct(data: Data): this; visitUnion(data: Data): this; visitInterval(data: Data): this; + visitDuration(data: Data): this; visitFixedSizeList(data: Data): this; visitMap(data: Data): this; } @@ -195,7 +196,7 @@ function assembleBoolVector(this: VectorAssembler, data: Data } /** @ignore */ -function 
assembleFlatVector(this: VectorAssembler, data: Data) { +function assembleFlatVector(this: VectorAssembler, data: Data) { return addBuffer.call(this, data.values.subarray(0, data.length * data.stride)); } @@ -243,5 +244,6 @@ VectorAssembler.prototype.visitList = assembleListVector; VectorAssembler.prototype.visitStruct = assembleNestedVector; VectorAssembler.prototype.visitUnion = assembleUnion; VectorAssembler.prototype.visitInterval = assembleFlatVector; +VectorAssembler.prototype.visitDuration = assembleFlatVector; VectorAssembler.prototype.visitFixedSizeList = assembleListVector; VectorAssembler.prototype.visitMap = assembleListVector; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index cb4bc2829274f..db34edad9a1c1 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -115,6 +115,9 @@ export class VectorLoader extends Visitor { public visitInterval(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); } + public visitDuration(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); + } public visitFixedSizeList(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), 'child': this.visit(type.children[0]) }); } @@ -157,7 +160,7 @@ export class JSONVectorLoader extends VectorLoader { const { sources } = this; if (DataType.isTimestamp(type)) { return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[])); - } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) { + } else if (((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) || DataType.isDuration(type)) { return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[])); } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) { return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[])); diff --git a/js/test/data/tables.ts b/js/test/data/tables.ts index e4d859e0a69b7..28aed7e4feccf 100644 --- a/js/test/data/tables.ts +++ b/js/test/data/tables.ts @@ -30,7 +30,8 @@ const valueVectorGeneratorNames = [ 'float16', 'float32', 'float64', 'utf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', - 'dictionary', 'intervalDayTime', 'intervalYearMonth' + 'dictionary', 'intervalDayTime', 'intervalYearMonth', + 'durationSecond', 'durationMillisecond', 'durationMicrosecond', 'durationNanosecond', ]; const vectorGeneratorNames = [...valueVectorGeneratorNames, ...listVectorGeneratorNames, ...nestedVectorGeneratorNames]; diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index a03b22c54c770..15fb715a31f95 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -36,6 +36,7 @@ import { Union, DenseUnion, SparseUnion, Dictionary, Interval, IntervalDayTime, IntervalYearMonth, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, FixedSizeList, Map_, DateUnit, TimeUnit, UnionMode, @@ -58,6 +59,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T,
length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, dictionary?: Vector): GeneratedVector; @@ -84,6 +86,7 @@ interface TestDataVectorGenerator extends Visitor { visitUnion: typeof generateUnion; visitDictionary: typeof generateDictionary; visitInterval: typeof generateInterval; + visitDuration: typeof generateDuration; visitFixedSizeList: typeof generateFixedSizeList; visitMap: typeof generateMap; } @@ -108,6 +111,7 @@ TestDataVectorGenerator.prototype.visitStruct = generateStruct; TestDataVectorGenerator.prototype.visitUnion = generateUnion; TestDataVectorGenerator.prototype.visitDictionary = generateDictionary; TestDataVectorGenerator.prototype.visitInterval = generateInterval; +TestDataVectorGenerator.prototype.visitDuration = generateDuration; TestDataVectorGenerator.prototype.visitFixedSizeList = generateFixedSizeList; TestDataVectorGenerator.prototype.visitMap = generateMap; @@ -230,11 +234,15 @@ export const sparseUnion = (length = 100, nullCount = Math.trunc(length * 0.2), export const dictionary = (length = 100, nullCount = Math.trunc(length * 0.2), dict: T = new Utf8(), keys: TKey = new Int32()) => vectorGenerator.visit(new Dictionary(dict, keys), length, nullCount); export const intervalDayTime = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new IntervalDayTime(), length, nullCount); export const intervalYearMonth = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new IntervalYearMonth(), length, nullCount); +export const durationSecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DurationSecond(), length, nullCount); +export const durationMillisecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DurationMillisecond(), length, nullCount); +export const durationMicrosecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DurationMicrosecond(), length, nullCount); +export const durationNanosecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DurationNanosecond(), length, nullCount); export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2), listSize = 2, child = defaultListChild) => vectorGenerator.visit(new FixedSizeList(listSize, child), length, nullCount); export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, 
timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -421,6 +429,16 @@ function generateInterval(this: TestDataVectorGenerator, typ return { values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) }; } +function generateDuration(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const multiple = type.unit === TimeUnit.NANOSECOND ? 1000000000 : + type.unit === TimeUnit.MICROSECOND ? 1000000 : + type.unit === TimeUnit.MILLISECOND ? 1000 : 1; + const values: bigint[] = []; + const data = createTime64(length, nullBitmap, multiple, values); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, data })]) }; +} + function generateList(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2), child = this.visit(type.children[0].type, length * 3, nullCount * 3)): GeneratedVector { const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); diff --git a/js/test/unit/builders/builder-tests.ts b/js/test/unit/builders/builder-tests.ts index a73183a7a5d47..b261e4f815e3a 100644 --- a/js/test/unit/builders/builder-tests.ts +++ b/js/test/unit/builders/builder-tests.ts @@ -64,6 +64,10 @@ describe('Generated Test Data', () => { describe('DictionaryBuilder', () => { validateBuilder(generate.dictionary); }); describe('IntervalDayTimeBuilder', () => { validateBuilder(generate.intervalDayTime); }); describe('IntervalYearMonthBuilder', () => { validateBuilder(generate.intervalYearMonth); }); + describe('DurationSecondBuilder', () => { validateBuilder(generate.durationSecond); }); + describe('DurationMillisecondBuilder', () => { validateBuilder(generate.durationMillisecond); }); + describe('DurationMicrosecondBuilder', () => { validateBuilder(generate.durationMicrosecond); }); + describe('DurationNanosecondBuilder', () => { validateBuilder(generate.durationNanosecond); }); describe('FixedSizeListBuilder', () => { validateBuilder(generate.fixedSizeList); }); describe('MapBuilder', () => { validateBuilder(generate.map); }); }); diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts index 90cf0d598aa6f..d64c7c188d3ed 100644 --- a/js/test/unit/generated-data-tests.ts +++ b/js/test/unit/generated-data-tests.ts @@ -58,6 +58,10 @@ describe('Generated Test Data', () => { describe('Dictionary', () => { validateVector(generate.dictionary()); }); describe('IntervalDayTime', () => { validateVector(generate.intervalDayTime()); }); describe('IntervalYearMonth', () => { validateVector(generate.intervalYearMonth()); }); + describe('DurationSecond', () => { validateVector(generate.durationSecond()); }); + describe('DurationMillisecond', () => { validateVector(generate.durationMillisecond()); }); + describe('DurationMicrosecond', () => { validateVector(generate.durationMicrosecond()); }); + describe('DurationNanosecond', () => { validateVector(generate.durationNanosecond()); }); describe('FixedSizeList', () => { 
validateVector(generate.fixedSizeList()); }); describe('Map', () => { validateVector(generate.map()); }); }); diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts index 645fcc60f8d90..8a7ba1ed778aa 100644 --- a/js/test/unit/visitor-tests.ts +++ b/js/test/unit/visitor-tests.ts @@ -25,6 +25,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, } from 'apache-arrow'; @@ -46,6 +47,7 @@ class BasicVisitor extends Visitor { public visitUnion(type: T) { return (this.type = type); } public visitDictionary(type: T) { return (this.type = type); } public visitInterval(type: T) { return (this.type = type); } + public visitDuration(type: T) { return (this.type = type); } public visitFixedSizeList(type: T) { return (this.type = type); } public visitMap(type: T) { return (this.type = type); } } @@ -86,6 +88,10 @@ class FeatureVisitor extends Visitor { public visitDictionary(type: T) { return (this.type = type); } public visitIntervalDayTime(type: T) { return (this.type = type); } public visitIntervalYearMonth(type: T) { return (this.type = type); } + public visitDurationSecond(type: T) { return (this.type = type); } + public visitDurationMillisecond(type: T) { return (this.type = type); } + public visitDurationMicrosecond(type: T) { return (this.type = type); } + public visitDurationNanosecond(type: T) { return (this.type = type); } public visitFixedSizeList(type: T) { return (this.type = type); } public visitMap(type: T) { return (this.type = type); } } @@ -109,6 +115,7 @@ describe('Visitor', () => { test(`visits Union types`, () => validateBasicVisitor(new Union(0, [] as any[], [] as any[]))); test(`visits Dictionary types`, () => validateBasicVisitor(new Dictionary(null as any, null as any))); test(`visits Interval types`, () => validateBasicVisitor(new Interval(0))); + test(`visits Duration types`, () => validateBasicVisitor(new Duration(0))); test(`visits FixedSizeList types`, () => validateBasicVisitor(new FixedSizeList(2, null as any))); test(`visits Map types`, () => validateBasicVisitor(new Map_(new Field('', new Struct<{ key: Utf8; value: Int }>([ new Field('key', new Utf8()), new Field('value', new Int8()) @@ -158,6 +165,10 @@ describe('Visitor', () => { test(`visits IntervalDayTime types`, () => validateFeatureVisitor(new IntervalDayTime())); test(`visits IntervalYearMonth types`, () => validateFeatureVisitor(new IntervalYearMonth())); test(`visits FixedSizeList types`, () => validateFeatureVisitor(new FixedSizeList(2, null as any))); + test(`visits DurationSecond types`, () => validateFeatureVisitor(new DurationSecond())); + test(`visits DurationMillisecond types`, () => validateFeatureVisitor(new DurationMillisecond())); + test(`visits DurationMicrosecond types`, () => validateFeatureVisitor(new DurationMicrosecond())); + test(`visits DurationNanosecond types`, () => validateFeatureVisitor(new DurationNanosecond())); test(`visits Map types`, () => validateFeatureVisitor(new Map_(new Field('', new Struct<{ key: Utf8; value: Int }>([ new Field('key', new Utf8()), new Field('value', new Int8()) ] as any[]))))); diff --git a/js/yarn.lock b/js/yarn.lock index 66ede59a598b1..f027be218245f 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -2495,21 +2495,7 @@ del-cli@5.1.0: del "^7.1.0" meow 
"^10.1.3" -del@7.0.0: - version "7.0.0" - resolved "https://registry.yarnpkg.com/del/-/del-7.0.0.tgz#79db048bec96f83f344b46c1a66e35d9c09fe8ac" - integrity sha512-tQbV/4u5WVB8HMJr08pgw0b6nG4RGt/tj+7Numvq+zqcvUFeMaIWWOUFltiU+6go8BSO2/ogsB4EasDaj0y68Q== - dependencies: - globby "^13.1.2" - graceful-fs "^4.2.10" - is-glob "^4.0.3" - is-path-cwd "^3.0.0" - is-path-inside "^4.0.0" - p-map "^5.5.0" - rimraf "^3.0.2" - slash "^4.0.0" - -del@^7.1.0: +del@7.1.0, del@^7.1.0: version "7.1.0" resolved "https://registry.yarnpkg.com/del/-/del-7.1.0.tgz#0de0044d556b649ff05387f1fa7c885e155fd1b6" integrity sha512-v2KyNk7efxhlyHpjEvfyxaAihKKK0nWCuf6ZtqZcFFpQRG0bJ12Qsr0RpvsICMjAAZ8DOVCxrlqpxISlMHC4Kg== @@ -2766,10 +2752,10 @@ escape-string-regexp@^4.0.0: resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz#14ba83a5d373e3d311e5afca29cf5bfad965bf34" integrity sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA== -eslint-plugin-jest@27.2.3: - version "27.2.3" - resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-27.2.3.tgz#6f8a4bb2ca82c0c5d481d1b3be256ab001f5a3ec" - integrity sha512-sRLlSCpICzWuje66Gl9zvdF6mwD5X86I4u55hJyFBsxYOsBCmT5+kSUjf+fkFWVMMgpzNEupjW8WzUqi83hJAQ== +eslint-plugin-jest@27.4.2: + version "27.4.2" + resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-27.4.2.tgz#181d999ac67a9b6040db1d27935887cf5a2882ed" + integrity sha512-3Nfvv3wbq2+PZlRTf2oaAWXWwbdBejFRBR2O8tAO67o+P8zno+QGbcDYaAXODlreXVg+9gvWhKKmG2rgfb8GEg== dependencies: "@typescript-eslint/utils" "^5.10.0" @@ -3461,40 +3447,40 @@ glogg@^1.0.0: dependencies: sparkles "^1.0.0" -google-closure-compiler-java@^20230502.0.0: - version "20230502.0.0" - resolved "https://registry.yarnpkg.com/google-closure-compiler-java/-/google-closure-compiler-java-20230502.0.0.tgz#111240655adf9d64a0ac7eb16f73e896f3f9cefd" - integrity sha512-2nMQPQz2ppU9jvHhz2zpUP5jBDAqZp4gFVOEvirEyfUuLLkHwAvU2Tl1c7xaKX+Z4uMxpxttxcwdIjQhV2g8eQ== +google-closure-compiler-java@^20230802.0.0: + version "20230802.0.0" + resolved "https://registry.yarnpkg.com/google-closure-compiler-java/-/google-closure-compiler-java-20230802.0.0.tgz#5de4679f3d014b6b66471a48fb82c2772db4c872" + integrity sha512-PWKLMLwj7pR/U0yYbiy649LLqAscu+F1gyY4Y/jK6CmSLb8cIJbL8BTJd00828TzTNfWnYwxbkcQw0y9C2YsGw== -google-closure-compiler-linux@^20230502.0.0: - version "20230502.0.0" - resolved "https://registry.yarnpkg.com/google-closure-compiler-linux/-/google-closure-compiler-linux-20230502.0.0.tgz#c71114611b7ca47febd6feb1289ae152ca020b92" - integrity sha512-4NDgPKJXQHUxEyJoVFPVMQPJs5at7ThOXa9u3+9UeYk2K+vtW5wVZlmW07VOy8Mk/O/n2dp+Vl+wuE35BIiHAA== +google-closure-compiler-linux@^20230802.0.0: + version "20230802.0.0" + resolved "https://registry.yarnpkg.com/google-closure-compiler-linux/-/google-closure-compiler-linux-20230802.0.0.tgz#1acaf12ef386e5c1dcb5ff5796d4ae9f48ebce46" + integrity sha512-F13U4iSXiWeGtHOFS25LVem1s6zI+pJvXVPVR7zSib5ppoUJ0JXnABJQezUR3FnpxmnkALG4oIGW0syH9zPLZA== -google-closure-compiler-osx@^20230502.0.0: - version "20230502.0.0" - resolved "https://registry.yarnpkg.com/google-closure-compiler-osx/-/google-closure-compiler-osx-20230502.0.0.tgz#9ea082f0c6ad40b829802f0993f2e5b4b0e079e8" - integrity sha512-jB13dcbu8O02cG3JcCCVZku1oI0ZirJc/Sr9xcGHY5MMyw3qEMlXb3IU97W6UXLcg2wCRawMWadOwL9K4L9lfQ== +google-closure-compiler-osx@^20230802.0.0: + version "20230802.0.0" + resolved 
"https://registry.yarnpkg.com/google-closure-compiler-osx/-/google-closure-compiler-osx-20230802.0.0.tgz#10746ecfa81ad6eecc4d42d4ce9d0ed3ca8071e7" + integrity sha512-ANAi/ux92Tt+Na7vFDLeK2hRzotjC5j+nxoPtE0OcuNcbjji5dREKoJxkq7r0YwRTCzAFZszK5ip/NPdTOdCEg== -google-closure-compiler-windows@^20230502.0.0: - version "20230502.0.0" - resolved "https://registry.yarnpkg.com/google-closure-compiler-windows/-/google-closure-compiler-windows-20230502.0.0.tgz#81eef5de8b86364716b77a2d8068afba8b0e8244" - integrity sha512-wW5/liBxejvUViiBNo8/C9Vnhw+Lm+n3RdfE4spNkmdH9bcpKM+KQBLrPPakW17P3HbAPOPZ0L1RsrmyLYA5Cg== +google-closure-compiler-windows@^20230802.0.0: + version "20230802.0.0" + resolved "https://registry.yarnpkg.com/google-closure-compiler-windows/-/google-closure-compiler-windows-20230802.0.0.tgz#d57968dc24d5e0d538840b4313e1bec7c71b18d6" + integrity sha512-ZQPujoNiiUyTGl8zEGR/0yAygWnbMtX/NQ/S/EHVgq5nmYkvDEVuiVbgpPAmO9lzBTq0hvUTRRATZbTU2ISxgA== -google-closure-compiler@20230502.0.0: - version "20230502.0.0" - resolved "https://registry.yarnpkg.com/google-closure-compiler/-/google-closure-compiler-20230502.0.0.tgz#65b19e673255b4b4dad4271724932e0970b11a97" - integrity sha512-C2WZkuRnXpNjU2nc0W/Cgxm6t2VlwEyUJOTaGHaLr6qZCXK0L1uhOneKWN2X7AORKdzyLW6Tq8ONxRc7eODGJg== +google-closure-compiler@20230802.0.0: + version "20230802.0.0" + resolved "https://registry.yarnpkg.com/google-closure-compiler/-/google-closure-compiler-20230802.0.0.tgz#849181359823f8c9130faec9a1597377680823d6" + integrity sha512-o2fYoc8lqOBdhm95Ick0vWrtwH2Icd5yLZhbTcQ0T7NfGiBepYvx1BB63hR8ebgzEZemz9Fh+O6Kg/3Mjm28ww== dependencies: chalk "4.x" - google-closure-compiler-java "^20230502.0.0" + google-closure-compiler-java "^20230802.0.0" minimist "1.x" vinyl "2.x" vinyl-sourcemaps-apply "^0.2.0" optionalDependencies: - google-closure-compiler-linux "^20230502.0.0" - google-closure-compiler-osx "^20230502.0.0" - google-closure-compiler-windows "^20230502.0.0" + google-closure-compiler-linux "^20230802.0.0" + google-closure-compiler-osx "^20230802.0.0" + google-closure-compiler-windows "^20230802.0.0" graceful-fs@^4.0.0, graceful-fs@^4.1.11, graceful-fs@^4.1.2, graceful-fs@^4.1.6, graceful-fs@^4.2.0, graceful-fs@^4.2.10, graceful-fs@^4.2.4, graceful-fs@^4.2.9: version "4.2.11" @@ -4931,10 +4917,10 @@ math-random@^1.0.1: resolved "https://registry.yarnpkg.com/math-random/-/math-random-1.0.4.tgz#5dd6943c938548267016d4e34f057583080c514c" integrity sha512-rUxjysqif/BZQH2yhd5Aaq7vXMSx9NdEsQcyA07uEzIvxgI7zIr33gGsh+RU0/XjmQpCW7RsVof1vlkvQVCK5A== -memfs@4.2.1: - version "4.2.1" - resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.2.1.tgz#8c5a48707a460dde8e734b15e405e8377db2bec5" - integrity sha512-CINEB6cNAAhLUfRGrB4lj2Pj47ygerEmw3jxPb6R1gkD6Jfp484gJLteQ6MzqIjGWtFWuVzDl+KN7HiipMuKSw== +memfs@4.5.0: + version "4.5.0" + resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.5.0.tgz#03082709987760022275e0d3bc0f24545b7fe279" + integrity sha512-8QePW5iXi/ZCySFTo39h3ujKGT0rYVnZywuSo5AzR7POAuy4uBEFZKziYkkrlGdWuxACUxKAJ0L/sry3DSG+TA== dependencies: json-joy "^9.2.0" thingies "^1.11.1" diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc index ec1ac1eecb2fd..023381e005969 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc @@ -34,7 +34,6 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(Schema, getFieldByName); REGISTER_METHOD(Schema, getNumFields); REGISTER_METHOD(Schema, getFieldNames); - 
REGISTER_METHOD(Schema, toString); } libmexclass::proxy::MakeResult Schema::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { @@ -141,14 +140,4 @@ namespace arrow::matlab::tabular::proxy { context.outputs[0] = field_names_mda; } - void Schema::toString(libmexclass::proxy::method::Context& context) { - namespace mda = ::matlab::data; - mda::ArrayFactory factory; - - const auto str_utf8 = schema->ToString(); - MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); - auto str_mda = factory.createScalar(str_utf16); - context.outputs[0] = str_mda; - } - } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h index 30883bc2a85ac..9ca4a94e53071 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h @@ -39,7 +39,6 @@ namespace arrow::matlab::tabular::proxy { void getFieldByName(libmexclass::proxy::method::Context& context); void getNumFields(libmexclass::proxy::method::Context& context); void getFieldNames(libmexclass::proxy::method::Context& context); - void toString(libmexclass::proxy::method::Context& context); std::shared_ptr schema; }; diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/field.cc b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc index 7df0e7d6ef304..138771a35c327 100644 --- a/matlab/src/cpp/arrow/matlab/type/proxy/field.cc +++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc @@ -32,7 +32,6 @@ namespace arrow::matlab::type::proxy { Field::Field(std::shared_ptr field) : field{std::move(field)} { REGISTER_METHOD(Field, getName); REGISTER_METHOD(Field, getType); - REGISTER_METHOD(Field, toString); } std::shared_ptr Field::unwrap() { @@ -64,16 +63,6 @@ namespace arrow::matlab::type::proxy { context.outputs[0] = output; } - void Field::toString(libmexclass::proxy::method::Context& context) { - namespace mda = ::matlab::data; - mda::ArrayFactory factory; - - const auto str_utf8 = field->ToString(); - MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); - auto str_mda = factory.createScalar(str_utf16); - context.outputs[0] = str_mda; - } - libmexclass::proxy::MakeResult Field::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { namespace mda = ::matlab::data; using FieldProxy = arrow::matlab::type::proxy::Field; diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/field.h b/matlab/src/cpp/arrow/matlab/type/proxy/field.h index 4256fd21a0a23..3526a6c422ac3 100644 --- a/matlab/src/cpp/arrow/matlab/type/proxy/field.h +++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.h @@ -36,7 +36,6 @@ class Field : public libmexclass::proxy::Proxy { protected: void getName(libmexclass::proxy::method::Context& context); void getType(libmexclass::proxy::method::Context& context); - void toString(libmexclass::proxy::method::Context& context); std::shared_ptr field; }; diff --git a/matlab/src/matlab/+arrow/+array/StructArray.m b/matlab/src/matlab/+arrow/+array/StructArray.m index 589e39fecd015..800e34fe746ec 100644 --- a/matlab/src/matlab/+arrow/+array/StructArray.m +++ b/matlab/src/matlab/+arrow/+array/StructArray.m @@ -142,5 +142,38 @@ proxy = arrow.internal.proxy.create(proxyName, args); array = arrow.array.StructArray(proxy); end + + function array = fromMATLAB(T, opts) + arguments + T table + opts.FieldNames(1, :) string {mustBeNonmissing} = 
T.Properties.VariableNames + opts.Valid + end + + import arrow.tabular.internal.decompose + import arrow.tabular.internal.validateColumnNames + import arrow.array.internal.getArrayProxyIDs + import arrow.internal.validate.parseValid + + if width(T) == 0 + % StructArrays require at least one field + error("arrow:struct:ZeroVariables", ... + "Input table T must have at least one variable."); + end + + % If FieldNames was provided, make sure the number of field + % names is equal to the width of the table. + validateColumnNames(opts.FieldNames, width(T)); + + arrowArrays = decompose(T); + arrayProxyIDs = getArrayProxyIDs(arrowArrays); + validElements = parseValid(opts, height(T)); + + args = struct(ArrayProxyIDs=arrayProxyIDs, ... + FieldNames=opts.FieldNames, Valid=validElements); + proxyName = "arrow.array.proxy.StructArray"; + proxy = arrow.internal.proxy.create(proxyName, args); + array = arrow.array.StructArray(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/makeDimensionString.m b/matlab/src/matlab/+arrow/+internal/+test/+display/makeDimensionString.m new file mode 100644 index 0000000000000..4281667543634 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/makeDimensionString.m @@ -0,0 +1,22 @@ +%MAKEDIMENSIONSTRING Utility function for creating a string representation +%of dimensions. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function dimensionString = makeDimensionString(arraySize) + dimensionString = string(arraySize); + dimensionString = join(dimensionString, char(215)); +end diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m new file mode 100644 index 0000000000000..df6a11612043c --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m @@ -0,0 +1,36 @@ +%MAKELINKSTRING Utility function for creating hyperlinks. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
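% Illustrative usage of the helper defined below (a sketch; the resolved
% markup assumes the compose format strings in the function body):
%
%     link = arrow.internal.test.display.makeLinkString( ...
%         FullClassName="arrow.tabular.Schema", ClassName="Schema", BoldFont=true);
%     % link contains the hyperlink markup
%     % <a href="matlab:helpPopup arrow.tabular.Schema" style="font-weight:bold">Schema</a>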
+ +function link = makeLinkString(opts) + arguments + opts.FullClassName(1, 1) string + opts.ClassName(1, 1) string + % When displaying heterogeneous arrays, only the name of the + % closest shared ancestor class is displayed in bold. All other + % class names are not bolded. + opts.BoldFont(1, 1) logical + end + + if opts.BoldFont + link = compose("<a href=""matlab:helpPopup %s"" style=""font-weight:bold"">%s</a>", ... + opts.FullClassName, opts.ClassName); + else + link = compose("<a href=""matlab:helpPopup %s"">%s</a>", ... + opts.FullClassName, opts.ClassName); + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/verify.m b/matlab/src/matlab/+arrow/+internal/+test/+display/verify.m new file mode 100644 index 0000000000000..d9a420663b783 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/verify.m @@ -0,0 +1,32 @@ +%VERIFY Utility function used to verify object display. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function verify(testCase, actualDisplay, expectedDisplay) + % When the MATLAB GUI is running, '×' (char(215)) is used as + % the delimiter between dimension values. However, when the + % GUI is not running, 'x' (char(120)) is used as the delimiter. + % To account for this discrepancy, check if actualDisplay + % contains char(215). If not, replace all instances of + % char(215) in expectedDisplay with char(120). + + tf = contains(actualDisplay, char(215)); + if ~tf + idx = strfind(expectedDisplay, char(215)); + expectedDisplay(idx) = char(120); + end + testCase.verifyEqual(actualDisplay, expectedDisplay); +end diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m b/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m new file mode 100644 index 0000000000000..8d6740b195abc --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m @@ -0,0 +1,50 @@ +%DISPLAYSCHEMA Generates arrow.tabular.Schema display text. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License.
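% Illustrative output of the function defined below (a sketch): for a schema
% with fields "Letter" (String) and "Number" (Float64), displaySchema returns
% a single indented line. In desktop mode the names are wrapped in <strong>
% tags and the type IDs become matlab:helpPopup hyperlinks; otherwise the
% text is plain:
%
%     Letter: String | Number: Float64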
+ +function text = displaySchema(schema) + fields = schema.Fields; + names = [fields.Name]; + types = [fields.Type]; + typeIDs = string([types.ID]); + + % Use <empty> as the sentinel for field names with zero characters. + idx = strlength(names) == 0; + names(idx) = "<empty>"; + + if usejava("desktop") + % When in desktop mode, the Command Window can interpret HTML tags + % to display bold font and hyperlinks. + names = compose("<strong>%s</strong>", names); + classNames = arrayfun(@(type) string(class(type)), types); + + % Creates a string array with the following form: + % + % ["arrow.type.BooleanType" "Boolean" "arrow.type.StringType" "String" ...] + % + % This string array is passed to the compose call below. The + % format specifier supplied to compose contains two + % formatting operators (%s), so compose uses two elements from the + % string array (classNameAndIDs) at a time. + classNameAndIDs = strings([1 numel(typeIDs) * 2]); + classNameAndIDs(1:2:end-1) = classNames; + classNameAndIDs(2:2:end) = typeIDs; + typeIDs = compose("<a href=""matlab:helpPopup %s"" style=""font-weight:bold"">%s</a>", classNameAndIDs); + end + + text = names + ": " + typeIDs; + text = " " + strjoin(text, " | "); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/Schema.m b/matlab/src/matlab/+arrow/+tabular/Schema.m index f679b1e0bc22c..3ee40f0e14293 100644 --- a/matlab/src/matlab/+arrow/+tabular/Schema.m +++ b/matlab/src/matlab/+arrow/+tabular/Schema.m @@ -97,18 +97,29 @@ end end - methods (Access = private) + methods (Access=protected) - function str = toString(obj) - str = obj.Proxy.toString(); + function header = getHeader(obj) + name = matlab.mixin.CustomDisplay.getClassNameForHeader(obj); + numFields = obj.NumFields; + if numFields == 0 + header = compose(" Arrow %s with 0 fields" + newline, name); + elseif numFields == 1 + header = compose(" Arrow %s with %d field:" + newline, name, numFields); + else + header = compose(" Arrow %s with %d fields:" + newline, name, numFields); + end end - end + function displayScalarObject(obj) + disp(getHeader(obj)); + numFields = obj.NumFields; - methods (Access=protected) + if numFields > 0 + text = arrow.tabular.internal.displaySchema(obj); + disp(text + newline); + end - function displayScalarObject(obj) - disp(obj.toString()); end end diff --git a/matlab/src/matlab/+arrow/+type/+traits/StructTraits.m b/matlab/src/matlab/+arrow/+type/+traits/StructTraits.m index 0f8b7b3a2a663..adab036f27855 100644 --- a/matlab/src/matlab/+arrow/+type/+traits/StructTraits.m +++ b/matlab/src/matlab/+arrow/+type/+traits/StructTraits.m @@ -19,10 +19,7 @@ ArrayConstructor = @arrow.array.StructArray ArrayClassName = "arrow.array.StructArray" ArrayProxyClassName = "arrow.array.proxy.StructArray" - - % TODO: Implement fromMATLAB - ArrayStaticConstructor = missing - + ArrayStaticConstructor = @arrow.array.StructArray.fromMATLAB TypeConstructor = @arrow.type.StructType TypeClassName = "arrow.type.StructType" TypeProxyClassName = "arrow.type.proxy.StructType" diff --git a/matlab/src/matlab/+arrow/+type/+traits/traits.m b/matlab/src/matlab/+arrow/+type/+traits/traits.m index f737108ce5f76..9badf63eebb81 100644 --- a/matlab/src/matlab/+arrow/+type/+traits/traits.m +++ b/matlab/src/matlab/+arrow/+type/+traits/traits.m @@ -91,6 +91,8 @@ typeTraits = TimestampTraits(); case "duration" typeTraits = Time64Traits(); + case "table" + typeTraits = StructTraits(); otherwise error("arrow:type:traits:UnsupportedMatlabClass", "Unsupported MATLAB class: " + type); end diff --git a/matlab/src/matlab/+arrow/+type/Field.m b/matlab/src/matlab/+arrow/+type/Field.m
index f67ba69fe9826..d6e03f61fbea1 100644 --- a/matlab/src/matlab/+arrow/+type/Field.m +++ b/matlab/src/matlab/+arrow/+type/Field.m @@ -91,16 +91,10 @@ end end - methods (Access = private) - function str = toString(obj) - str = obj.Proxy.toString(); - end - end - methods (Access=protected) - function displayScalarObject(obj) - disp(obj.toString()); + function groups = getPropertyGroups(~) + targets = ["Name", "Type"]; + groups = matlab.mixin.util.PropertyGroup(targets); end end - end diff --git a/matlab/src/matlab/+arrow/array.m b/matlab/src/matlab/+arrow/array.m index 983b3c88680c4..e34eb8b3fcc6c 100644 --- a/matlab/src/matlab/+arrow/array.m +++ b/matlab/src/matlab/+arrow/array.m @@ -13,46 +13,42 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -function arrowArray = array(data, opts) - arguments - data - opts.InferNulls(1, 1) logical = true - opts.Valid - end +function arrowArray = array(data, varargin) data = convertCellstrToString(data); classname = string(class(data)); - args = namedargs2cell(opts); switch (classname) case "logical" - arrowArray = arrow.array.BooleanArray.fromMATLAB(data, args{:}); + arrowArray = arrow.array.BooleanArray.fromMATLAB(data, varargin{:}); case "uint8" - arrowArray = arrow.array.UInt8Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.UInt8Array.fromMATLAB(data, varargin{:}); case "uint16" - arrowArray = arrow.array.UInt16Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.UInt16Array.fromMATLAB(data, varargin{:}); case "uint32" - arrowArray = arrow.array.UInt32Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.UInt32Array.fromMATLAB(data, varargin{:}); case "uint64" - arrowArray = arrow.array.UInt64Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.UInt64Array.fromMATLAB(data, varargin{:}); case "int8" - arrowArray = arrow.array.Int8Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Int8Array.fromMATLAB(data, varargin{:}); case "int16" - arrowArray = arrow.array.Int16Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Int16Array.fromMATLAB(data, varargin{:}); case "int32" - arrowArray = arrow.array.Int32Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Int32Array.fromMATLAB(data, varargin{:}); case "int64" - arrowArray = arrow.array.Int64Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Int64Array.fromMATLAB(data, varargin{:}); case "single" - arrowArray = arrow.array.Float32Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Float32Array.fromMATLAB(data, varargin{:}); case "double" - arrowArray = arrow.array.Float64Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Float64Array.fromMATLAB(data, varargin{:}); case "string" - arrowArray = arrow.array.StringArray.fromMATLAB(data, args{:}); + arrowArray = arrow.array.StringArray.fromMATLAB(data, varargin{:}); case "datetime" - arrowArray = arrow.array.TimestampArray.fromMATLAB(data, args{:}); + arrowArray = arrow.array.TimestampArray.fromMATLAB(data, varargin{:}); case "duration" - arrowArray = arrow.array.Time64Array.fromMATLAB(data, args{:}); + arrowArray = arrow.array.Time64Array.fromMATLAB(data, varargin{:}); + case "table" + arrowArray = arrow.array.StructArray.fromMATLAB(data, varargin{:}); otherwise errid = "arrow:array:UnsupportedMATLABType"; msg = join(["Unable to convert MATLAB type" classname "to arrow array."]); diff --git a/matlab/test/arrow/array/tArray.m b/matlab/test/arrow/array/tArray.m index 54b31270b25d2..545d382ddf7f4 100644 --- 
a/matlab/test/arrow/array/tArray.m +++ b/matlab/test/arrow/array/tArray.m @@ -32,7 +32,8 @@ {[1 2], "arrow.array.Float64Array"}, ... {datetime(2022, 1, 1), "arrow.array.TimestampArray"}, ... {seconds([1 2]), "arrow.array.Time64Array"}, ... - {["A" "B"], "arrow.array.StringArray"}}; + {["A" "B"], "arrow.array.StringArray"}, ... + {table(["A" "B"]'), "arrow.array.StructArray"}}; end methods(Test) @@ -50,7 +51,7 @@ function UnsupportedMATLABTypeError(testCase) % Verify arrow.array throws an error with the identifier % "arrow:array:UnsupportedMATLABType" if the input array is not one % we support converting into an Arrow array. - matlabArray = table; + matlabArray = {table}; fcn = @() arrow.array(matlabArray); errID = "arrow:array:UnsupportedMATLABType"; testCase.verifyError(fcn, errID); diff --git a/matlab/test/arrow/array/tStructArray.m b/matlab/test/arrow/array/tStructArray.m index 639df65befbf5..83e902ee2fa23 100644 --- a/matlab/test/arrow/array/tStructArray.m +++ b/matlab/test/arrow/array/tStructArray.m @@ -273,5 +273,91 @@ function IsEqualFalse(tc) tc.verifyFalse(isequal(array1, array3)); end + function FromMATLABBasic(tc) + % Verify StructArray.fromMATLAB returns the expected + % StructArray. + import arrow.array.StructArray + + T = table([1 2]', ["A1" "A2"]', VariableNames=["Number" "String"]); + array = StructArray.fromMATLAB(T); + tc.verifyEqual(array.Length, int64(2)); + tc.verifyEqual(array.NumFields, int32(2)); + tc.verifyEqual(array.FieldNames, ["Number" "String"]); + + field1 = arrow.array([1 2]'); + field2 = arrow.array(["A1" "A2"]'); + + tc.verifyEqual(field1, array.field(1)); + tc.verifyEqual(field2, array.field(2)); + end + + function FromMATLABFieldNames(tc) + % Verify StructArray.fromMATLAB returns the expected + % StructArray when the FieldNames nv-pair is supplied. + import arrow.array.StructArray + + T = table([1 2]', ["A1" "A2"]', VariableNames=["Number" "String"]); + array = StructArray.fromMATLAB(T, FieldNames=["Custom" "Name"]); + tc.verifyEqual(array.Length, int64(2)); + tc.verifyEqual(array.NumFields, int32(2)); + tc.verifyEqual(array.FieldNames, ["Custom" "Name"]); + tc.verifyEqual(array.Valid, [true; true]); + + field1 = arrow.array([1 2]'); + field2 = arrow.array(["A1" "A2"]'); + + tc.verifyEqual(field1, array.field(1)); + tc.verifyEqual(field2, array.field(2)); + end + + function FromMATLABValid(tc) + % Verify StructArray.fromMATLAB returns the expected + % StructArray when the Valid nv-pair is supplied. + + import arrow.array.StructArray + + T = table([1 2]', ["A1" "A2"]', VariableNames=["Number" "String"]); + array = StructArray.fromMATLAB(T, Valid=2); + tc.verifyEqual(array.Length, int64(2)); + tc.verifyEqual(array.NumFields, int32(2)); + tc.verifyEqual(array.FieldNames, ["Number" "String"]); + tc.verifyEqual(array.Valid, [false; true]); + + field1 = arrow.array([1 2]'); + field2 = arrow.array(["A1" "A2"]'); + + tc.verifyEqual(field1, array.field(1)); + tc.verifyEqual(field2, array.field(2)); + end + + function FromMATLABZeroVariablesError(tc) + % Verify StructArray.fromMATLAB throws an error when the input + % table T has zero variables. + import arrow.array.StructArray + + fcn = @() StructArray.fromMATLAB(table); + tc.verifyError(fcn, "arrow:struct:ZeroVariables"); + end + + function FromMATLABWrongNumberFieldNames(tc) + % Verify StructArray.fromMATLAB throws an error when the + % FieldNames nv-pair is provided and its number of elements + % does not equal the number of variables in the input table T. 
+ + import arrow.array.StructArray + + fcn = @() StructArray.fromMATLAB(table(1), FieldNames=["A" "B"]); + tc.verifyError(fcn, "arrow:tabular:WrongNumberColumnNames"); + end + + function FromMATLABValidNVPairBadIndex(tc) + % Verify StructArray.fromMATLAB throws an error when the + % Valid nv-pair is provided and it contains an invalid index. + + import arrow.array.StructArray + + fcn = @() StructArray.fromMATLAB(table(1), Valid=2); + tc.verifyError(fcn, "MATLAB:notLessEqual"); + end end end \ No newline at end of file diff --git a/matlab/test/arrow/tabular/tSchema.m b/matlab/test/arrow/tabular/tSchema.m index e4c706d9a3d6c..bb95c1823b9fc 100644 --- a/matlab/test/arrow/tabular/tSchema.m +++ b/matlab/test/arrow/tabular/tSchema.m @@ -526,7 +526,70 @@ function TestIsEqualFalse(testCase) % Compare schema to double testCase.verifyFalse(isequal(schema4, 5)); + end + + function TestDisplaySchemaZeroFields(testCase) + import arrow.internal.test.display.makeLinkString + + schema = arrow.schema(arrow.type.Field.empty(0, 0)); %#ok + classnameLink = makeLinkString(FullClassName="arrow.tabular.Schema",... + ClassName="Schema", BoldFont=true); + expectedDisplay = " Arrow " + classnameLink + " with 0 fields" + newline; + expectedDisplay = char(expectedDisplay + newline); + actualDisplay = evalc('disp(schema)'); + testCase.verifyEqual(actualDisplay, char(expectedDisplay)); + end + + function TestDisplaySchemaOneField(testCase) + import arrow.internal.test.display.makeLinkString + + schema = arrow.schema(arrow.field("TestField", arrow.boolean())); %#ok + classnameLink = makeLinkString(FullClassName="arrow.tabular.Schema",... + ClassName="Schema", BoldFont=true); + header = " Arrow " + classnameLink + " with 1 field:" + newline; + indent = " "; + + if usejava("desktop") + type = makeLinkString(FullClassName="arrow.type.BooleanType", ... + ClassName="Boolean", BoldFont=true); + name = "<strong>TestField</strong>: "; + fieldLine = indent + name + type + newline; + else + fieldLine = indent + "TestField: Boolean" + newline; + end + expectedDisplay = join([header, fieldLine], newline); + expectedDisplay = char(expectedDisplay + newline); + actualDisplay = evalc('disp(schema)'); + testCase.verifyEqual(actualDisplay, char(expectedDisplay)); + end + function TestDisplaySchemaField(testCase) + import arrow.internal.test.display.makeLinkString + + field1 = arrow.field("Field1", arrow.timestamp()); + field2 = arrow.field("Field2", arrow.string()); + schema = arrow.schema([field1, field2]); %#ok + classnameLink = makeLinkString(FullClassName="arrow.tabular.Schema",... + ClassName="Schema", BoldFont=true); + header = " Arrow " + classnameLink + " with 2 fields:" + newline; + + indent = " "; + if usejava("desktop") + type1 = makeLinkString(FullClassName="arrow.type.TimestampType", ... + ClassName="Timestamp", BoldFont=true); + field1String = "<strong>Field1</strong>: " + type1; + type2 = makeLinkString(FullClassName="arrow.type.StringType", ...
+ ClassName="String", BoldFont=true); + field2String = "Field2: " + type2; + fieldLine = indent + field1String + " | " + field2String + newline; + else + fieldLine = indent + "Field1: Timestamp | Field2: String" + newline; + end + + expectedDisplay = join([header, fieldLine], newline); + expectedDisplay = char(expectedDisplay + newline); + actualDisplay = evalc('disp(schema)'); + testCase.verifyEqual(actualDisplay, char(expectedDisplay)); end end diff --git a/matlab/test/arrow/type/tField.m b/matlab/test/arrow/type/tField.m index 1a89c0077b5ae..f84034d032c23 100644 --- a/matlab/test/arrow/type/tField.m +++ b/matlab/test/arrow/type/tField.m @@ -231,5 +231,33 @@ function TestIsEqualNonScalarFalse(testCase) % Compare arrow.type.Field array and a string array testCase.verifyFalse(isequal(f1, strings(size(f1)))); end + + function TestDisplay(testCase) + % Verify the display of Field objects. + % + % Example: + % + % Field with properties: + % + % Name: FieldA + % Type: [1x2 arrow.type.Int32Type] + + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + import arrow.internal.test.display.makeDimensionString + + field = arrow.field("B", arrow.timestamp(TimeZone="America/Anchorage")); %#ok + classnameLink = makeLinkString(FullClassName="arrow.type.Field", ClassName="Field", BoldFont=true); + header = " " + classnameLink + " with properties:" + newline; + body = strjust(pad(["Name:"; "Type:"])); + dimensionString = makeDimensionString([1 1]); + fieldString = compose("[%s %s]", dimensionString, "arrow.type.TimestampType"); + body = body + " " + ["""B"""; fieldString]; + body = " " + body; + footer = string(newline); + expectedDisplay = char(strjoin([header body' footer], newline)); + actualDisplay = evalc('disp(field)'); + verify(testCase, actualDisplay, expectedDisplay); + end end end diff --git a/matlab/test/arrow/type/tTypeDisplay.m b/matlab/test/arrow/type/tTypeDisplay.m index f84c5ab56e270..6f5a4bcd97717 100644 --- a/matlab/test/arrow/type/tTypeDisplay.m +++ b/matlab/test/arrow/type/tTypeDisplay.m @@ -50,6 +50,10 @@ function EmptyTypeDisplay(testCase) % % ID + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + import arrow.internal.test.display.makeDimensionString + type = arrow.type.Type.empty(0, 1); typeLink = makeLinkString(FullClassName="arrow.type.Type", ClassName="Type", BoldFont=true); dimensionString = makeDimensionString(size(type)); @@ -59,7 +63,7 @@ function EmptyTypeDisplay(testCase) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(type)'); - testCase.verifyDisplay(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function NonScalarArrayDifferentTypes(testCase) @@ -71,6 +75,10 @@ function NonScalarArrayDifferentTypes(testCase) % % ID + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + import arrow.internal.test.display.makeDimensionString + float32Type = arrow.float32(); timestampType = arrow.timestamp(); typeArray = [float32Type timestampType]; @@ -88,7 +96,7 @@ function NonScalarArrayDifferentTypes(testCase) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(typeArray)'); - testCase.verifyDisplay(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function NonScalarArraySameTypes(testCase) @@ -102,6 +110,10 @@ function 
NonScalarArraySameTypes(testCase) % TimeUnit % TimeZone + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + import arrow.internal.test.display.makeDimensionString + timestampType1 = arrow.timestamp(TimeZone="Pacific/Fiji"); timestampType2 = arrow.timestamp(TimeUnit="Second"); typeArray = [timestampType1 timestampType2]; @@ -114,7 +126,7 @@ function NonScalarArraySameTypes(testCase) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(typeArray)'); - testCase.verifyDisplay(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function TestTypeDisplaysOnlyID(testCase, TypeDisplaysOnlyID) @@ -127,6 +139,9 @@ function TestTypeDisplaysOnlyID(testCase, TypeDisplaysOnlyID) % % ID: Boolean + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + type = TypeDisplaysOnlyID; fullClassName = string(class(type)); className = reverse(extractBefore(reverse(fullClassName), ".")); @@ -136,7 +151,7 @@ function TestTypeDisplaysOnlyID(testCase, TypeDisplaysOnlyID) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(type)'); - testCase.verifyDisplay(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function TestTimeType(testCase, TimeType) @@ -149,6 +164,9 @@ function TestTimeType(testCase, TimeType) % ID: Time32 % TimeUnit: Second + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + type = TimeType; fullClassName = string(class(type)); className = reverse(extractBefore(reverse(fullClassName), ".")); @@ -161,7 +179,7 @@ function TestTimeType(testCase, TimeType) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(type)'); - testCase.verifyEqual(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function TestDateType(testCase, DateType) @@ -174,6 +192,9 @@ function TestDateType(testCase, DateType) % ID: Date32 % DateUnit: Day + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + type = DateType; fullClassName = string(class(type)); className = reverse(extractBefore(reverse(fullClassName), ".")); @@ -186,7 +207,7 @@ function TestDateType(testCase, DateType) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(type)'); - testCase.verifyEqual(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function TimestampTypeDisplay(testCase) @@ -200,6 +221,9 @@ function TimestampTypeDisplay(testCase) % TimeUnit: Second % TimeZone: "America/Anchorage" + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + type = arrow.timestamp(TimeUnit="Second", TimeZone="America/Anchorage"); %#ok classnameLink = makeLinkString(FullClassName="arrow.type.TimestampType", ClassName="TimestampType", BoldFont=true); header = " " + classnameLink + " with properties:" + newline; @@ -209,7 +233,7 @@ function TimestampTypeDisplay(testCase) footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(type)'); - testCase.verifyEqual(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end function StructTypeDisplay(testCase) @@ -222,6 
+246,10 @@ function StructTypeDisplay(testCase) % ID: Struct % Fields: [1x2 arrow.type.Field] + import arrow.internal.test.display.verify + import arrow.internal.test.display.makeLinkString + import arrow.internal.test.display.makeDimensionString + fieldA = arrow.field("A", arrow.int32()); fieldB = arrow.field("B", arrow.timestamp(TimeZone="America/Anchorage")); type = arrow.struct(fieldA, fieldB); %#ok @@ -235,48 +263,7 @@ footer = string(newline); expectedDisplay = char(strjoin([header body' footer], newline)); actualDisplay = evalc('disp(type)'); - testCase.verifyDisplay(actualDisplay, expectedDisplay); + verify(testCase, actualDisplay, expectedDisplay); end end - - methods - function verifyDisplay(testCase, actualDisplay, expectedDisplay) - % When the MATLAB GUI is running, '×' (char(215)) is used as - % the delimiter between dimension values. However, when the - % GUI is not running, 'x' (char(120)) is used as the delimiter. - % To account for this discrepancy, check if actualDisplay - % contains char(215). If not, replace all instances of - % char(215) in expectedDisplay with char(120). - - tf = contains(actualDisplay, char(215)); - if ~tf - idx = strfind(expectedDisplay, char(215)); - expectedDisplay(idx) = char(120); - end - testCase.verifyEqual(actualDisplay, expectedDisplay); - end - end -end - -function link = makeLinkString(opts) - arguments - opts.FullClassName(1, 1) string - opts.ClassName(1, 1) string - % When displaying heterogeneous arrays, only the name of the - % closest shared ancestor class is displayed in bold. All other - % class names are not bolded. - opts.BoldFont(1, 1) logical - end - - if opts.BoldFont - link = compose("<a href=""matlab:helpPopup %s"" style=""font-weight:bold"">%s</a>", ... - opts.FullClassName, opts.ClassName); - else - link = compose("<a href=""matlab:helpPopup %s"">%s</a>", opts.FullClassName, opts.ClassName); - end end - -function dimensionString = makeDimensionString(arraySize) - dimensionString = string(arraySize); - dimensionString = join(dimensionString, char(215)); -end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tStructTraits.m b/matlab/test/arrow/type/traits/tStructTraits.m index 07833aca162b5..dad9ec012f3e5 100644 --- a/matlab/test/arrow/type/traits/tStructTraits.m +++ b/matlab/test/arrow/type/traits/tStructTraits.m @@ -20,7 +20,7 @@ ArrayConstructor = @arrow.array.StructArray ArrayClassName = "arrow.array.StructArray" ArrayProxyClassName = "arrow.array.proxy.StructArray" - ArrayStaticConstructor = missing + ArrayStaticConstructor = @arrow.array.StructArray.fromMATLAB TypeConstructor = @arrow.type.StructType TypeClassName = "arrow.type.StructType" TypeProxyClassName = "arrow.type.proxy.StructType" diff --git a/matlab/test/arrow/type/traits/ttraits.m b/matlab/test/arrow/type/traits/ttraits.m index 2880645f2957c..d2d80b3f8f8f5 100644 --- a/matlab/test/arrow/type/traits/ttraits.m +++ b/matlab/test/arrow/type/traits/ttraits.m @@ -365,6 +365,17 @@ function TestMatlabDuration(testCase) testCase.verifyEqual(actualTraits, expectedTraits); end + function TestMatlabTable(testCase) + import arrow.type.traits.* + + type = "table"; + expectedTraits = StructTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + function TestErrorIfUnsupportedMatlabClass(testCase) import arrow.type.traits.* diff --git a/r/NAMESPACE b/r/NAMESPACE index 21f88b4180d24..d49255f781f94 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -112,6 +112,7 @@ S3method(infer_schema,ArrowTabular) S3method(infer_schema,Dataset) S3method(infer_schema,RecordBatchReader)
S3method(infer_schema,arrow_dplyr_query) +S3method(infer_schema,data.frame) S3method(infer_type,ArrowDatum) S3method(infer_type,Expression) S3method(infer_type,blob) diff --git a/r/R/csv.R b/r/R/csv.R index b119d16a84c06..116c620f83490 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -404,10 +404,10 @@ CsvTableReader$create <- function(file, #' #' - `delimiter` Field delimiting character (default `","`) #' - `quoting` Logical: are strings quoted? (default `TRUE`) -#' - `quote_char` Quoting character, if `quoting` is `TRUE` +#' - `quote_char` Quoting character, if `quoting` is `TRUE` (default `'"'`) #' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`) #' - `escaping` Logical: whether escaping is used (default `FALSE`) -#' - `escape_char` Escaping character, if `escaping` is `TRUE` +#' - `escape_char` Escaping character, if `escaping` is `TRUE` (default `"\\"`) #' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`) #' and LF (`0x0a`) characters? (default `FALSE`) #' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or diff --git a/r/R/dataset-factory.R b/r/R/dataset-factory.R index adb7353a043b9..d3d4f639e3729 100644 --- a/r/R/dataset-factory.R +++ b/r/R/dataset-factory.R @@ -49,7 +49,7 @@ DatasetFactory$create <- function(x, } if (is.character(format)) { - format <- FileFormat$create(match.arg(format), ...) + format <- FileFormat$create(match.arg(format), partitioning = partitioning, ...) } else { assert_is(format, "FileFormat") } diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index e1f434d60cd50..cdaaf08827bfd 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -74,13 +74,14 @@ FileFormat <- R6Class("FileFormat", type = function() dataset___FileFormat__type_name(self) ) ) -FileFormat$create <- function(format, schema = NULL, ...) { + +FileFormat$create <- function(format, schema = NULL, partitioning = NULL, ...) { opt_names <- names(list(...)) if (format %in% c("csv", "text", "txt") || any(opt_names %in% c("delim", "delimiter"))) { - CsvFileFormat$create(schema = schema, ...) + CsvFileFormat$create(schema = schema, partitioning = partitioning, ...) } else if (format == "tsv") { # This delimiter argument is ignored. - CsvFileFormat$create(delimiter = "\t", schema = schema, ...) + CsvFileFormat$create(delimiter = "\t", schema = schema, partitioning = partitioning, ...) } else if (format == "parquet") { ParquetFileFormat$create(...) } else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing @@ -189,16 +190,19 @@ JsonFileFormat$create <- function(...) { #' #' @export CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat) -CsvFileFormat$create <- function(...) { +CsvFileFormat$create <- function(..., partitioning = NULL) { + dots <- list(...) 
- options <- check_csv_file_format_args(dots) - check_schema(options[["schema"]], options[["read_options"]]$column_names) + + options <- check_csv_file_format_args(dots, partitioning = partitioning) + check_schema(options[["schema"]], partitioning, options[["read_options"]]$column_names) dataset___CsvFileFormat__Make(options$parse_options, options$convert_options, options$read_options) } # Check all arguments are valid -check_csv_file_format_args <- function(args) { +check_csv_file_format_args <- function(args, partitioning = NULL) { + options <- list( parse_options = args$parse_options, convert_options = args$convert_options, @@ -223,7 +227,7 @@ check_csv_file_format_args <- function(args) { } if (is.null(args$read_options)) { - options$read_options <- do.call(csv_file_format_read_opts, args) + options$read_options <- do.call(csv_file_format_read_opts, c(args, list(partitioning = partitioning))) } else if (is.list(args$read_options)) { options$read_options <- do.call(CsvReadOptions$create, args$read_options) } @@ -235,7 +239,7 @@ check_unsupported_args <- function(args) { opt_names <- get_opt_names(args) # Filter out arguments meant for CsvConvertOptions/CsvReadOptions - supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na") + supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "quoted_na") supported_read_opts <- c( names(formals(CsvReadOptions$create)), @@ -308,7 +312,8 @@ check_unrecognised_args <- function(opts) { readr_opts <- c( names(formals(readr_to_csv_parse_options)), names(formals(readr_to_csv_read_options)), - "na" + "na", + "quoted_na" ) is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts)) @@ -339,7 +344,7 @@ check_ambiguous_options <- function(passed_opts, opts1, opts2) { } } -check_schema <- function(schema, column_names) { +check_schema <- function(schema, partitioning, column_names) { if (!is.null(schema) && !inherits(schema, "Schema")) { abort(paste0( "`schema` must be an object of class 'Schema' not '", @@ -348,7 +353,7 @@ check_schema <- function(schema, column_names) { )) } - schema_names <- names(schema) + schema_names <- setdiff(names(schema), names(partitioning)) if (!is.null(schema) && !identical(schema_names, column_names)) { missing_from_schema <- setdiff(column_names, schema_names) @@ -390,7 +395,7 @@ check_schema <- function(schema, column_names) { csv_file_format_parse_opts <- function(...) { opts <- list(...) # Filter out arguments meant for CsvConvertOptions/CsvReadOptions - convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options") + convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "quoted_na", "convert_options") read_opts <- c( names(formals(CsvReadOptions$create)), names(formals(readr_to_csv_read_options)), @@ -448,15 +453,21 @@ csv_file_format_convert_opts <- function(...) { opts[["na"]] <- NULL } + if ("quoted_na" %in% names(opts)) { + opts[["strings_can_be_null"]] <- opts[["quoted_na"]] + opts[["quoted_na"]] <- NULL + } + do.call(CsvConvertOptions$create, opts) } -csv_file_format_read_opts <- function(schema = NULL, ...) { +csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...) { + opts <- list(...) 
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options") readr_opts <- names(formals(readr_to_csv_parse_options)) - convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options") + convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "quoted_na", "convert_options") opts[arrow_opts] <- NULL opts[readr_opts] <- NULL opts[convert_opts] <- NULL @@ -468,7 +479,6 @@ csv_file_format_read_opts <- function(schema = NULL, ...) { is_arrow_opt <- !is.na(match(opt_names, arrow_opts)) is_readr_opt <- !is.na(match(opt_names, readr_opts)) - check_ambiguous_options(opt_names, arrow_opts, readr_opts) null_or_true <- function(x) { @@ -477,9 +487,9 @@ csv_file_format_read_opts <- function(schema = NULL, ...) { if (!is.null(schema) && null_or_true(opts[["column_names"]]) && null_or_true(opts[["col_names"]])) { if (any(is_readr_opt)) { - opts[["col_names"]] <- names(schema) + opts[["col_names"]] <- setdiff(names(schema), names(partitioning)) } else { - opts[["column_names"]] <- names(schema) + opts[["column_names"]] <- setdiff(names(schema), names(partitioning)) } } diff --git a/r/R/dataset.R b/r/R/dataset.R index b7728ff897fff..9d91839c220bb 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -240,7 +240,6 @@ open_dataset <- function(sources, #' @section Options currently supported by [read_delim_arrow()] which are not supported here: #' * `file` (instead, please specify files in `sources`) #' * `col_select` (instead, subset columns after dataset creation) -#' * `quoted_na` #' * `as_data_frame` (instead, convert to data frame after dataset creation) #' * `parse_options` #' @@ -276,7 +275,8 @@ open_delim_dataset <- function(sources, skip = 0L, convert_options = NULL, read_options = NULL, - timestamp_parsers = NULL) { + timestamp_parsers = NULL, + quoted_na = TRUE) { open_dataset( sources = sources, schema = schema, @@ -296,7 +296,8 @@ open_delim_dataset <- function(sources, skip = skip, convert_options = convert_options, read_options = read_options, - timestamp_parsers = timestamp_parsers + timestamp_parsers = timestamp_parsers, + quoted_na = quoted_na ) } @@ -318,7 +319,8 @@ open_csv_dataset <- function(sources, skip = 0L, convert_options = NULL, read_options = NULL, - timestamp_parsers = NULL) { + timestamp_parsers = NULL, + quoted_na = TRUE) { mc <- match.call() mc$delim <- "," mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow")) @@ -343,7 +345,8 @@ open_tsv_dataset <- function(sources, skip = 0L, convert_options = NULL, read_options = NULL, - timestamp_parsers = NULL) { + timestamp_parsers = NULL, + quoted_na = TRUE) { mc <- match.call() mc$delim <- "\t" mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow")) diff --git a/r/R/schema.R b/r/R/schema.R index 1ad18e314191e..ac0604b2b345c 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -285,6 +285,9 @@ infer_schema.Dataset <- function(x) x$schema #' @export infer_schema.arrow_dplyr_query <- function(x) implicit_schema(x) +#' @export +infer_schema.data.frame <- function(x) schema(!!!lapply(x, infer_type)) + #' @export names.Schema <- function(x) x$names diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd index a18ff959ce7e5..6ebb2355184c1 100644 --- a/r/man/CsvReadOptions.Rd +++ b/r/man/CsvReadOptions.Rd @@ -52,10 +52,10 @@ The order of application is as follows: \itemize{ \item \code{delimiter} Field delimiting character (default \code{","}) \item \code{quoting} Logical: are strings quoted? 
(default \code{TRUE}) -\item \code{quote_char} Quoting character, if \code{quoting} is \code{TRUE} +\item \code{quote_char} Quoting character, if \code{quoting} is \code{TRUE} (default \code{'"'}) \item \code{double_quote} Logical: are quotes inside values double-quoted? (default \code{TRUE}) \item \code{escaping} Logical: whether escaping is used (default \code{FALSE}) -\item \code{escape_char} Escaping character, if \code{escaping} is \code{TRUE} +\item \code{escape_char} Escaping character, if \code{escaping} is \code{TRUE} (default \code{"\\\\"}) \item \code{newlines_in_values} Logical: are values allowed to contain CR (\code{0x0d}) and LF (\code{0x0a}) characters? (default \code{FALSE}) \item \code{ignore_empty_lines} Logical: should empty lines be ignored (default) or diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd index 2bfd047040a8b..cf08302cc6436 100644 --- a/r/man/open_delim_dataset.Rd +++ b/r/man/open_delim_dataset.Rd @@ -24,7 +24,8 @@ open_delim_dataset( skip = 0L, convert_options = NULL, read_options = NULL, - timestamp_parsers = NULL + timestamp_parsers = NULL, + quoted_na = TRUE ) open_csv_dataset( @@ -44,7 +45,8 @@ open_csv_dataset( skip = 0L, convert_options = NULL, read_options = NULL, - timestamp_parsers = NULL + timestamp_parsers = NULL, + quoted_na = TRUE ) open_tsv_dataset( @@ -64,7 +66,8 @@ open_tsv_dataset( skip = 0L, convert_options = NULL, read_options = NULL, - timestamp_parsers = NULL + timestamp_parsers = NULL, + quoted_na = TRUE ) } \arguments{ @@ -178,6 +181,11 @@ starting from the beginning of this vector. Possible values are: \item a character vector of \link[base:strptime]{strptime} parse strings \item a list of \link{TimestampParser} objects }} + +\item{quoted_na}{Should missing values inside quotes be treated as missing +values (the default) or strings. (Note that this is different from the +Arrow C++ default for the corresponding convert option, +\code{strings_can_be_null}.)} } \description{ A wrapper around \link{open_dataset} which explicitly includes parameters mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}}, @@ -189,7 +197,6 @@ for opening single files and functions for opening datasets. \itemize{ \item \code{file} (instead, please specify files in \code{sources}) \item \code{col_select} (instead, subset columns after dataset creation) -\item \code{quoted_na} \item \code{as_data_frame} (instead, convert to data frame after dataset creation) \item \code{parse_options} } diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index 31f393a27785d..8ba513a98c85b 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,8 @@
-12.0.1.9000 (dev)
-12.0.1.1 (release)
+13.0.0.9000 (dev)
+13.0.0.1 (release)
+12.0.1.1
 11.0.0.3
 10.0.1
 9.0.0
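Reviewer note on the `quoted_na` option documented above: it is not a new Arrow C++ option. As the r/R/dataset-format.R hunk shows, `csv_file_format_convert_opts()` simply renames `quoted_na` to the existing convert option `strings_can_be_null`. A minimal R sketch of the equivalence follows; the temporary directory and sample CSV are illustrative, not part of this patch:

library(arrow)
library(dplyr)

# Sample data: the third row has an empty "text" field
tmp_dir <- tempfile()
dir.create(tmp_dir)
writeLines("text,num\none,1\ntwo,2\n,3\nfour,4", file.path(tmp_dir, "data.csv"))

# quoted_na = TRUE (the default) reads the empty field back as NA ...
open_csv_dataset(tmp_dir, quoted_na = TRUE) %>% collect()

# ... and is shorthand for setting the underlying convert option directly
open_dataset(
  tmp_dir,
  format = "csv",
  convert_options = CsvConvertOptions$create(strings_can_be_null = TRUE)
) %>% collect()

# quoted_na = FALSE keeps the field as an empty string ""
open_csv_dataset(tmp_dir, quoted_na = FALSE) %>% collect()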
diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 565f67b9730a4..b7c6984e3c660 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -4,7 +4,7 @@ "version": "dev/" }, { - "name": "13.0.0 (release)", + "name": "13.0.0.1 (release)", "version": "" }, { diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index 6812a3eec0a4e..8d39f7252ee21 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -34,6 +34,7 @@ Sys.setenv(LANGUAGE = "en") options(arrow.pull_as_vector = FALSE) with_language <- function(lang, expr) { + skip_on_cran() old <- Sys.getenv("LANGUAGE") # Check what this message is before changing languages; this will # trigger caching the translations if the OS does that (some do). diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index c83c30ff904ff..e8e7c61fc8848 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -220,7 +220,7 @@ test_that("readr parse options", { # With not yet supported readr parse options expect_error( - open_dataset(tsv_dir, partitioning = "part", delim = "\t", quoted_na = TRUE), + open_dataset(tsv_dir, partitioning = "part", delim = "\t", col_select = "integer"), "supported" ) @@ -253,7 +253,7 @@ test_that("readr parse options", { tsv_dir, partitioning = "part", format = "text", - quo = "\"", + del = "," ), "Ambiguous" ) @@ -561,6 +561,16 @@ test_that("open_delim_dataset params passed through to open_dataset", { expect_named(ds, c("int", "dbl", "lgl", "chr", "fct", "ts")) + # quoted_na + dst_dir <- make_temp_dir() + dst_file <- file.path(dst_dir, "data.csv") + writeLines("text,num\none,1\ntwo,2\n,3\nfour,4", dst_file) + ds <- open_csv_dataset(dst_dir, quoted_na = TRUE) %>% collect() + expect_equal(ds$text, c("one", "two", NA, "four")) + + ds <- open_csv_dataset(dst_dir, quoted_na = FALSE) %>% collect() + expect_equal(ds$text, c("one", "two", "", "four")) + # timestamp_parsers skip("GH-33708: timestamp_parsers don't appear to be working properly") @@ -593,3 +603,37 @@ test_that("CSVReadOptions field access", { expect_equal(options$block_size, 1048576L) expect_equal(options$encoding, "UTF-8") }) + +test_that("GH-34640 - CSV datasets are read in correctly when both schema and partitioning supplied", { + target_schema <- schema( + int = int32(), dbl = float32(), lgl = bool(), chr = utf8(), + fct = utf8(), ts = timestamp(unit = "s"), part = int8() + ) + + ds <- open_dataset( + csv_dir, + partitioning = schema(part = int32()), + format = "csv", + schema = target_schema, + skip = 1 + ) + expect_r6_class(ds$format, "CsvFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_identical(names(ds), c(names(df1), "part")) + expect_identical(names(collect(ds)), c(names(df1), "part")) + + expect_identical(dim(ds), c(20L, 7L)) + expect_equal(schema(ds), target_schema) + + expect_equal( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 5) %>% + collect() %>% + summarize(mean = mean(as.numeric(integer))), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R index db91cee330960..15342add38fae 100644 --- a/r/tests/testthat/test-schema.R +++ b/r/tests/testthat/test-schema.R @@ -295,9 +295,18 @@ test_that("schema name assignment", { test_that("schema extraction", {
skip_if_not_available("dataset") + tbl <- arrow_table(example_data) + expect_equal(schema(example_data), tbl$schema) expect_equal(schema(tbl), tbl$schema) + expect_equal( + schema(data.frame(a = 1, a = "x", check.names = FALSE, stringsAsFactors = FALSE)), + schema(a = double(), a = string()) + ) + + expect_equal(schema(data.frame()), schema()) + ds <- InMemoryDataset$create(example_data) expect_equal(schema(ds), ds$schema)
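A closing note on the `infer_schema.data.frame` method registered in r/NAMESPACE and defined in r/R/schema.R above: it maps each column through `infer_type()`, which is what the new `schema extraction` expectations rely on. A quick sketch of the intended behavior; the inline comments restate the test expectations rather than separate output:

library(arrow)

df <- data.frame(a = 1, a = "x", check.names = FALSE, stringsAsFactors = FALSE)

# Columns are mapped one-by-one, so duplicate names survive
infer_schema(df)            # equivalent to schema(a = double(), a = string())

# A data frame with zero columns yields an empty schema
infer_schema(data.frame())  # equivalent to schema()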