From 3ac0959ac168caebb19dfbfbc8881323e694a4ae Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sun, 26 Jun 2022 09:43:31 -0400 Subject: [PATCH] ARROW-16510: [R] Add bindings for GCS filesystem (#13404) This adds basic bindings for GcsFileSystem to R, turns it on in the macOS, Windows, and Linux packaging (same handling as ARROW_S3), and basic R tests. Followups: - Bindings for FromImpersonatedServiceAccount (ARROW-16885) - Set up testbench for fuller tests, like how we do with minio (ARROW-16879) - GcsFileSystem::Make should return Result (ARROW-16884) - Explore auth integration/compatibility with `gargle`, `googleAuthR`, etc.: can we pick up the same credentials they use (ARROW-16880) - macOS binary packaging: push dependencies upstream (ARROW-16883) - Windows binary packaging: push dependencies upstream (ARROW-16878) - Update cloud/filesystem documentation (ARROW-16887) Lead-authored-by: Neal Richardson Co-authored-by: Sutou Kouhei Signed-off-by: Neal Richardson --- .github/workflows/cpp.yml | 8 +- .github/workflows/r.yml | 2 +- ci/scripts/PKGBUILD | 5 + ci/scripts/r_windows_build.sh | 6 +- ...google-cloud-cpp-curl-static-windows.patch | 31 ++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 276 +++++++++++------- cpp/src/arrow/filesystem/gcsfs.h | 1 + cpp/src/arrow/filesystem/type_fwd.h | 1 + cpp/thirdparty/versions.txt | 4 +- .../autobrew/apache-arrow.rb | 1 + dev/tasks/r/github.macos.brew.yml | 2 + dev/tasks/tasks.yml | 2 +- r/R/arrow-info.R | 11 +- r/R/arrowExports.R | 5 + r/R/filesystem.R | 79 ++++- r/configure | 36 ++- r/configure.win | 13 +- r/data-raw/codegen.R | 63 ++-- r/inst/build_arrow_static.sh | 1 + r/src/arrowExports.cpp | 27 ++ r/src/filesystem.cpp | 81 +++++ r/tests/testthat/test-gcs.R | 60 ++++ r/tools/autobrew | 1 + r/tools/nixlibs.R | 42 ++- r/vignettes/developers/setup.Rmd | 2 + r/vignettes/install.Rmd | 101 +++---- 26 files changed, 627 insertions(+), 234 deletions(-) create mode 100644 
cpp/build-support/google-cloud-cpp-curl-static-windows.patch mode change 100644 => 100755 r/configure.win create mode 100644 r/tests/testthat/test-gcs.R diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index b914b7df52f6b..acb3270a5d5bf 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -276,8 +276,12 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON - # google-could-cpp uses _dupenv_s() but it can't be used with msvcrt. - # We need to use ucrt to use _dupenv_s(). + # With GCS on, + # * MinGW 32 build OOMs (maybe turn off unity build?) + # * MinGW 64 fails to compile the GCS filesystem tests, some conflict + # with boost. First error says: + # D:/a/_temp/msys64/mingw64/include/boost/asio/detail/socket_types.hpp:24:4: error: #error WinSock.h has already been included + # TODO(ARROW-16906) # ARROW_GCS: ON ARROW_HDFS: OFF ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 48d9672c74bfc..86e006d538552 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -165,7 +165,7 @@ jobs: name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }} runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 + timeout-minutes: 90 strategy: fail-fast: false matrix: diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index b9b0194f5c8cf..ea17fba17edd0 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -25,6 +25,7 @@ arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") depends=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp" + "${MINGW_PACKAGE_PREFIX}-curl" # for google-cloud-cpp bundled build "${MINGW_PACKAGE_PREFIX}-libutf8proc" "${MINGW_PACKAGE_PREFIX}-re2" "${MINGW_PACKAGE_PREFIX}-thrift" @@ -79,11 +80,13 @@ build() { export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" export 
LIBS="-L${MINGW_PREFIX}/libs" + export ARROW_GCS=OFF export ARROW_S3=OFF export ARROW_WITH_RE2=OFF # Without this, some dataset functionality segfaults export CMAKE_UNITY_BUILD=ON else + export ARROW_GCS=ON export ARROW_S3=ON export ARROW_WITH_RE2=ON # Without this, some compute functionality segfaults in tests @@ -101,6 +104,7 @@ build() { -DARROW_CSV=ON \ -DARROW_DATASET=ON \ -DARROW_FILESYSTEM=ON \ + -DARROW_GCS="${ARROW_GCS}" \ -DARROW_HDFS=OFF \ -DARROW_JEMALLOC=OFF \ -DARROW_JSON=ON \ @@ -112,6 +116,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_UTF8PROC_USE_SHARED=OFF \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=ON \ -DARROW_WITH_LZ4=ON \ -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ -DARROW_WITH_SNAPPY=ON \ diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 89d5737a09bd0..3334eab8663a8 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -87,7 +87,7 @@ if [ -d mingw64/lib/ ]; then # These may be from https://dl.bintray.com/rtools/backports/ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x64 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ - cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 + cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 fi # Same for the 32-bit versions @@ -97,7 +97,7 @@ if [ -d mingw32/lib/ ]; then mkdir -p $DST_DIR/lib/i386 mv mingw32/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 + cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 fi # Do the same also for ucrt64 @@ -105,7 +105,7 @@ if [ -d ucrt64/lib/ ]; then ls $MSYS_LIB_DIR/ucrt64/lib/ mkdir -p $DST_DIR/lib/x64-ucrt mv ucrt64/lib/*.a 
$DST_DIR/lib/x64-ucrt - cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt + cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt fi # Create build artifact diff --git a/cpp/build-support/google-cloud-cpp-curl-static-windows.patch b/cpp/build-support/google-cloud-cpp-curl-static-windows.patch new file mode 100644 index 0000000000000..e3f849ceda1c2 --- /dev/null +++ b/cpp/build-support/google-cloud-cpp-curl-static-windows.patch @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +diff -ru google_cloud_cpp_ep.orig/cmake/FindCurlWithTargets.cmake google_cloud_cpp_ep/cmake/FindCurlWithTargets.cmake +--- google_cloud_cpp_ep.orig/cmake/FindCurlWithTargets.cmake 2022-04-05 06:00:53.000000000 +0900 ++++ google_cloud_cpp_ep/cmake/FindCurlWithTargets.cmake 2022-06-24 10:06:00.177969962 +0900 +@@ -68,6 +68,10 @@ + TARGET CURL::libcurl + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES crypt32 wsock32 ws2_32) ++ set_property( ++ TARGET CURL::libcurl ++ APPEND ++ PROPERTY INTERFACE_COMPILE_DEFINITIONS "CURL_STATICLIB") + endif () + if (APPLE) + set_property( diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 2abbb52b52907..b50b6f7983604 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -324,6 +324,7 @@ endif() if(ARROW_GCS) set(ARROW_WITH_GOOGLE_CLOUD_CPP ON) set(ARROW_WITH_NLOHMANN_JSON ON) + set(ARROW_WITH_ZLIB ON) endif() if(ARROW_JSON) @@ -475,7 +476,7 @@ else() endif() if(DEFINED ENV{ARROW_CRC32C_URL}) - set(CRC32C_URL "$ENV{ARROW_CRC32C_URL}") + set(CRC32C_SOURCE_URL "$ENV{ARROW_CRC32C_URL}") else() set_urls(CRC32C_SOURCE_URL "https://github.com/google/crc32c/archive/${ARROW_CRC32C_BUILD_VERSION}.tar.gz" @@ -2704,15 +2705,16 @@ macro(resolve_dependency_absl) stacktrace status statusor - strerror str_format_internal + strerror strings strings_internal symbolize synchronization throw_delegate time - time_zone) + time_zone + wyhash) # Abseil creates a number of header-only targets, which are needed to resolve dependencies. 
# The list can be refreshed using: # comm -13 <(ls -l $PREFIX/lib/libabsl_*.a | sed -e 's/.*libabsl_//' -e 's/.a$//' | sort -u) \ @@ -2769,8 +2771,8 @@ macro(resolve_dependency_absl) pretty_function random_bit_gen_ref random_internal_distribution_caller - random_internal_fastmath random_internal_fast_uniform_bits + random_internal_fastmath random_internal_generate_real random_internal_iostream_state_saver random_internal_mock_helpers @@ -2808,18 +2810,17 @@ macro(resolve_dependency_absl) endforeach() # Extracted the dependency information using the Abseil pkg-config files: - # grep Requires $PREFIX/pkgconfig/absl_*.pc | \ + # grep Requires $PREFIX/lib/pkgconfig/absl_*.pc | \ # sed -e 's;.*/absl_;set_property(TARGET absl::;' \ # -e 's/.pc:Requires:/ PROPERTY INTERFACE_LINK_LIBRARIES /' \ - # -e 's/ = 20210324,//g' \ - # -e 's/ = 20210324//g' \ + # -E -e 's/ = 20[0-9]{6},?//g' \ # -e 's/absl_/absl::/g' \ # -e 's/$/)/' | \ # grep -v 'INTERFACE_LINK_LIBRARIES[ ]*)' + set_property(TARGET absl::algorithm PROPERTY INTERFACE_LINK_LIBRARIES absl::config) set_property(TARGET absl::algorithm_container PROPERTY INTERFACE_LINK_LIBRARIES absl::algorithm absl::core_headers absl::meta) - set_property(TARGET absl::algorithm PROPERTY INTERFACE_LINK_LIBRARIES absl::config) set_property(TARGET absl::any PROPERTY INTERFACE_LINK_LIBRARIES absl::bad_any_cast @@ -2830,19 +2831,17 @@ macro(resolve_dependency_absl) absl::utility) set_property(TARGET absl::atomic_hook PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::core_headers) + set_property(TARGET absl::bad_any_cast PROPERTY INTERFACE_LINK_LIBRARIES + absl::bad_any_cast_impl absl::config) set_property(TARGET absl::bad_any_cast_impl PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::raw_logging_internal) - set_property(TARGET absl::bad_any_cast PROPERTY INTERFACE_LINK_LIBRARIES - absl::bad_any_cast_impl absl::config) set_property(TARGET absl::bad_optional_access PROPERTY INTERFACE_LINK_LIBRARIES absl::config 
absl::raw_logging_internal) set_property(TARGET absl::bad_variant_access PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::raw_logging_internal) - set_property(TARGET absl::base_internal PROPERTY INTERFACE_LINK_LIBRARIES - absl::config absl::type_traits) set_property(TARGET absl::base PROPERTY INTERFACE_LINK_LIBRARIES absl::atomic_hook @@ -2854,6 +2853,8 @@ macro(resolve_dependency_absl) absl::raw_logging_internal absl::spinlock_wait absl::type_traits) + set_property(TARGET absl::base_internal PROPERTY INTERFACE_LINK_LIBRARIES + absl::config absl::type_traits) set_property(TARGET absl::bind_front PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::compressed_tuple) @@ -2874,12 +2875,12 @@ macro(resolve_dependency_absl) absl::utility) set_property(TARGET absl::city PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::core_headers absl::endian) - set_property(TARGET absl::cleanup_internal - PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::core_headers - absl::utility) set_property(TARGET absl::cleanup PROPERTY INTERFACE_LINK_LIBRARIES absl::cleanup_internal absl::config absl::core_headers) + set_property(TARGET absl::cleanup_internal + PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::core_headers + absl::utility) set_property(TARGET absl::compare PROPERTY INTERFACE_LINK_LIBRARIES absl::core_headers absl::type_traits) set_property(TARGET absl::compressed_tuple PROPERTY INTERFACE_LINK_LIBRARIES @@ -2892,19 +2893,6 @@ macro(resolve_dependency_absl) absl::memory absl::type_traits absl::utility) - set_property(TARGET absl::cord_internal - PROPERTY INTERFACE_LINK_LIBRARIES - absl::base_internal - absl::compressed_tuple - absl::config - absl::core_headers - absl::endian - absl::inlined_vector - absl::layout - absl::raw_logging_internal - absl::strings - absl::throw_delegate - absl::type_traits) set_property(TARGET absl::cord PROPERTY INTERFACE_LINK_LIBRARIES absl::base @@ -2923,6 +2911,19 @@ macro(resolve_dependency_absl) 
absl::raw_logging_internal absl::strings absl::type_traits) + set_property(TARGET absl::cord_internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::base_internal + absl::compressed_tuple + absl::config + absl::core_headers + absl::endian + absl::inlined_vector + absl::layout + absl::raw_logging_internal + absl::strings + absl::throw_delegate + absl::type_traits) set_property(TARGET absl::cordz_functions PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -2971,6 +2972,8 @@ macro(resolve_dependency_absl) set_property(TARGET absl::core_headers PROPERTY INTERFACE_LINK_LIBRARIES absl::config) set_property(TARGET absl::counting_allocator PROPERTY INTERFACE_LINK_LIBRARIES absl::config) + set_property(TARGET absl::debugging PROPERTY INTERFACE_LINK_LIBRARIES + absl::stacktrace absl::leak_check) set_property(TARGET absl::debugging_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::core_headers @@ -2978,8 +2981,6 @@ macro(resolve_dependency_absl) absl::dynamic_annotations absl::errno_saver absl::raw_logging_internal) - set_property(TARGET absl::debugging PROPERTY INTERFACE_LINK_LIBRARIES - absl::stacktrace absl::leak_check) set_property(TARGET absl::demangle_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::base absl::core_headers) set_property(TARGET absl::dynamic_annotations PROPERTY INTERFACE_LINK_LIBRARIES @@ -3015,8 +3016,16 @@ macro(resolve_dependency_absl) absl::dynamic_annotations absl::throw_delegate absl::memory) - set_property(TARGET absl::flags_commandlineflag_internal - PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::fast_type_id) + set_property(TARGET absl::flags + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::flags_commandlineflag + absl::flags_config + absl::flags_internal + absl::flags_reflection + absl::base + absl::core_headers + absl::strings) set_property(TARGET absl::flags_commandlineflag PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3024,6 +3033,8 @@ macro(resolve_dependency_absl) absl::flags_commandlineflag_internal absl::optional 
absl::strings) + set_property(TARGET absl::flags_commandlineflag_internal + PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::fast_type_id) set_property(TARGET absl::flags_config PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3067,16 +3078,6 @@ macro(resolve_dependency_absl) absl::synchronization) set_property(TARGET absl::flags_path_util PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::strings) - set_property(TARGET absl::flags - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::flags_commandlineflag - absl::flags_config - absl::flags_internal - absl::flags_reflection - absl::base - absl::core_headers - absl::strings) set_property(TARGET absl::flags_private_handle_accessor PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3099,6 +3100,13 @@ macro(resolve_dependency_absl) absl::strings absl::synchronization absl::flat_hash_map) + set_property(TARGET absl::flags_usage + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::core_headers + absl::flags_usage_internal + absl::strings + absl::synchronization) set_property(TARGET absl::flags_usage_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::config @@ -3113,13 +3121,6 @@ macro(resolve_dependency_absl) absl::flat_hash_map absl::strings absl::synchronization) - set_property(TARGET absl::flags_usage - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::core_headers - absl::flags_usage_internal - absl::strings - absl::synchronization) set_property(TARGET absl::flat_hash_map PROPERTY INTERFACE_LINK_LIBRARIES absl::container_memory @@ -3146,12 +3147,6 @@ macro(resolve_dependency_absl) absl::core_headers absl::malloc_internal absl::raw_logging_internal) - set_property(TARGET absl::hash_function_defaults - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::cord - absl::hash - absl::strings) set_property(TARGET absl::hash PROPERTY INTERFACE_LINK_LIBRARIES absl::city @@ -3166,12 +3161,18 @@ macro(resolve_dependency_absl) absl::variant absl::utility absl::low_level_hash) + set_property(TARGET 
absl::hash_function_defaults + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::cord + absl::hash + absl::strings) set_property(TARGET absl::hash_policy_traits PROPERTY INTERFACE_LINK_LIBRARIES absl::meta) - set_property(TARGET absl::hashtable_debug_hooks PROPERTY INTERFACE_LINK_LIBRARIES - absl::config) set_property(TARGET absl::hashtable_debug PROPERTY INTERFACE_LINK_LIBRARIES absl::hashtable_debug_hooks) + set_property(TARGET absl::hashtable_debug_hooks PROPERTY INTERFACE_LINK_LIBRARIES + absl::config) set_property(TARGET absl::hashtablez_sampler PROPERTY INTERFACE_LINK_LIBRARIES absl::base @@ -3179,13 +3180,6 @@ macro(resolve_dependency_absl) absl::have_sse absl::sample_recorder absl::synchronization) - set_property(TARGET absl::inlined_vector_internal - PROPERTY INTERFACE_LINK_LIBRARIES - absl::compressed_tuple - absl::core_headers - absl::memory - absl::span - absl::type_traits) set_property(TARGET absl::inlined_vector PROPERTY INTERFACE_LINK_LIBRARIES absl::algorithm @@ -3193,6 +3187,13 @@ macro(resolve_dependency_absl) absl::inlined_vector_internal absl::throw_delegate absl::memory) + set_property(TARGET absl::inlined_vector_internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::compressed_tuple + absl::core_headers + absl::memory + absl::span + absl::type_traits) set_property(TARGET absl::int128 PROPERTY INTERFACE_LINK_LIBRARIES absl::config absl::core_headers absl::bits) set_property(TARGET absl::kernel_timeout_internal @@ -3291,10 +3292,10 @@ macro(resolve_dependency_absl) absl::strings absl::str_format absl::span) - set_property(TARGET absl::random_internal_fastmath PROPERTY INTERFACE_LINK_LIBRARIES - absl::bits) set_property(TARGET absl::random_internal_fast_uniform_bits PROPERTY INTERFACE_LINK_LIBRARIES absl::config) + set_property(TARGET absl::random_internal_fastmath PROPERTY INTERFACE_LINK_LIBRARIES + absl::bits) set_property(TARGET absl::random_internal_generate_real PROPERTY INTERFACE_LINK_LIBRARIES absl::bits @@ -3335,6 +3336,10 @@ 
macro(resolve_dependency_absl) absl::random_seed_gen_exception absl::raw_logging_internal absl::span) + set_property(TARGET absl::random_internal_randen + PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform + absl::random_internal_randen_hwaes + absl::random_internal_randen_slow) set_property(TARGET absl::random_internal_randen_engine PROPERTY INTERFACE_LINK_LIBRARIES absl::endian @@ -3342,16 +3347,12 @@ macro(resolve_dependency_absl) absl::random_internal_randen absl::raw_logging_internal absl::type_traits) - set_property(TARGET absl::random_internal_randen_hwaes_impl - PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform - absl::config) set_property(TARGET absl::random_internal_randen_hwaes PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform absl::random_internal_randen_hwaes_impl absl::config) - set_property(TARGET absl::random_internal_randen + set_property(TARGET absl::random_internal_randen_hwaes_impl PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform - absl::random_internal_randen_hwaes - absl::random_internal_randen_slow) + absl::config) set_property(TARGET absl::random_internal_randen_slow PROPERTY INTERFACE_LINK_LIBRARIES absl::random_internal_platform absl::config) @@ -3439,16 +3440,6 @@ macro(resolve_dependency_absl) set_property(TARGET absl::stacktrace PROPERTY INTERFACE_LINK_LIBRARIES absl::debugging_internal absl::config absl::core_headers) - set_property(TARGET absl::statusor - PROPERTY INTERFACE_LINK_LIBRARIES - absl::base - absl::status - absl::core_headers - absl::raw_logging_internal - absl::type_traits - absl::strings - absl::utility - absl::variant) set_property(TARGET absl::status PROPERTY INTERFACE_LINK_LIBRARIES absl::atomic_hook @@ -3463,8 +3454,18 @@ macro(resolve_dependency_absl) absl::cord absl::str_format absl::optional) - set_property(TARGET absl::strerror PROPERTY INTERFACE_LINK_LIBRARIES absl::config - absl::core_headers absl::errno_saver) + set_property(TARGET absl::statusor + 
PROPERTY INTERFACE_LINK_LIBRARIES + absl::base + absl::status + absl::core_headers + absl::raw_logging_internal + absl::type_traits + absl::strings + absl::utility + absl::variant) + set_property(TARGET absl::str_format PROPERTY INTERFACE_LINK_LIBRARIES + absl::str_format_internal) set_property(TARGET absl::str_format_internal PROPERTY INTERFACE_LINK_LIBRARIES absl::bits @@ -3475,15 +3476,8 @@ macro(resolve_dependency_absl) absl::type_traits absl::int128 absl::span) - set_property(TARGET absl::str_format PROPERTY INTERFACE_LINK_LIBRARIES - absl::str_format_internal) - set_property(TARGET absl::strings_internal - PROPERTY INTERFACE_LINK_LIBRARIES - absl::config - absl::core_headers - absl::endian - absl::raw_logging_internal - absl::type_traits) + set_property(TARGET absl::strerror PROPERTY INTERFACE_LINK_LIBRARIES absl::config + absl::core_headers absl::errno_saver) set_property(TARGET absl::strings PROPERTY INTERFACE_LINK_LIBRARIES absl::strings_internal @@ -3497,6 +3491,13 @@ macro(resolve_dependency_absl) absl::raw_logging_internal absl::throw_delegate absl::type_traits) + set_property(TARGET absl::strings_internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::config + absl::core_headers + absl::endian + absl::raw_logging_internal + absl::type_traits) set_property(TARGET absl::symbolize PROPERTY INTERFACE_LINK_LIBRARIES absl::debugging_internal @@ -3547,6 +3548,8 @@ macro(resolve_dependency_absl) absl::core_headers absl::type_traits absl::utility) + set_property(TARGET absl::wyhash PROPERTY INTERFACE_LINK_LIBRARIES absl::config + absl::endian absl::int128) if(APPLE) # This is due to upstream absl::cctz issue @@ -3556,18 +3559,6 @@ macro(resolve_dependency_absl) APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${CoreFoundation}) endif() - set_property(TARGET absl::type_traits PROPERTY INTERFACE_LINK_LIBRARIES absl::config) - set_property(TARGET absl::utility - PROPERTY INTERFACE_LINK_LIBRARIES absl::base_internal absl::config - absl::type_traits) - set_property(TARGET 
absl::variant - PROPERTY INTERFACE_LINK_LIBRARIES - absl::bad_variant_access - absl::base_internal - absl::config - absl::core_headers - absl::type_traits - absl::utility) externalproject_add(absl_ep ${EP_LOG_OPTIONS} @@ -4020,10 +4011,25 @@ macro(build_google_cloud_cpp_storage) "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_storage${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + set(GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL + "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_rest_internal${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_common${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + set(GOOGLE_CLOUD_CPP_PATCH_COMMAND) + if(CMAKE_VERSION VERSION_GREATER 3.9) + find_package(Patch) + if(Patch_FOUND) + # This patch is for google-cloud-cpp <= 1.42.0 + # Upstreamed: https://github.com/googleapis/google-cloud-cpp/pull/9345 + set(GOOGLE_CLOUD_CPP_PATCH_COMMAND + ${Patch_EXECUTABLE} "<SOURCE_DIR>/cmake/FindCurlWithTargets.cmake" + "${CMAKE_SOURCE_DIR}/build-support/google-cloud-cpp-curl-static-windows.patch") + endif() + endif() externalproject_add(google_cloud_cpp_ep ${EP_LOG_OPTIONS} LIST_SEPARATOR ${GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST_SEP_CHAR} @@ -4031,7 +4037,9 @@ macro(build_google_cloud_cpp_storage) URL ${google_cloud_cpp_storage_SOURCE_URL} URL_HASH "SHA256=${ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${GOOGLE_CLOUD_CPP_CMAKE_ARGS} + PATCH_COMMAND ${GOOGLE_CLOUD_CPP_PATCH_COMMAND} BUILD_BYPRODUCTS ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE} + ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL} ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON} DEPENDS google_cloud_cpp_dependencies) @@ -4046,25 +4054,47 @@ macro(build_google_cloud_cpp_storage) "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON}" INTERFACE_INCLUDE_DIRECTORIES "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + # Refer to 
https://github.com/googleapis/google-cloud-cpp/blob/main/google/cloud/google_cloud_cpp_common.cmake + # (substitute the SHA of the version we use for `main`) + # Version 1.39.0 is at a different place (they refactored after): + # https://github.com/googleapis/google-cloud-cpp/blob/29e5af8ca9b26cec62106d189b50549f4dc1c598/google/cloud/CMakeLists.txt#L146-L155 set_property(TARGET google-cloud-cpp::common PROPERTY INTERFACE_LINK_LIBRARIES - absl::any - absl::flat_hash_map + absl::base absl::memory absl::optional + absl::span absl::time + absl::variant Threads::Threads OpenSSL::Crypto) + add_library(google-cloud-cpp::rest-internal STATIC IMPORTED) + set_target_properties(google-cloud-cpp::rest-internal + PROPERTIES IMPORTED_LOCATION + "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL}" + INTERFACE_INCLUDE_DIRECTORIES + "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + set_property(TARGET google-cloud-cpp::rest-internal + PROPERTY INTERFACE_LINK_LIBRARIES + absl::span + google-cloud-cpp::common + CURL::libcurl + nlohmann_json::nlohmann_json + OpenSSL::SSL + OpenSSL::Crypto) + add_library(google-cloud-cpp::storage STATIC IMPORTED) set_target_properties(google-cloud-cpp::storage PROPERTIES IMPORTED_LOCATION "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE}" INTERFACE_INCLUDE_DIRECTORIES "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + # Update this from https://github.com/googleapis/google-cloud-cpp/blob/main/google/cloud/storage/google_cloud_cpp_storage.cmake set_property(TARGET google-cloud-cpp::storage PROPERTY INTERFACE_LINK_LIBRARIES google-cloud-cpp::common + google-cloud-cpp::rest-internal absl::memory absl::strings absl::str_format @@ -4075,11 +4105,39 @@ macro(build_google_cloud_cpp_storage) CURL::libcurl Threads::Threads OpenSSL::SSL - OpenSSL::Crypto) + OpenSSL::Crypto + ZLIB::ZLIB) add_dependencies(google-cloud-cpp::storage google_cloud_cpp_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS google-cloud-cpp::storage + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + google-cloud-cpp::storage + 
google-cloud-cpp::rest-internal google-cloud-cpp::common) + if(ABSL_VENDORED) + # Figure out what absl libraries (not header-only) are required by the + # google-cloud-cpp libraries above and add them to the bundled_dependencies + # + # pkg-config --libs absl_memory absl_strings absl_str_format absl_time absl_variant absl_base absl_memory absl_optional absl_span absl_time absl_variant + # (and then some regexing) + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + absl::bad_optional_access + absl::bad_variant_access + absl::base + absl::civil_time + absl::int128 + absl::log_severity + absl::raw_logging_internal + absl::spinlock_wait + absl::strings + absl::strings_internal + absl::str_format_internal + absl::throw_delegate + absl::time + absl::time_zone + Crc32c::crc32c) + endif() endmacro() if(ARROW_WITH_GOOGLE_CLOUD_CPP) diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 8458c7f2108bd..77b8a0b201a8c 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -218,6 +218,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem { const std::shared_ptr& metadata) override; /// Create a GcsFileSystem instance from the given options. 
+ // TODO(ARROW-16884): make this return Result for consistency static std::shared_ptr Make( const GcsOptions& options, const io::IOContext& = io::default_io_context()); diff --git a/cpp/src/arrow/filesystem/type_fwd.h b/cpp/src/arrow/filesystem/type_fwd.h index 112563577db08..c6427dc3c8643 100644 --- a/cpp/src/arrow/filesystem/type_fwd.h +++ b/cpp/src/arrow/filesystem/type_fwd.h @@ -44,6 +44,7 @@ class SubTreeFileSystem; class SlowFileSystem; class LocalFileSystem; class S3FileSystem; +class GcsFileSystem; } // namespace fs } // namespace arrow diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 3b4b4749add16..7dc95cd7e0968 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -49,8 +49,8 @@ ARROW_GFLAGS_BUILD_VERSION=v2.2.2 ARROW_GFLAGS_BUILD_SHA256_CHECKSUM=34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf ARROW_GLOG_BUILD_VERSION=v0.5.0 ARROW_GLOG_BUILD_SHA256_CHECKSUM=eede71f28371bf39aa69b45de23b329d37214016e2055269b3b5e7cfd40b59f5 -ARROW_GOOGLE_CLOUD_CPP_BUILD_VERSION=v1.39.0 -ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM=73e4e840018b24bec2beb49e036a3c2d8c471d4dc4a18b9026ccc4d8ab8e78cc +ARROW_GOOGLE_CLOUD_CPP_BUILD_VERSION=v1.42.0 +ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM=c06ae9aededbb8aa217a6d2453754daa40b815f9a4004bc4f2d2d215c79828aa ARROW_GRPC_BUILD_VERSION=v1.46.3 ARROW_GRPC_BUILD_SHA256_CHECKSUM=d6cbf22cb5007af71b61c6be316a79397469c58c82a942552a62e708bce60964 ARROW_GTEST_BUILD_VERSION=1.11.0 diff --git a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb index d72f64c92e1d0..45c04463b6d0d 100644 --- a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb @@ -47,6 +47,7 @@ def install -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_FILESYSTEM=ON + -DARROW_GCS=ON -DARROW_HDFS=OFF -DARROW_JEMALLOC=ON -DARROW_JSON=ON diff --git a/dev/tasks/r/github.macos.brew.yml 
b/dev/tasks/r/github.macos.brew.yml index 064ab550d4128..a403a65595450 100644 --- a/dev/tasks/r/github.macos.brew.yml +++ b/dev/tasks/r/github.macos.brew.yml @@ -30,6 +30,8 @@ jobs: - name: Install apache-arrow run: | + # TODO(ARROW-16907): apache/arrow@master seems to be installed already + # so this does nothing on a branch/PR brew install -v --HEAD apache-arrow # for testing brew install minio diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 73e2257f1988e..43dccba66ebff 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1305,7 +1305,7 @@ tasks: ci: github template: docker-tests/github.linux.yml params: - flags: '-e ARROW_DEPENDENCY_SOURCE=SYSTEM -e xsimd_SOURCE=BUNDLED' + flags: '-e ARROW_DEPENDENCY_SOURCE=SYSTEM -e ARROW_GCS=OFF -e xsimd_SOURCE=BUNDLED' image: ubuntu-r-only-r test-r-offline-minimal: diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 28afe75e6d6e8..55d07b77cb4a0 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -44,6 +44,7 @@ arrow_info <- function() { parquet = arrow_with_parquet(), json = arrow_with_json(), s3 = arrow_with_s3(), + gcs = arrow_with_gcs(), utf8proc = "utf8_upper" %in% compute_funcs, re2 = "replace_substring_regex" %in% compute_funcs, vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) @@ -116,6 +117,14 @@ arrow_with_s3 <- function() { }) } +#' @rdname arrow_info +#' @export +arrow_with_gcs <- function() { + tryCatch(.Call(`_gcs_available`), error = function(e) { + return(FALSE) + }) +} + #' @rdname arrow_info #' @export arrow_with_json <- function() { @@ -150,7 +159,7 @@ print.arrow_info <- function(x, ...) { mimalloc = "mimalloc" %in% x$memory_pool$available_backends )) if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) { - # Only on linux because (e.g.) we disable certain features on purpose on rtools35 and solaris + # Only on linux because (e.g.) 
we disable certain features on purpose on rtools35 cat( "To reinstall with more optional capabilities enabled, see\n", " https://arrow.apache.org/docs/r/articles/install.html\n\n" diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 4c579840e4913..bf5a8d0682181 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1288,6 +1288,10 @@ fs___S3FileSystem__region <- function(fs) { .Call(`_arrow_fs___S3FileSystem__region`, fs) } +fs___GcsFileSystem__Make <- function(anonymous, options) { + .Call(`_arrow_fs___GcsFileSystem__Make`, anonymous, options) +} + io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } @@ -2007,3 +2011,4 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } + diff --git a/r/R/filesystem.R b/r/R/filesystem.R index b035430ff6589..75997431a434f 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -269,7 +269,20 @@ FileSystem <- R6Class("FileSystem", } ), active = list( - type_name = function() fs___FileSystem__type_name(self) + type_name = function() fs___FileSystem__type_name(self), + url_scheme = function() { + fs_type_name <- self$type_name + if (identical(fs_type_name, "subtree")) { + # Recurse + return(self$base_fs$url_scheme) + } + # Some type_names are the url scheme but others aren't + type_map <- list( + local = "file", + gcs = "gs" + ) + type_map[[fs_type_name]] %||% fs_type_name + } ) ) FileSystem$from_uri <- function(uri) { @@ -435,6 +448,58 @@ s3_bucket <- function(bucket, ...) { SubTreeFileSystem$create(fs_and_path$path, fs) } +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @importFrom utils modifyList +#' @export +GcsFileSystem <- R6Class("GcsFileSystem", + inherit = FileSystem +) +GcsFileSystem$create <- function(anonymous = FALSE, ...) { + options <- list(...) 
+ + # Validate options + if (isTRUE(anonymous)) { + invalid_args <- intersect( + c("access_token", "expiration", "json_credentials"), + names(options) + ) + if (length(invalid_args)) { + stop( + "Cannot specify ", + oxford_paste(invalid_args), + " when anonymous = TRUE", + call. = FALSE + ) + } + } else { + token_args <- intersect(c("access_token", "expiration"), names(options)) + if (!is.null(options[["json_credentials"]]) && length(token_args) > 0) { + stop("Cannot provide access_token with json_credentials", call. = FALSE) + } else if (length(token_args) == 1) { + stop("token auth requires both 'access_token' and 'expiration'", call. = FALSE) + } + } + + valid_opts <- c( + "access_token", "expiration", "json_credentials", "endpoint_override", + "scheme", "default_bucket_location", "retry_limit_seconds", + "default_metadata" + ) + + invalid_opts <- setdiff(names(options), valid_opts) + if (length(invalid_opts)) { + stop( + "Invalid options for GcsFileSystem: ", + oxford_paste(invalid_opts), + call. = FALSE + ) + } + + fs___GcsFileSystem__Make(anonymous, options) +} + #' @usage NULL #' @format NULL #' @rdname FileSystem @@ -443,13 +508,11 @@ SubTreeFileSystem <- R6Class("SubTreeFileSystem", inherit = FileSystem, public = list( print = function(...) { - if (inherits(self$base_fs, "LocalFileSystem")) { - cat("SubTreeFileSystem: ", "file://", self$base_path, "\n", sep = "") - } else if (inherits(self$base_fs, "S3FileSystem")) { - cat("SubTreeFileSystem: ", "s3://", self$base_path, "\n", sep = "") - } else { - cat("SubTreeFileSystem", "\n", sep = "") - } + cat( + "SubTreeFileSystem: ", + self$url_scheme, "://", self$base_path, "\n", + sep = "" + ) invisible(self) } ), diff --git a/r/configure b/r/configure index 30fa4bff1e12b..d62c58eedae1c 100755 --- a/r/configure +++ b/r/configure @@ -229,44 +229,48 @@ if [ $? 
-eq 0 ]; then # Check for features LIB_DIR=`echo $PKG_DIRS | sed -e 's/^-L//'` ARROW_OPTS_CMAKE="$LIB_DIR/cmake/arrow/ArrowOptions.cmake" - # Check for Parquet - grep -i 'set(ARROW_PARQUET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + + arrow_built_with() { + # Function to check cmake options for features + grep -i 'set('"$1"' "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 + } + + if arrow_built_with ARROW_PARQUET; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_PARQUET" PKG_LIBS="-lparquet $PKG_LIBS" # NOTE: parquet is assumed to have the same -L flag as arrow # so there is no need to add its location to PKG_DIRS fi - # Check for Arrow Dataset subcomponent - grep -i 'set(ARROW_DATASET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + if arrow_built_with ARROW_DATASET; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_DATASET" PKG_LIBS="-larrow_dataset $PKG_LIBS" # NOTE: arrow-dataset is assumed to have the same -L flag as arrow # so there is no need to add its location to PKG_DIRS fi - # Check for Arrow Substrait subcomponent - grep -i 'set(ARROW_SUBSTRAIT "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + if arrow_built_with ARROW_SUBSTRAIT; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_SUBSTRAIT" PKG_LIBS="-larrow_substrait $PKG_LIBS" # NOTE: arrow-substrait is assumed to have the same -L flag as arrow # so there is no need to add its location to PKG_DIRS fi - # Check for S3 - grep -i 'set(ARROW_S3 "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? -eq 0 ]; then + if arrow_built_with ARROW_JSON; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" + fi + if arrow_built_with ARROW_S3; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3" if [ "$BUNDLED_LIBS" != "" ]; then # We're depending on openssl/curl from the system, so they're not in the bundled deps BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl" fi fi - # Check for JSON - grep -i 'set(ARROW_JSON "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 - if [ $? 
-eq 0 ]; then - PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" + if arrow_built_with ARROW_GCS; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS" + if [ "$BUNDLED_LIBS" != "" ]; then + # GCS also requires openssl and curl + BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl" + fi fi + # prepend PKG_DIRS and append BUNDLED_LIBS to PKG_LIBS PKG_LIBS="$PKG_DIRS $PKG_LIBS $BUNDLED_LIBS" echo "PKG_CFLAGS=$PKG_CFLAGS" diff --git a/r/configure.win b/r/configure.win old mode 100644 new mode 100755 index 9e22136c79f40..dfd2c87ab4f15 --- a/r/configure.win +++ b/r/configure.win @@ -31,6 +31,9 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" +# pkg-config --libs libcurl +GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ + -lz -lws2_32" function configure_release() { VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) @@ -64,11 +67,11 @@ function configure_release() { -lutf8proc -lthrift -lsnappy -lz -lzstd -llz4 ${BROTLI_LIBS} -lole32 \ ${MIMALLOC_LIBS} ${OPENSSL_LIBS}" - # S3 and re2 support only for Rtools40 (i.e. R >= 4.0) + # S3, GCS, and re2 support only for Rtools40 (i.e. R >= 4.0) "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1 if [ $? 
-eq 0 ]; then - PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3" - PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS}" + PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3 -DARROW_R_WITH_GCS" + PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS} ${GCS_LIBS}" else # It seems that order matters PKG_LIBS="${PKG_LIBS} -lws2_32" @@ -104,6 +107,10 @@ function configure_dev() { PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3" fi + if [ $(cmake_option ARROW_GCS) -eq 1 ]; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_GCS" + fi + if [ $(cmake_option ARROW_JSON) -eq 1 ]; then PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" fi diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index fd1781c49fac8..92a4267153bef 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -30,7 +30,7 @@ # Ensure that all machines are sorting the same way invisible(Sys.setlocale("LC_COLLATE", "C")) -features <- c("dataset", "substrait", "parquet", "s3", "json") +features <- c("dataset", "substrait", "parquet", "s3", "gcs", "json") suppressPackageStartupMessages({ library(decor) @@ -44,7 +44,9 @@ get_exported_functions <- function(decorations, export_tag) { out <- decorations %>% filter(decoration %in% paste0(export_tag, "::export")) %>% mutate(functions = map(context, decor:::parse_cpp_function)) %>% - { vec_cbind(., vec_rbind(!!!pull(., functions))) } %>% + { + vec_cbind(., vec_rbind(!!!pull(., functions))) + } %>% select(-functions) %>% mutate(decoration = sub("::export", "", decoration)) message(glue("*** > {n} functions decorated with [[{tags}::export]]", n = nrow(out), tags = paste0(export_tag, collapse = "|"))) @@ -58,7 +60,7 @@ glue_collapse_data <- function(data, ..., sep = ", ", last = "") { } wrap_call <- function(name, return_type, args) { - call <- glue::glue('{name}({list_params})', list_params = glue_collapse_data(args, "{name}")) + call <- glue::glue("{name}({list_params})", list_params = glue_collapse_data(args, "{name}")) if (return_type == "void") { glue::glue("\t{call};\n\treturn R_NilValue;", .trim = FALSE) } 
else { @@ -68,7 +70,7 @@ wrap_call <- function(name, return_type, args) { feature_available <- function(feat) { glue::glue( -'extern "C" SEXP _{feat}_available() {{ + 'extern "C" SEXP _{feat}_available() {{ return Rf_ScalarLogical( #if defined(ARROW_R_WITH_{toupper(feat)}) TRUE @@ -77,11 +79,12 @@ return Rf_ScalarLogical( #endif ); }} -') +' + ) } write_if_modified <- function(code, file) { - old <- try(readLines(file), silent=TRUE) + old <- try(readLines(file), silent = TRUE) new <- unclass(unlist(strsplit(code, "\n"))) # We don't care about changes in empty lines if (!identical(old[nzchar(old)], new[nzchar(new)])) { @@ -124,7 +127,7 @@ cpp_functions_definitions <- arrow_exports %>% select(name, return_type, args, file, line, decoration) %>% pmap_chr(function(name, return_type, args, file, line, decoration) { sexp_params <- glue_collapse_data(args, "SEXP {name}_sexp") - sexp_signature <- glue('_arrow_{name}({sexp_params})') + sexp_signature <- glue("_arrow_{name}({sexp_params})") cpp11_wrapped <- glue(' {return_type} {name}({real_params}); extern "C" SEXP {sexp_signature}{{ @@ -135,12 +138,13 @@ cpp_functions_definitions <- arrow_exports %>% sep = "\n", real_params = glue_collapse_data(args, "{type} {name}"), input_params = glue_collapse_data(args, "\tarrow::r::Input<{type}>::type {name}({name}_sexp);", sep = "\n"), - return_line = if (nrow(args)) "\n" else "") + return_line = if (nrow(args)) "\n" else "" + ) - glue::glue(' + glue::glue(" // {basename(file)} {ifdef_wrap(cpp11_wrapped, name, sexp_signature, decoration)} - ', + ", sep = "\n", ) }) %>% @@ -161,25 +165,25 @@ cpp_file_header <- '// Generated by using data-raw/codegen.R -> do not edit by h ' arrow_exports_cpp <- paste0( -glue::glue(' + glue::glue(" {cpp_file_header} {cpp_functions_definitions} -\n'), -glue::glue_collapse(glue::glue(' +\n"), + glue::glue_collapse(glue::glue(" {feature_available({features})} -'), sep = '\n'), -' +"), sep = "\n"), + " static const R_CallMethodDef CallEntries[] = { -', 
-glue::glue_collapse(glue::glue( - '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},', -), sep = '\n'), -glue::glue('\n +", + glue::glue_collapse(glue::glue( + '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},', + ), sep = "\n"), + glue::glue("\n {cpp_functions_registration} \t\t{{NULL, NULL, 0}} }}; -\n'), -'extern "C" void R_init_arrow(DllInfo* dll){ +\n"), + 'extern "C" void R_init_arrow(DllInfo* dll){ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); @@ -188,7 +192,8 @@ glue::glue('\n #endif } -\n') +\n' +) write_if_modified(arrow_exports_cpp, "src/arrowExports.cpp") @@ -200,27 +205,27 @@ r_functions <- arrow_exports %>% } else { "" } - call <- glue::glue('.Call(`_arrow_{name}`{params})') + call <- glue::glue(".Call(`_arrow_{name}`{params})") if (return_type == "void") { - call <- glue::glue('invisible({call})') + call <- glue::glue("invisible({call})") } - glue::glue(' + glue::glue(" {name} <- function({list_params}) {{ {call} }} - ', + ", list_params = glue_collapse_data(args, "{name}"), sep = "\n", ) }) %>% glue_collapse(sep = "\n") -arrow_exports_r <- glue::glue(' +arrow_exports_r <- glue::glue(" # Generated by using data-raw/codegen.R -> do not edit by hand {r_functions} -') +") write_if_modified(arrow_exports_r, "R/arrowExports.R") diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 27d42d4702528..3e6b0546b1c4c 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -59,6 +59,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ -DARROW_FILESYSTEM=ON \ + -DARROW_GCS=${ARROW_GCS:-$ARROW_DEFAULT_PARAM} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ -DARROW_JSON=${ARROW_JSON:-ON} \ diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 887327d48f982..947270199ab12 100644 
--- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3222,6 +3222,22 @@ extern "C" SEXP _arrow_fs___S3FileSystem__region(SEXP fs_sexp){ } #endif +// filesystem.cpp +#if defined(ARROW_R_WITH_GCS) +std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, cpp11::list options); +extern "C" SEXP _arrow_fs___GcsFileSystem__Make(SEXP anonymous_sexp, SEXP options_sexp){ +BEGIN_CPP11 + arrow::r::Input::type anonymous(anonymous_sexp); + arrow::r::Input::type options(options_sexp); + return cpp11::as_sexp(fs___GcsFileSystem__Make(anonymous, options)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs___GcsFileSystem__Make(SEXP anonymous_sexp, SEXP options_sexp){ + Rf_error("Cannot call fs___GcsFileSystem__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5099,6 +5115,15 @@ return Rf_ScalarLogical( #endif ); } +extern "C" SEXP _gcs_available() { +return Rf_ScalarLogical( +#if defined(ARROW_R_WITH_GCS) + TRUE +#else + FALSE +#endif +); +} extern "C" SEXP _json_available() { return Rf_ScalarLogical( #if defined(ARROW_R_WITH_JSON) @@ -5113,6 +5138,7 @@ static const R_CallMethodDef CallEntries[] = { { "_substrait_available", (DL_FUNC)& _substrait_available, 0 }, { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, + { "_gcs_available", (DL_FUNC)& _gcs_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, { "_arrow_test_SET_STRING_ELT", (DL_FUNC) &_arrow_test_SET_STRING_ELT, 1}, { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1}, @@ -5436,6 +5462,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 15}, { 
"_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, + { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index bcafef34e41ea..cdf536b3bc85b 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -33,6 +33,13 @@ const char* r6_class_name::get( return "LocalFileSystem"; } else if (type_name == "s3") { return "S3FileSystem"; + } else if (type_name == "gcs") { + return "GcsFileSystem"; + // Uncomment these once R6 classes for these filesystems are added + // } else if (type_name == "abfs") { + // return "AzureBlobFileSystem"; + // } else if (type_name == "hdfs") { + // return "HadoopFileSystem"; } else if (type_name == "subtree") { return "SubTreeFileSystem"; } else { @@ -335,3 +342,77 @@ std::string fs___S3FileSystem__region(const std::shared_ptr& f } #endif + +#if defined(ARROW_R_WITH_GCS) + +#include + +std::shared_ptr strings_to_kvm(cpp11::strings metadata); + +// [[gcs::export]] +std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, + cpp11::list options) { + fs::GcsOptions gcs_opts; + + // Handle auth (anonymous, credentials, default) + // (validation/internal coherence handled in R) + if (anonymous) { + gcs_opts = fs::GcsOptions::Anonymous(); + } else if (!Rf_isNull(options["access_token"])) { + // Convert POSIXct timestamp seconds to nanoseconds + std::chrono::nanoseconds ns_count( + static_cast(cpp11::as_cpp(options["expiration"])) * 1000000000); + auto expiration_timepoint = + fs::TimePoint(std::chrono::duration_cast(ns_count)); + gcs_opts = fs::GcsOptions::FromAccessToken( + cpp11::as_cpp(options["access_token"]), expiration_timepoint); + // TODO(ARROW-16885): implement 
FromImpersonatedServiceAccount + // } else if (base_credentials != "") { + // // static GcsOptions FromImpersonatedServiceAccount( + // // const GcsCredentials& base_credentials, const std::string& + // target_service_account); + // // TODO: construct GcsCredentials + // gcs_opts = fs::GcsOptions::FromImpersonatedServiceAccount(base_credentials, + // target_service_account); + } else if (!Rf_isNull(options["json_credentials"])) { + gcs_opts = fs::GcsOptions::FromServiceAccountCredentials( + cpp11::as_cpp(options["json_credentials"])); + } else { + gcs_opts = fs::GcsOptions::Defaults(); + } + + // Handle other attributes + if (!Rf_isNull(options["endpoint_override"])) { + gcs_opts.endpoint_override = cpp11::as_cpp(options["endpoint_override"]); + } + + if (!Rf_isNull(options["scheme"])) { + gcs_opts.scheme = cpp11::as_cpp(options["scheme"]); + } + + // /// \brief Location to use for creating buckets. + if (!Rf_isNull(options["default_bucket_location"])) { + gcs_opts.default_bucket_location = + cpp11::as_cpp(options["default_bucket_location"]); + } + // /// \brief If set used to control total time allowed for retrying underlying + // /// errors. + // /// + // /// The default policy is to retry for up to 15 minutes. + if (!Rf_isNull(options["retry_limit_seconds"])) { + gcs_opts.retry_limit_seconds = cpp11::as_cpp(options["retry_limit_seconds"]); + } + + // /// \brief Default metadata for OpenOutputStream. + // /// + // /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 
+ if (!Rf_isNull(options["default_metadata"])) { + gcs_opts.default_metadata = strings_to_kvm(options["default_metadata"]); + } + + auto io_context = arrow::io::IOContext(gc_memory_pool()); + // TODO(ARROW-16884): update when this returns Result + return fs::GcsFileSystem::Make(gcs_opts, io_context); +} + +#endif diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R new file mode 100644 index 0000000000000..a823442f30b57 --- /dev/null +++ b/r/tests/testthat/test-gcs.R @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("gcs") + +test_that("FileSystem$from_uri with gs://", { + fs_and_path <- FileSystem$from_uri("gs://my/test/bucket/") + expect_r6_class(fs_and_path$fs, "GcsFileSystem") + expect_identical(fs_and_path$path, "my/test/bucket") +}) + +test_that("GcsFileSystem$create() options", { + # TODO: expose options as a list so we can confirm they are set? 
+ expect_r6_class(GcsFileSystem$create(), "GcsFileSystem") + expect_r6_class(GcsFileSystem$create(anonymous = TRUE), "GcsFileSystem") + expect_r6_class( + GcsFileSystem$create( + anonymous = TRUE, + scheme = "http", + endpoint_override = "localhost:8888", + default_bucket_location = "here", + retry_limit_seconds = 30, + default_metadata = c(a = "list", of = "stuff") + ), + "GcsFileSystem" + ) +}) + +test_that("GcsFileSystem$create() input validation", { + expect_error( + GcsFileSystem$create(anonymous = TRUE, access_token = "something"), + 'Cannot specify "access_token" when anonymous = TRUE' + ) + expect_error( + GcsFileSystem$create(expiration = Sys.time()), + "token auth requires both 'access_token' and 'expiration'" + ) + expect_error( + GcsFileSystem$create(json_credentials = "{}", expiration = Sys.time()), + "Cannot provide access_token with json_credentials" + ) + expect_error( + GcsFileSystem$create(role_arn = "something"), + 'Invalid options for GcsFileSystem: "role_arn"' + ) +}) diff --git a/r/tools/autobrew b/r/tools/autobrew index 25b6fa97d894a..8ba06a64c27d6 100644 --- a/r/tools/autobrew +++ b/r/tools/autobrew @@ -60,6 +60,7 @@ for FILE in $BREWDIR/Cellar/*/*/lib/*.a; do PKG_LIBS=`echo $PKG_LIBS | sed "s/-l$LIBNAME/-lbrew$LIBNAME/g"` done +# TODO: add -DARROW_R_WITH_GCS PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3" unset HOMEBREW_NO_ANALYTICS diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 768c6291939aa..0dadaa0ef73b2 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -69,7 +69,8 @@ download_binary <- function(os = identify_os()) { if (try_download(binary_url, libfile)) { cat(sprintf("*** Successfully retrieved C++ binaries for %s\n", os)) if (!identical(os, "centos-7")) { - # centos-7 uses gcc 4.8 so the binary doesn't have ARROW_S3=ON but the others do + # centos-7 uses gcc 4.8 so the binary doesn't have ARROW_S3=ON + # or ARROW_GCS=ON but the 
others do # TODO: actually check for system requirements? cat("**** Binary package requires libcurl and openssl\n") cat("**** If installation fails, retry after installing those system requirements\n") @@ -312,11 +313,11 @@ build_libarrow <- function(src_dir, dst_dir) { # CXXFLAGS = R_CMD_config("CXX11FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS") ) - env_var_list <- with_s3_support(env_var_list) + env_var_list <- with_cloud_support(env_var_list) env_var_list <- with_mimalloc(env_var_list) # turn_off_all_optional_features() needs to happen after with_mimalloc() and - # with_s3_support(), since those might turn features ON. + # with_cloud_support(), since those might turn features ON. thirdparty_deps_unavailable <- !download_ok && !dir.exists(thirdparty_dependency_dir) && !env_is("ARROW_DEPENDENCY_SOURCE", "system") @@ -538,24 +539,45 @@ with_mimalloc <- function(env_var_list) { replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF")) } -with_s3_support <- function(env_var_list) { +with_cloud_support <- function(env_var_list) { arrow_s3 <- is_feature_requested("ARROW_S3") - if (arrow_s3) { - # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 + arrow_gcs <- is_feature_requested("ARROW_GCS") + if (arrow_s3 || arrow_gcs) { + # User wants S3 or GCS support. 
+ # If they're using gcc, let's make sure the version is >= 4.9 + # (aws-sdk-cpp requires that; google-cloud-cpp only tests with >= 6.3) # and make sure that we have curl and openssl system libs + feats <- c( + if (arrow_s3) "S3", + if (arrow_gcs) "GCS" + ) + start_msg <- paste(feats, collapse = "/") + off_flags <- paste("ARROW_", feats, "=OFF", sep = "", collapse = " and ") + print_warning <- function(msg) { + # Utility to assemble warning message in the console + cat("**** ", start_msg, " support ", msg, "; building with ", off_flags, "\n") + } + + # Check the features if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { - cat("**** S3 support not available for gcc < 4.9; building with ARROW_S3=OFF\n") + print_warning("not available for gcc < 4.9") arrow_s3 <- FALSE + arrow_gcs <- FALSE } else if (!cmake_find_package("CURL", NULL, env_var_list)) { # curl on macos should be installed, so no need to alter this for macos - cat("**** S3 support requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb); building with ARROW_S3=OFF\n") + print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)") arrow_s3 <- FALSE + arrow_gcs <- FALSE } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) { - cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") + print_warning("requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew)") arrow_s3 <- FALSE + arrow_gcs <- FALSE } } - replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) + + # Update the build flags + env_var_list <- replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) + replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF")) } cmake_gcc_version <- function(env_var_list) { diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index 159a43808eca0..af312e30b89d4 100644 --- a/r/vignettes/developers/setup.Rmd +++
b/r/vignettes/developers/setup.Rmd @@ -237,6 +237,7 @@ cmake \ To enable optional features including: S3 support, an alternative memory allocator, and additional compression libraries, add some or all of these flags to your call to `cmake` (the trailing `\` makes them easier to paste into a bash shell on a new line): ```bash + -DARROW_GCS=ON \ -DARROW_MIMALLOC=ON \ -DARROW_S3=ON \ -DARROW_WITH_BROTLI=ON \ @@ -307,6 +308,7 @@ cmake \ -DARROW_DATASET=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON \ -DARROW_FILESYSTEM=ON \ + -DARROW_GCS=ON \ -DARROW_INSTALL_NAME_RPATH=OFF \ -DARROW_JEMALLOC=ON \ -DARROW_JSON=ON \ diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 2c402e162d7c3..257dd7b11d8e7 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -13,14 +13,14 @@ In most cases, `install.packages("arrow")` should just work. There are things yo ---- -The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or MacOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. +The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or MacOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. 
This vignette outlines the recommend approaches to installing arrow on Linux, starting from the simplest and least customisable to the most complex but with more flexbility to customise your installation. The intended audience for this document is arrow R package _users_ on Linux, and not Arrow _developers_. If you're contributing to the Arrow project, see `vignette("developing", package = "arrow")` for resources to help you on set up your development environment. You can also find -a more detailed discussion of the code run during the installation process in the +a more detailed discussion of the code run during the installation process in the [developers' installation docs](https://arrow.apache.org/docs/r/articles/developers/install_details.html) > Having trouble installing arrow? See the "Troubleshooting" section below. @@ -59,11 +59,11 @@ install.packages("arrow", repos = "https://packagemanager.rstudio.com/all/__linu Note that the User Agent header must be specified as in the example above. Please check [the RStudio Package Manager: Admin Guide ](https://docs.rstudio.com/rspm/admin/serving-binaries/#using-linux-binary-packages) for more details. -For other Linux distributions, to get the relevant URL, you can visit +For other Linux distributions, to get the relevant URL, you can visit [the RSPM site](https://packagemanager.rstudio.com/client/#/repos/1/overview), click on 'binary', and select your preferred distribution. -Similarly, if you use `conda` to manage your R environment, you can get the +Similarly, if you use `conda` to manage your R environment, you can get the latest official release of the R package including libarrow via: ```shell @@ -87,34 +87,36 @@ This installs the source version of the R package, but during the installation p # Installing libarrow dependencies -When you install libarrow, its dependencies will be automatically downloaded. +When you install libarrow, its dependencies will be automatically downloaded. 
The environment variable `ARROW_DEPENDENCY_SOURCE` controls whether the libarrow -installation also downloads or installs all dependencies (when set to `BUNDLED`), -uses only system-installed dependencies (when set to `SYSTEM`) or checks -system-installed dependencies first and only installs dependencies which aren't +installation also downloads or installs all dependencies (when set to `BUNDLED`), +uses only system-installed dependencies (when set to `SYSTEM`) or checks +system-installed dependencies first and only installs dependencies which aren't already present (when set to `AUTO`). -These dependencies vary by platform; however, if you wish to install these -yourself prior to libarrow installation, we recommend that you take a look at +These dependencies vary by platform; however, if you wish to install these +yourself prior to libarrow installation, we recommend that you take a look at the [docker file for whichever of our CI builds](https://github.com/apache/arrow/tree/master/ci/docker) -(the ones ending in "cpp" are for building Arrow's C++ libaries aka libarrow) -corresponds most closely to your setup. This will contain the most up-to-date +(the ones ending in "cpp" are for building Arrow's C++ libraries aka libarrow) +corresponds most closely to your setup. This will contain the most up-to-date information about dependencies and minimum versions. -## Dependencies for S3 support +## Dependencies for S3 and GCS support The arrow package allows you to work with data in AWS S3 or in other cloud -storage system that emulate S3. However, support for working with S3 is not +storage systems that emulate S3, as well as Google Cloud Storage. +However, support for working with S3 and GCS is not enabled in the default build, and it has additional system requirements. To enable it, set the environment variable `LIBARROW_MINIMAL=false` or `NOT_CRAN=true` to choose the full-featured build, or more selectively set -`ARROW_S3=ON`.
You also need the following system dependencies: +`ARROW_S3=ON` and/or `ARROW_GCS=ON`. +You also need the following system dependencies: * `gcc` >= 4.9 or `clang` >= 3.3; note that the default compiler on CentOS 7 is gcc 4.8.5, which is not sufficient * CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) * OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb) -The prebuilt libarrow binaries come with S3 support enabled, so you will need to meet these system requirements in order to use them--the package will not install without them (and will error with a message that explains this).If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 support in the build if the prerequisites are not met--installation will succeed but without S3 functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 support. +The prebuilt libarrow binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them--the package will not install without them (and will error with a message that explains this). If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support.
# Installing a release version (the less easy way) @@ -124,60 +126,60 @@ The prebuilt libarrow binaries come with S3 support enabled, so you will need to knitr::include_graphics("./r_source_libarrow_source.png") ``` -Generally compiling and installing R packages with C++ dependencies, requires -either installing system packages, which you may not have privileges to do, or -building the C++ dependencies separately, which introduces all sorts of +Generally compiling and installing R packages with C++ dependencies, requires +either installing system packages, which you may not have privileges to do, or +building the C++ dependencies separately, which introduces all sorts of additional ways for things to go wrong, which is why we recommend method 1 above. -However, if you wish to fine-tune or customise your Linux installation, the +However, if you wish to fine-tune or customise your Linux installation, the instructions in this section explain how to do that. ### Basic configuration for building from source with fully featured installation -If you wish to install libarrow from source instead of looking for pre-compiled +If you wish to install libarrow from source instead of looking for pre-compiled binaries, you can set the `LIBARROW_BINARY` variable. ```{r, eval = FALSE} Sys.setenv("LIBARROW_BINARY" = FALSE) ``` -By default, this is set to `TRUE`, and so libarrow will only be built from -source if this environment variable is set to `FALSE` or no compatible binary +By default, this is set to `TRUE`, and so libarrow will only be built from +source if this environment variable is set to `FALSE` or no compatible binary for your OS can be found. -When compiling libarrow from source, you have the power to really fine-tune -which features to install. 
You can set the environment variable -`LIBARROW_MINIMAL` to `FALSE` to enable a more full-featured build including S3 support +When compiling libarrow from source, you have the power to really fine-tune +which features to install. You can set the environment variable +`LIBARROW_MINIMAL` to `FALSE` to enable a more full-featured build including S3 support and alternative memory allocators. ```{r, eval = FALSE} Sys.setenv("LIBARROW_MINIMAL" = FALSE) ``` -By default this variable is unset; if set to `TRUE` a trimmed-down version of +By default this variable is unset; if set to `TRUE` a trimmed-down version of arrow is installed with many features disabled. -Note that in this guide, you will have seen us mention the environment variable -`NOT_CRAN` - this is a convenience variable, which when set to `TRUE`, +Note that in this guide, you will have seen us mention the environment variable +`NOT_CRAN` - this is a convenience variable, which when set to `TRUE`, automatically sets `LIBARROW_MINIMAL` to `FALSE` and `LIBARROW_BINARY` to `TRUE`. -Building libarrow from source requires more time and resources than installing -a binary. We recommend that you set the environment variable `ARROW_R_DEV` to -`TRUE` for more verbose output during the installation process if anything goes +Building libarrow from source requires more time and resources than installing +a binary. We recommend that you set the environment variable `ARROW_R_DEV` to +`TRUE` for more verbose output during the installation process if anything goes wrong. ```{r, eval = FALSE} Sys.setenv("ARROW_R_DEV" = TRUE) ``` -Once you have set these variables, call `install.packages()` to install arrow +Once you have set these variables, call `install.packages()` to install arrow using this configuration. 
```{r, eval = FALSE} install.packages("arrow") ``` -The section below discusses environment variables you can set before calling +The section below discusses environment variables you can set before calling `install.packages("arrow")` to build from source and customise your configuration. ### Advanced configuration for building from source @@ -187,13 +189,14 @@ In this section, we describe how to fine-tune your installation at a more granul #### libarrow configuration Some features are optional when you build Arrow from source - you can configure -whether these components are built via the use of environment variables. The -names of the environment variables which control these features and their +whether these components are built via the use of environment variables. The +names of the environment variables which control these features and their default values are shown below. | Name | Description | Default Value | | ---| --- | :-: | | `ARROW_S3` | S3 support (if dependencies are met)* | `OFF` | +| `ARROW_GCS` | GCS support (if dependencies are met)* | `OFF` | | `ARROW_JEMALLOC` | The `jemalloc` memory allocator | `ON` | | `ARROW_MIMALLOC` | The `mimalloc` memory allocator | `ON` | | `ARROW_PARQUET` | | `ON` | @@ -210,7 +213,7 @@ default values are shown below. #### R package configuration -There are a number of other variables that affect the `configure` script and +There are a number of other variables that affect the `configure` script and the bundled build script. All boolean variables are case-insensitive. | Name | Description | Default | @@ -227,18 +230,18 @@ the bundled build script. All boolean variables are case-insensitive. See below for more in-depth explanations of these environment variables. * `LIBARROW_BINARY` : If set to `true`, the script will try to download a binary - C++ library built for your operating system. You may also set it to some other string, a related "distro-version" that has binaries built that work for your OS. 
See the [distro map](https://raw.githubusercontent.com/ursa-labs/arrow-r-nightly/master/linux/distro-map.csv) for compatible binaries and OSs. If no binary is found, installation will fall back to building C++ dependencies from source. + C++ library built for your operating system. You may also set it to some other string, a related "distro-version" that has binaries built that work for your OS. See the [distro map](https://raw.githubusercontent.com/ursa-labs/arrow-r-nightly/master/linux/distro-map.csv) for compatible binaries and OSs. If no binary is found, installation will fall back to building C++ dependencies from source. * `LIBARROW_BUILD` : If set to `false`, the build script will not attempt to build the C++ from source. This means you will only get a working arrow R package if a prebuilt binary is found. Use this if you want to avoid compiling the C++ library, which may be slow - and resource-intensive, and ensure that you only use a prebuilt binary. + and resource-intensive, and ensure that you only use a prebuilt binary. * `LIBARROW_MINIMAL` : If set to `false`, the build script will enable some optional features, including S3 support and additional alternative memory allocators. This will increase the - source build time but results in a more fully functional library. If set to - `true` turns off Parquet, Datasets, compression libraries, and other optional - features. This is not commonly used but may be helpful if needing to compile + source build time but results in a more fully functional library. If set to + `true`, it turns off Parquet, Datasets, compression libraries, and other optional + features. This is not commonly used but may be helpful if needing to compile on a platform that does not support these features, e.g. Solaris.
* `NOT_CRAN` : If this variable is set to `true`, as the `devtools` package does, the build script will set `LIBARROW_BINARY=true` and `LIBARROW_MINIMAL=false` @@ -250,7 +253,7 @@ See below for more in-depth explanations of these environment variables. in the build script. `arrow::install_arrow(verbose = TRUE)` sets this. This variable also is needed if you're modifying C++ code in the package: see the developer guide vignette. -* `ARROW_USE_PKG_CONFIG`: If set to `false`, the configure script won't look for +* `ARROW_USE_PKG_CONFIG`: If set to `false`, the configure script won't look for Arrow libraries on your system and instead will look to download/build them. Use this if you have a version mismatch between installed system libraries and the version of the R package you're installing. @@ -266,7 +269,7 @@ Arrow libraries on your system and instead will look to download/build them. # Install the nightly build -Daily development builds, which are not official releases, can be installed +Daily development builds, which are not official releases, can be installed from the Ursa Labs repository: ```r @@ -292,12 +295,12 @@ R CMD INSTALL . If you don't already have libarrow on your system, when installing the R package from source, it will also download and build -libarrow for you. See the section above on build environment +libarrow for you. See the section above on build environment variables for options for configuring the build source and enabled features. 
# Installation using install_arrow() -The previous instructions are useful for a fresh arrow installation, but arrow +The previous instructions are useful for a fresh arrow installation, but arrow provides the function `install_arrow()`, which you can use if you: * already have arrow installed and want to upgrade to a different version @@ -307,7 +310,7 @@ provides the function `install_arrow()`, which you can use if you: `install_arrow()` provides some convenience wrappers around the various environment variables described below. -Although this function is part of the arrow package, it is also available as +Although this function is part of the arrow package, it is also available as a standalone script, so you can access it for convenience without first installing the package: ```r @@ -404,8 +407,8 @@ in the output when the package fails to install, that means that installation failed to retrieve or build the libarrow version compatible with the current version of the R package. -Please check the "Known installation issues" below to see if any apply, and if -none apply, set the environment variable `ARROW_R_DEV=TRUE` for more verbose +Please check the "Known installation issues" below to see if any apply, and if +none apply, set the environment variable `ARROW_R_DEV=TRUE` for more verbose output and try installing again. Then, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) and include the full installation output.