
Commit

Merge remote-tracking branch 'upstream/main' into apachegh-36765-parquet-pre-buffer
jorisvandenbossche committed Oct 5, 2023
2 parents 16e56fa + 02de3c1 commit ed1b46f
Showing 279 changed files with 7,191 additions and 2,762 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/cpp.yml
@@ -108,7 +108,8 @@ jobs:
restore-keys: ${{ matrix.image }}-
- name: Setup Python
run: |
sudo apt install -y --no-install-recommends python3 python3-pip
sudo apt update
sudo apt install -y --no-install-recommends python3 python3-dev python3-pip
- name: Setup Archery
run: python3 -m pip install -e dev/archery[docker]
- name: Execute Docker Build
5 changes: 3 additions & 2 deletions .github/workflows/go.yml
@@ -79,7 +79,8 @@ jobs:
submodules: recursive
- name: Setup Python
run: |
sudo apt install -y --no-install-recommends python3 python3-pip
sudo apt update
sudo apt install -y --no-install-recommends python3 python3-dev python3-pip
- name: Setup Archery
run: python3 -m pip install -e dev/archery[docker]
- name: Execute Docker Build
@@ -232,7 +233,7 @@ jobs:
name: AMD64 Windows 2019 Go ${{ matrix.go }}
runs-on: windows-2019
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 15
timeout-minutes: 25
strategy:
fail-fast: false
matrix:
6 changes: 0 additions & 6 deletions .github/workflows/java_nightly.yml
@@ -73,12 +73,6 @@ jobs:
fi
echo $PREFIX
archery crossbow download-artifacts -f java-jars -t binaries $PREFIX
- name: Cache Repo
uses: actions/cache@v3
with:
path: repo
key: java-nightly-${{ github.run_id }}
restore-keys: java-nightly
- name: Sync from Remote
uses: ./arrow/.github/actions/sync-nightlies
with:
2 changes: 1 addition & 1 deletion ci/conda_env_sphinx.txt
@@ -20,7 +20,7 @@ breathe
doxygen
ipython
numpydoc
pydata-sphinx-theme==0.8
pydata-sphinx-theme
sphinx-autobuild
sphinx-design
sphinx-copybutton
4 changes: 3 additions & 1 deletion ci/docker/conda-integration.dockerfile
@@ -29,9 +29,11 @@ ARG go=1.19.13
# Install Archery and integration dependencies
COPY ci/conda_env_archery.txt /arrow/ci/

# Pin Python until pythonnet is made compatible with 3.12
# (https://github.com/pythonnet/pythonnet/pull/2249)
RUN mamba install -q -y \
--file arrow/ci/conda_env_archery.txt \
"python>=3.7" \
"python < 3.12" \
numpy \
compilers \
maven=${maven} \
19 changes: 19 additions & 0 deletions ci/scripts/go_build.sh
@@ -41,3 +41,22 @@ pushd ${source_dir}/parquet
go install -v ./...

popd

if [[ -n "${ARROW_GO_INTEGRATION}" ]]; then
pushd ${source_dir}/arrow/internal/cdata_integration

case "$(uname)" in
Linux)
go_lib="arrow_go_integration.so"
;;
Darwin)
go_lib="arrow_go_integration.so"
;;
MINGW*)
go_lib="arrow_go_integration.dll"
;;
esac
go build -tags cdata_integration,assert -buildmode=c-shared -o ${go_lib} .

popd
fi
5 changes: 5 additions & 0 deletions ci/scripts/integration_arrow.sh
@@ -23,6 +23,11 @@ arrow_dir=${1}
gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration

pip install -e $arrow_dir/dev/archery[integration]
# For C# C Data Interface testing
pip install pythonnet

# Get more detailed context on crashes
export PYTHONFAULTHANDLER=1

# Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1
time archery integration \
12 changes: 7 additions & 5 deletions ci/scripts/js_build.sh
@@ -32,12 +32,14 @@ yarn lint:ci
yarn build

if [ "${BUILD_DOCS_JS}" == "ON" ]; then
if [ "$(git config --get remote.origin.url)" == "https://github.com/apache/arrow.git" ]; then
yarn doc
elif [ "$(git config --get remote.upstream.url)" == "https://github.com/apache/arrow.git" ]; then
yarn doc --gitRemote upstream
elif [ "$(git config --get remote.apache.url)" == "[email protected]:apache/arrow.git" ]; then
# If apache or upstream are defined use those as remote.
# Otherwise use origin which could be a fork on PRs.
if [ "$(git config --get remote.apache.url)" == "[email protected]:apache/arrow.git" ]; then
yarn doc --gitRemote apache
elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then
yarn doc --gitRemote upstream
elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then
yarn doc
else
echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to [email protected]:apache/arrow.git."
exit 0
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -152,6 +152,7 @@ set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}")
set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")

set(ARROW_LLVM_VERSIONS
"17.0"
"16.0"
"15.0"
"14.0"
24 changes: 14 additions & 10 deletions cpp/cmake_modules/FindLLVMAlt.cmake
@@ -86,16 +86,20 @@ if(LLVM_FOUND)
target_link_libraries(LLVM::LLVM_LIBS INTERFACE LLVM)
else()
# Find the libraries that correspond to the LLVM components
llvm_map_components_to_libnames(LLVM_LIBS
core
mcjit
native
ipo
bitreader
target
linker
analysis
debuginfodwarf)
set(LLVM_TARGET_COMPONENTS
analysis
bitreader
core
debuginfodwarf
ipo
linker
mcjit
native
target)
if(LLVM_VERSION_MAJOR GREATER_EQUAL 14)
list(APPEND LLVM_TARGET_COMPONENTS passes)
endif()
llvm_map_components_to_libnames(LLVM_LIBS ${LLVM_TARGET_COMPONENTS})
target_link_libraries(LLVM::LLVM_LIBS INTERFACE ${LLVM_LIBS})

if(TARGET LLVMSupport AND NOT ARROW_ZSTD_USE_SHARED)
16 changes: 16 additions & 0 deletions cpp/src/arrow/array/array_nested.cc
@@ -627,6 +627,22 @@ std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) cons
return i == -1 ? nullptr : field(i);
}

Status StructArray::CanReferenceFieldByName(const std::string& name) const {
if (GetFieldByName(name) == nullptr) {
return Status::Invalid("Field named '", name,
"' not found or not unique in the struct.");
}
return Status::OK();
}

Status StructArray::CanReferenceFieldsByNames(
const std::vector<std::string>& names) const {
for (const auto& name : names) {
ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name));
}
return Status::OK();
}

Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const {
ArrayVector flattened;
flattened.resize(data_->child_data.size());
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/array_nested.h
@@ -404,6 +404,12 @@ class ARROW_EXPORT StructArray : public Array {
/// Returns null if name not found
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;

/// Indicate if field named `name` can be found unambiguously in the struct.
Status CanReferenceFieldByName(const std::string& name) const;

/// Indicate if fields named `names` can be found unambiguously in the struct.
Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;

/// \brief Flatten this array as a vector of arrays, one for each field
///
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
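A minimal caller-side sketch (hypothetical code, not part of this commit) of how the new helpers can be used: validate all requested field names up front and propagate a descriptive Status::Invalid, instead of checking each GetFieldByName() result for nullptr. The TakeColumns helper and its signature are illustrative only.

#include <string>
#include <vector>

#include <arrow/api.h>

// Collect the named child arrays of a StructArray, failing early if any name
// is missing from, or duplicated in, the struct type.
arrow::Status TakeColumns(const arrow::StructArray& arr,
                          const std::vector<std::string>& names,
                          arrow::ArrayVector* out) {
  ARROW_RETURN_NOT_OK(arr.CanReferenceFieldsByNames(names));
  for (const auto& name : names) {
    out->push_back(arr.GetFieldByName(name));  // guaranteed non-null here
  }
  return arrow::Status::OK();
}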
52 changes: 52 additions & 0 deletions cpp/src/arrow/array/array_struct_test.cc
@@ -303,6 +303,58 @@ TEST(StructArray, FlattenOfSlice) {
ASSERT_OK(arr->ValidateFull());
}

TEST(StructArray, CanReferenceFieldByName) {
auto a = ArrayFromJSON(int8(), "[4, 5]");
auto b = ArrayFromJSON(int16(), "[6, 7]");
auto c = ArrayFromJSON(int32(), "[8, 9]");
auto d = ArrayFromJSON(int64(), "[10, 11]");
auto children = std::vector<std::shared_ptr<Array>>{a, b, c, d};

auto f0 = field("f0", int8());
auto f1 = field("f1", int16());
auto f2 = field("f2", int32());
auto f3 = field("f1", int64());
auto type = struct_({f0, f1, f2, f3});

auto arr = std::make_shared<StructArray>(type, 2, children);

ASSERT_OK(arr->CanReferenceFieldByName("f0"));
ASSERT_OK(arr->CanReferenceFieldByName("f2"));
// Not found
ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("nope"));

// Duplicates
ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("f1"));
}

TEST(StructArray, CanReferenceFieldsByNames) {
auto a = ArrayFromJSON(int8(), "[4, 5]");
auto b = ArrayFromJSON(int16(), "[6, 7]");
auto c = ArrayFromJSON(int32(), "[8, 9]");
auto d = ArrayFromJSON(int64(), "[10, 11]");
auto children = std::vector<std::shared_ptr<Array>>{a, b, c, d};

auto f0 = field("f0", int8());
auto f1 = field("f1", int16());
auto f2 = field("f2", int32());
auto f3 = field("f1", int64());
auto type = struct_({f0, f1, f2, f3});

auto arr = std::make_shared<StructArray>(type, 2, children);

ASSERT_OK(arr->CanReferenceFieldsByNames({"f0", "f2"}));
ASSERT_OK(arr->CanReferenceFieldsByNames({"f2", "f0"}));

// Not found
ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"nope"}));
ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "nope"}));
// Duplicates
ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f1"}));
ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1"}));
// Both
ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1", "nope"}));
}

// ----------------------------------------------------------------------------------
// Struct test
class TestStructBuilder : public ::testing::Test {
9 changes: 4 additions & 5 deletions cpp/src/arrow/arrow-config.cmake
@@ -19,8 +19,7 @@ message(WARNING "find_package(arrow) is deprecated. Use find_package(Arrow) inst
find_package(Arrow CONFIG)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(arrow
REQUIRED_VARS
ARROW_INCLUDE_DIR
VERSION_VAR
ARROW_VERSION)
find_package_handle_standard_args(
arrow
REQUIRED_VARS ARROW_INCLUDE_DIR
VERSION_VAR ARROW_VERSION)
49 changes: 40 additions & 9 deletions cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -251,6 +251,7 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) {
bool saw_date32 = false;
bool saw_date64 = false;
bool saw_duration = false;
bool saw_time_since_midnight = false;
const TypeHolder* end = begin + count;
for (auto it = begin; it != end; it++) {
auto id = it->type->id();
@@ -271,6 +272,18 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) {
finest_unit = std::max(finest_unit, ty.unit());
continue;
}
case Type::TIME32: {
const auto& type = checked_cast<const Time32Type&>(*it->type);
finest_unit = std::max(finest_unit, type.unit());
saw_time_since_midnight = true;
continue;
}
case Type::TIME64: {
const auto& type = checked_cast<const Time64Type&>(*it->type);
finest_unit = std::max(finest_unit, type.unit());
saw_time_since_midnight = true;
continue;
}
case Type::DURATION: {
const auto& ty = checked_cast<const DurationType&>(*it->type);
finest_unit = std::max(finest_unit, ty.unit());
@@ -282,15 +295,33 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) {
}
}

if (timezone) {
// At least one timestamp seen
return timestamp(finest_unit, *timezone);
} else if (saw_date64) {
return date64();
} else if (saw_date32) {
return date32();
} else if (saw_duration) {
return duration(finest_unit);
bool saw_timestamp_or_date = timezone || saw_date64 || saw_date32 || saw_duration;

if (saw_time_since_midnight && saw_timestamp_or_date) {
// Cannot find common type
return TypeHolder(nullptr);
}
if (saw_timestamp_or_date) {
if (timezone) {
// At least one timestamp seen
return timestamp(finest_unit, *timezone);
} else if (saw_date64) {
return date64();
} else if (saw_date32) {
return date32();
} else if (saw_duration) {
return duration(finest_unit);
}
}
if (saw_time_since_midnight) {
switch (finest_unit) {
case TimeUnit::SECOND:
case TimeUnit::MILLI:
return time32(finest_unit);
case TimeUnit::MICRO:
case TimeUnit::NANO:
return time64(finest_unit);
}
}
return TypeHolder(nullptr);
}
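At the public compute layer, this promotion means kernels whose dispatch goes through CommonTemporal (for example "coalesce") should now accept mixed time32/time64 arguments and cast them to the finest unit seen, while time-of-day mixed with dates or timestamps still has no common type. A hedged caller sketch under that assumption (not code from this commit):

#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

// Coalesce a time32[s] array with a time64[ns] array. With the promotion
// above, both inputs should be implicitly cast to time64[ns] (the finest
// unit among the arguments) before the kernel runs.
arrow::Result<arrow::Datum> CoalesceTimes(
    const std::shared_ptr<arrow::Array>& seconds,  // time32[s]
    const std::shared_ptr<arrow::Array>& nanos) {  // time64[ns]
  return arrow::compute::CallFunction("coalesce", {seconds, nanos});
}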
12 changes: 12 additions & 0 deletions cpp/src/arrow/compute/kernels/codegen_internal_test.cc
@@ -159,6 +159,18 @@ TEST(TestDispatchBest, CommonTemporal) {
args = {timestamp(TimeUnit::SECOND, "America/Phoenix"),
timestamp(TimeUnit::SECOND, "UTC")};
ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);

args = {time32(TimeUnit::SECOND), time32(TimeUnit::MILLI)};
AssertTypeEqual(*time32(TimeUnit::MILLI), *CommonTemporal(args.data(), args.size()));

args = {time32(TimeUnit::SECOND), time64(TimeUnit::NANO)};
AssertTypeEqual(*time64(TimeUnit::NANO), *CommonTemporal(args.data(), args.size()));

args = {date32(), time32(TimeUnit::SECOND)};
ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);

args = {timestamp(TimeUnit::SECOND), time32(TimeUnit::SECOND)};
ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
}

TEST(TestDispatchBest, CommonTemporalResolution) {
14 changes: 14 additions & 0 deletions cpp/src/arrow/dataset/file_base.cc
@@ -81,6 +81,20 @@ Result<std::shared_ptr<io::RandomAccessFile>> FileSource::Open() const {
return custom_open_();
}

Future<std::shared_ptr<io::RandomAccessFile>> FileSource::OpenAsync() const {
if (filesystem_) {
return filesystem_->OpenInputFileAsync(file_info_);
}

if (buffer_) {
return Future<std::shared_ptr<io::RandomAccessFile>>::MakeFinished(
std::make_shared<io::BufferReader>(buffer_));
}

// TODO(GH-37962): custom_open_ should not block
return Future<std::shared_ptr<io::RandomAccessFile>>::MakeFinished(custom_open_());
}

int64_t FileSource::Size() const {
if (filesystem_) {
return file_info_.size();
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/file_base.h
@@ -115,6 +115,7 @@ class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {

/// \brief Get a RandomAccessFile which views this file source
Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
Future<std::shared_ptr<io::RandomAccessFile>> OpenAsync() const;

/// \brief Get the size (in bytes) of the file or buffer
/// If the file is compressed this should be the compressed (on-disk) size.
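A hedged usage sketch for the new asynchronous open (hypothetical caller code, not part of this commit): chain a continuation on the returned Future instead of blocking on Open(), here rejecting empty files as soon as the handle is available. The OpenNonEmpty helper name is illustrative only.

#include <cstdint>
#include <memory>

#include <arrow/dataset/file_base.h>
#include <arrow/io/interfaces.h>
#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/util/future.h>

// Open the source without blocking; once the file is ready, validate it.
arrow::Future<std::shared_ptr<arrow::io::RandomAccessFile>> OpenNonEmpty(
    const arrow::dataset::FileSource& source) {
  return source.OpenAsync().Then(
      [](const std::shared_ptr<arrow::io::RandomAccessFile>& file)
          -> arrow::Result<std::shared_ptr<arrow::io::RandomAccessFile>> {
        ARROW_ASSIGN_OR_RAISE(int64_t size, file->GetSize());
        if (size == 0) {
          return arrow::Status::Invalid("empty file");
        }
        return file;
      });
}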
(Remaining changed files not shown.)

0 comments on commit ed1b46f
