Skip to content

Commit

Permalink
Merge branch 'apache:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Yimche authored Oct 26, 2024
2 parents 39138e2 + a3c39ec commit de5a11c
Show file tree
Hide file tree
Showing 45 changed files with 841 additions and 257 deletions.
2 changes: 1 addition & 1 deletion .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# under the License.

github:
description: "Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing"
description: "Apache Arrow is the universal columnar format and multi-language toolbox for fast data interchange and in-memory analytics"
homepage: https://arrow.apache.org/
collaborators:
- anjakefala
Expand Down
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ repos:
hooks:
- id: clang-format
name: C++ Format
alias: cpp-format
types_or:
- c++
# - json
Expand Down Expand Up @@ -103,20 +104,23 @@ repos:
hooks:
- id: clang-format
name: C/GLib Format
alias: c-glib-cpp-format
files: >-
^c_glib/
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v14.0.6
hooks:
- id: clang-format
name: MATLAB (C++) Format
alias: matlab-cpp-format
files: >-
^matlab/src/cpp/
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v14.0.6
hooks:
- id: clang-format
name: Python (C++) Format
alias: python-cpp-format
files: >-
^python/pyarrow/src/
exclude: >-
Expand All @@ -130,6 +134,7 @@ repos:
hooks:
- id: clang-format
name: R (C++) Format
alias: r-cpp-format
files: >-
^r/src/
exclude: >-
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@

## Powering In-Memory Analytics

Apache Arrow is a development platform for in-memory analytics. It contains a
set of technologies that enable big data systems to process and move data fast.
Apache Arrow is a universal columnar format and multi-language toolbox for fast
data interchange and in-memory analytics. It contains a set of technologies that
enable data systems to efficiently store, process, and move data.

Major components of the project include:

Expand Down
6 changes: 3 additions & 3 deletions c_glib/arrow-glib/basic-data-type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1660,9 +1660,9 @@ enum {
PROP_STORAGE_DATA_TYPE = 1
};

G_DEFINE_TYPE_WITH_PRIVATE(GArrowExtensionDataType,
garrow_extension_data_type,
GARROW_TYPE_DATA_TYPE)
G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowExtensionDataType,
garrow_extension_data_type,
GARROW_TYPE_DATA_TYPE)

#define GARROW_EXTENSION_DATA_TYPE_GET_PRIVATE(obj) \
static_cast<GArrowExtensionDataTypePrivate *>( \
Expand Down
6 changes: 6 additions & 0 deletions c_glib/test/test-extension-data-type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,10 @@ def test_wrap_chunked_array
extension_chunked_array.chunks.collect(&:class),
])
end

def test_abstract_class
  # Instantiating Arrow::ExtensionDataType directly is expected to fail
  # with TypeError.
  assert_raise(TypeError) { Arrow::ExtensionDataType.new }
end
end
2 changes: 1 addition & 1 deletion ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ set PARQUET_HOME=%CONDA_PREFIX%\Library

@rem Download IANA Timezone Database to a non-standard location to
@rem test the configurability of the timezone database path
curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz || exit /B
curl https://data.iana.org/time-zones/releases/tzdata2024b.tar.gz --output tzdata.tar.gz || exit /B
mkdir %USERPROFILE%\Downloads\test\tzdata
tar --extract --file tzdata.tar.gz --directory %USERPROFILE%\Downloads\test\tzdata
curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^
Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/download_tz_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
set -ex

# Download database
curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output ~/Downloads/tzdata2021e.tar.gz
curl https://data.iana.org/time-zones/releases/tzdata2024b.tar.gz --output ~/Downloads/tzdata.tar.gz

# Extract
mkdir -p ~/Downloads/tzdata
tar --extract --file ~/Downloads/tzdata2021e.tar.gz --directory ~/Downloads/tzdata
tar --extract --file ~/Downloads/tzdata.tar.gz --directory ~/Downloads/tzdata

# Download Windows timezone mapping
curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml --output ~/Downloads/tzdata/windowsZones.xml
5 changes: 5 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,11 @@ Always OFF if building binaries" OFF)
"Compiler flags to append when pre-compiling Gandiva operations"
"")

#----------------------------------------------------------------------
set_option_category("Cross compiling")

define_option_string(ARROW_GRPC_CPP_PLUGIN "grpc_cpp_plugin path to be used" "")

#----------------------------------------------------------------------
set_option_category("Advanced developer")

Expand Down
8 changes: 8 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4223,6 +4223,14 @@ if(ARROW_WITH_GRPC)
target_link_libraries(gRPC::grpc++ INTERFACE gRPC::grpc_asan_suppressed)
endif()
endif()

# If the user supplied ARROW_GRPC_CPP_PLUGIN, make the gRPC::grpc_cpp_plugin
# target refer to that executable: declare it as an imported executable when
# no such target exists yet, then set its IMPORTED_LOCATION to the given path.
if(ARROW_GRPC_CPP_PLUGIN)
if(NOT TARGET gRPC::grpc_cpp_plugin)
add_executable(gRPC::grpc_cpp_plugin IMPORTED)
endif()
set_target_properties(gRPC::grpc_cpp_plugin PROPERTIES IMPORTED_LOCATION
${ARROW_GRPC_CPP_PLUGIN})
endif()
endif()

# ----------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ endif()

set(ARROW_VENDORED_SRCS
vendored/base64.cpp
vendored/datetime/tz.cpp
vendored/datetime.cpp
vendored/double-conversion/bignum-dtoa.cc
vendored/double-conversion/bignum.cc
vendored/double-conversion/cached-powers.cc
Expand Down Expand Up @@ -488,7 +488,7 @@ set(ARROW_VENDORED_SRCS
if(APPLE)
list(APPEND ARROW_VENDORED_SRCS vendored/datetime/ios.mm)
endif()
set_source_files_properties(vendored/datetime/tz.cpp
set_source_files_properties(vendored/datetime.cpp
PROPERTIES SKIP_PRECOMPILE_HEADERS ON
SKIP_UNITY_BUILD_INCLUSION ON)
arrow_add_object_library(ARROW_VENDORED ${ARROW_VENDORED_SRCS})
Expand Down
29 changes: 22 additions & 7 deletions cpp/src/arrow/io/buffered.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class BufferedBase {
return !is_open_;
}

// Allocate buffer_ if needed, and resize it to buffer_size_ if required.
Status ResetBuffer() {
if (!buffer_) {
// On first invocation, or if the buffer has been released, we allocate a
Expand Down Expand Up @@ -283,18 +284,32 @@ class BufferedInputStream::Impl : public BufferedBase {
}

// Resize internal read buffer. Note that the internal buffer-size
// should be not larger than the raw_read_bound_.
// should not be larger than the raw_read_bound_.
// It might change the buffer_size_, but will not change buffer states
// buffer_pos_ and bytes_buffered_.
Status SetBufferSize(int64_t new_buffer_size) {
if (new_buffer_size <= 0) {
return Status::Invalid("Buffer size should be positive");
}
if ((buffer_pos_ + bytes_buffered_) >= new_buffer_size) {
return Status::Invalid("Cannot shrink read buffer if buffered data remains");
return Status::Invalid(
"Cannot shrink read buffer if buffered data remains, new_buffer_size: ",
new_buffer_size, ", buffer_pos: ", buffer_pos_,
", bytes_buffered: ", bytes_buffered_, ", buffer_size: ", buffer_size_);
}
if (raw_read_bound_ >= 0) {
// No need to reserve space for more than the total remaining number of bytes.
new_buffer_size = std::min(new_buffer_size,
bytes_buffered_ + (raw_read_bound_ - raw_read_total_));
if (bytes_buffered_ == 0) {
// Special case: the current buffer holds no unread bytes, so its contents
// do not need to be preserved.
new_buffer_size = std::min(new_buffer_size, raw_read_bound_ - raw_read_total_);
} else {
// We should keep the current buffer because it contains data that
// can be read.
new_buffer_size =
std::min(new_buffer_size,
buffer_pos_ + bytes_buffered_ + (raw_read_bound_ - raw_read_total_));
}
}
return ResizeBuffer(new_buffer_size);
}
Expand Down Expand Up @@ -350,7 +365,7 @@ class BufferedInputStream::Impl : public BufferedBase {
}

Status DoBuffer() {
// Fill buffer
// Fill the buffer from the raw stream with at most `buffer_size_` bytes.
if (!buffer_) {
RETURN_NOT_OK(ResetBuffer());
}
Expand Down Expand Up @@ -444,8 +459,8 @@ class BufferedInputStream::Impl : public BufferedBase {
// The default -1 indicates that it is unbounded
int64_t raw_read_bound_;

// Number of remaining bytes in the buffer, to be reduced on each read from
// the buffer
// Number of remaining valid bytes in the buffer, to be reduced on each read
// from the buffer.
int64_t bytes_buffered_;
};

Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/io/buffered.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class ARROW_EXPORT BufferedInputStream
int64_t raw_read_bound = -1);

/// \brief Resize internal read buffer; calls to Read(...) will read at least
/// this many bytes from the raw InputStream if possible.
/// \param[in] new_buffer_size the new read buffer size
/// \return Status
Status SetBufferSize(int64_t new_buffer_size);
Expand Down
23 changes: 23 additions & 0 deletions cpp/src/arrow/io/buffered_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,29 @@ TEST_F(TestBufferedInputStream, BufferSizeLimit) {
}
}

// Checks that Peek() requests larger than the currently buffered data return
// the right bytes, and that a later Read() still returns the peeked bytes.
TEST_F(TestBufferedInputStream, PeekPastBufferedBytes) {
// GH-43949: Peek and SetBufferSize should not affect the
// buffered bytes.
// Set up the stream with buffer_size=10 and raw_read_bound=15.
MakeExample1(/*buffer_size=*/10, default_memory_pool(), /*raw_read_bound=*/15);
// Consume the first 9 bytes; 1 unread byte is expected to remain buffered.
ASSERT_OK_AND_ASSIGN(auto bytes, buffered_->Read(9));
EXPECT_EQ(std::string_view(*bytes), kExample1.substr(0, 9));
ASSERT_EQ(1, buffered_->bytes_buffered());
ASSERT_EQ(10, buffered_->buffer_size());
// Peek 3 bytes, i.e. more than the single byte currently buffered. The view
// must start at offset 9, and buffer_size() is expected to grow to 12.
ASSERT_OK_AND_ASSIGN(auto view, buffered_->Peek(3));
EXPECT_EQ(view, kExample1.substr(9, 3));
ASSERT_EQ(3, buffered_->bytes_buffered());
ASSERT_EQ(12, buffered_->buffer_size());
// Ask for 10 bytes although only 6 (offsets 9..14) remain before the bound.
ASSERT_OK_AND_ASSIGN(view, buffered_->Peek(10));
// Peek() cannot go past the `raw_read_bound`
EXPECT_EQ(view, kExample1.substr(9, 6));
ASSERT_EQ(6, buffered_->bytes_buffered());
ASSERT_EQ(15, buffered_->buffer_size());
// Do read
// The previously peeked bytes must be returned unchanged, leaving nothing
// buffered afterwards.
ASSERT_OK_AND_ASSIGN(bytes, buffered_->Read(6));
EXPECT_EQ(std::string_view(*bytes), kExample1.substr(9, 6));
ASSERT_EQ(0, buffered_->bytes_buffered());
}

class TestBufferedInputStreamBound : public ::testing::Test {
public:
void SetUp() { CreateExample(/*bounded=*/true); }
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/vendored/datetime.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "datetime/visibility.h"
#include "datetime/tz.cpp"
7 changes: 4 additions & 3 deletions cpp/src/arrow/vendored/datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@

#pragma once

#include "arrow/vendored/datetime/date.h" // IWYU pragma: export
#include "arrow/vendored/datetime/tz.h" // IWYU pragma: export
#include "arrow/vendored/datetime/visibility.h" // IWYU pragma: export
#include "arrow/vendored/datetime/date.h" // IWYU pragma: export
#include "arrow/vendored/datetime/tz.h" // IWYU pragma: export

// Can be defined by date.h.
#ifdef NOEXCEPT
#undef NOEXCEPT
# undef NOEXCEPT
#endif
12 changes: 8 additions & 4 deletions cpp/src/arrow/vendored/datetime/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,16 @@ copies or substantial portions of the Software.
Sources for datetime are adapted from Howard Hinnant's date library
(https://github.com/HowardHinnant/date).

Sources are taken from changeset 1ead6715dec030d340a316c927c877a3c4e5a00c
Sources are taken from changeset 5bdb7e6f31fac909c090a46dbd9fea27b6e609a4
of the above project.

The following changes are made:
- fix internal inclusion paths (from "date/xxx.h" to simply "xxx.h")
- enclose the `date` namespace inside the `arrow_vendored` namespace
- include a custom "visibility.h" header from "tz.cpp" for proper DLL
exports on Windows
- disable curl-based database downloading in "tz.h"

## How to update

```console
$ cd cpp/src/arrow/vendored/datetime
$ ./update.sh 3.0.3
```
7 changes: 2 additions & 5 deletions cpp/src/arrow/vendored/datetime/date.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@
# pragma warning(disable : 4127)
#endif

namespace arrow_vendored
{
namespace date
namespace arrow_vendored::date
{

//---------------+
Expand Down Expand Up @@ -8234,8 +8232,7 @@ operator<<(std::basic_ostream<CharT, Traits>& os,
detail::get_units<CharT>(typename Period::type{});
}

} // namespace date
} // namespace arrow_vendored
} // namespace arrow_vendored::date

#ifdef _MSC_VER
# pragma warning(pop)
Expand Down
7 changes: 2 additions & 5 deletions cpp/src/arrow/vendored/datetime/ios.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@
# if TARGET_OS_IPHONE
# include <string>

namespace arrow_vendored
{
namespace date
namespace arrow_vendored::date
{
namespace iOSUtils
{
Expand All @@ -43,8 +41,7 @@
std::string get_current_timezone();

} // namespace iOSUtils
} // namespace date
} // namespace arrow_vendored
} // namespace arrow_vendored::date

# endif // TARGET_OS_IPHONE
#else // !__APPLE__
Expand Down
7 changes: 2 additions & 5 deletions cpp/src/arrow/vendored/datetime/ios.mm
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@
#define TAR_SIZE_POSITION 124
#define TAR_SIZE_SIZE 12

namespace arrow_vendored
{
namespace date
namespace arrow_vendored::date
{
namespace iOSUtils
{
Expand Down Expand Up @@ -334,7 +332,6 @@ bool writeFile(const std::string &tzdataPath, const std::string &fileName,
}

} // namespace iOSUtils
} // namespace date
} // namespace arrow_vendored
} // namespace arrow_vendored::date

#endif // TARGET_OS_IPHONE
Loading

0 comments on commit de5a11c

Please sign in to comment.