From 9ce3a36c25cabf46c77bd55e132e938cfef2e339 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 12:16:56 -0500 Subject: [PATCH 01/13] use ben's integration tests for job --- ci/docker/integration.dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/docker/integration.dockerfile b/ci/docker/integration.dockerfile index ac7c77164..d28e707c3 100644 --- a/ci/docker/integration.dockerfile +++ b/ci/docker/integration.dockerfile @@ -42,7 +42,8 @@ ENV ARROW_RUST_EXE_PATH=/build/rust/debug ENV BUILD_DOCS_CPP=OFF # Clone the arrow monorepo -RUN git clone https://github.com/apache/arrow.git /arrow-integration --recurse-submodules +RUN git clone https://github.com/bkietz/arrow.git /arrow-integration --recurse-submodules && \ + cd /arrow-integration && git switch nanoarrow-integration-tests # Clone the arrow-rs repo RUN git clone https://github.com/apache/arrow-rs /arrow-integration/rust From 030b29bf20aa2570c25fa860038cebd8d2780624 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 13:58:58 -0500 Subject: [PATCH 02/13] only validate strictly required offsets --- src/nanoarrow/common/array.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 6446d4b03..eb1ab8fda 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -1249,13 +1249,23 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowError* error) { for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { + // Only validate the portion of the buffer that is strictly required, + // which includes not validating the offset buffer of a zero-length array. case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - if (array_view->layout.element_size_bits[i] == 32) { - NANOARROW_RETURN_NOT_OK( - ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); - } else { - NANOARROW_RETURN_NOT_OK( - ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); + if (array_view->layout.element_size_bits[i] == 32 && array_view->length > 0) { + struct ArrowBufferView offset_minimal; + offset_minimal.data.as_int32 = + array_view->buffer_views[i].data.as_int32 + array_view->offset; + offset_minimal.size_bytes = + (array_view->offset + array_view->length + 1) * sizeof(int32_t); + NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(offset_minimal, error)); + } else if (array_view->length > 0) { + struct ArrowBufferView offset_minimal; + offset_minimal.data.as_int64 = + array_view->buffer_views[i].data.as_int64 + array_view->offset; + offset_minimal.size_bytes = + (array_view->offset + array_view->length + 1) * sizeof(int64_t); + NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt64(offset_minimal, error)); } break; default: From 20dc619deb9a8f0e12c730dbc9b6ad8012f3e5cb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 15:14:27 -0500 Subject: [PATCH 03/13] run integration workflow on dockerfile change --- .github/workflows/integration.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 8a2e9d638..0d7e3e879 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -25,6 +25,7 @@ on: pull_request: paths: - .github/workflows/integration.yaml + - ci/docker/integration.dockerfile - docker-compose.yml schedule: - cron: '5 0 * * 0' From 44d2b3972de689b203a9eab76f9774894d0a12f7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 15:26:44 -0500 Subject: [PATCH 04/13] fix byte sizes --- src/nanoarrow/common/array.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index eb1ab8fda..ca4b8841e 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -1256,15 +1256,13 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowBufferView offset_minimal; offset_minimal.data.as_int32 = array_view->buffer_views[i].data.as_int32 + array_view->offset; - offset_minimal.size_bytes = - (array_view->offset + array_view->length + 1) * sizeof(int32_t); + offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int32_t); NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(offset_minimal, error)); } else if (array_view->length > 0) { struct ArrowBufferView offset_minimal; offset_minimal.data.as_int64 = array_view->buffer_views[i].data.as_int64 + array_view->offset; - offset_minimal.size_bytes = - (array_view->offset + array_view->length + 1) * sizeof(int64_t); + offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int64_t); NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt64(offset_minimal, error)); } break; From 09645c55dc8f7963ba2f17981617f7f37dc60f7c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 15:49:45 -0500 Subject: [PATCH 05/13] check sliced offsets --- src/nanoarrow/common/array.c | 8 ++++---- src/nanoarrow/common/array_test.cc | 26 +++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index ca4b8841e..11892c927 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -990,7 +990,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int32[0]; + first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, first_offset); @@ -1021,7 +1021,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int64[0]; + first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, first_offset); @@ -1065,7 +1065,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int32[0]; + first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, first_offset); @@ -1087,7 +1087,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int64[0]; + first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, first_offset); diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index dec7366fa..18c045077 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2138,6 +2138,10 @@ TEST(ArrayTest, ArrayViewTestString) { EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0); EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0); + // This should pass validation even if all buffers are empty + ASSERT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + ArrowArrayViewSetLength(&array_view, 5); EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1); EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int32_t)); @@ -2192,15 +2196,35 @@ TEST(ArrayTest, ArrayViewTestString) { offsets[0] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); - offsets[0] = 0; + // For a sliced array, this can still pass validation + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + + // Check for negative element sizes + array.offset = 0; + array.length = array.length + 1; + offsets[0] = 0; offsets[1] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + // Sliced array should also fail validation because the first element is negative + array.offset = 0; + array.length = array_view.length + 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + // Check sequential offsets whose diff causes overflow + array.offset = 0; + array.length = array.length + 1; offsets[1] = 2080374784; offsets[2] = INT_MIN; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); From f6a2b36a4ae0bd7af3b2ac529b9e11eb6c8446ae Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 15:55:53 -0500 Subject: [PATCH 06/13] test large string --- src/nanoarrow/common/array_test.cc | 36 ++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 18c045077..53a8e6ae3 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2256,6 +2256,10 @@ TEST(ArrayTest, ArrayViewTestLargeString) { EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0); EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0); + // This should pass validation even if all buffers are empty + ASSERT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + ArrowArrayViewSetLength(&array_view, 5); EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1); EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int64_t)); @@ -2304,17 +2308,41 @@ TEST(ArrayTest, ArrayViewTestLargeString) { int64_t* offsets = const_cast(reinterpret_cast(array.buffers[1])); - offsets[0] = -1; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); - EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); - offsets[0] = 0; + // For a sliced array, this can still pass validation + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + // Check for negative element sizes + array.offset = 0; + array.length = array.length + 1; + offsets[0] = 0; offsets[1] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + // Sliced array should also fail validation because the first element is negative + array.offset = 0; + array.length = array_view.length + 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + + // Check sequential offsets whose diff causes overflow + array.offset = 0; + array.length = array.length + 1; + offsets[1] = 2080374784; + offsets[2] = INT_MIN; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, "[2] Expected element size >= 0"); + ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); } From 04e350c0b993f421e7d0225268f9a7b20900fd13 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 16:16:45 -0500 Subject: [PATCH 07/13] more tests --- src/nanoarrow/common/array_test.cc | 80 +++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 53a8e6ae3..970a9f41d 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2139,7 +2139,7 @@ TEST(ArrayTest, ArrayViewTestString) { EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0); // This should pass validation even if all buffers are empty - ASSERT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), NANOARROW_OK); ArrowArrayViewSetLength(&array_view, 5); @@ -2257,7 +2257,7 @@ TEST(ArrayTest, ArrayViewTestLargeString) { EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0); // This should pass validation even if all buffers are empty - ASSERT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), NANOARROW_OK); ArrowArrayViewSetLength(&array_view, 5); @@ -2308,6 +2308,10 @@ TEST(ArrayTest, ArrayViewTestLargeString) { int64_t* offsets = const_cast(reinterpret_cast(array.buffers[1])); + offsets[0] = -1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); + // For a sliced array, this can still pass validation array.offset = 1; array.length = array_view.length - 1; @@ -2382,6 +2386,7 @@ TEST(ArrayTest, ArrayViewTestStruct) { TEST(ArrayTest, ArrayViewTestList) { struct ArrowArrayView array_view; + struct ArrowError error; ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_LIST); EXPECT_EQ(array_view.array, nullptr); @@ -2396,6 +2401,15 @@ TEST(ArrayTest, ArrayViewTestList) { ArrowArrayViewInitFromType(array_view.children[0], NANOARROW_TYPE_INT32); EXPECT_EQ(array_view.children[0]->storage_type, NANOARROW_TYPE_INT32); + // Can't assume the offsets buffer exists for length == 0 + ArrowArrayViewSetLength(&array_view, 0); + EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0); + EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0); + + // This should pass validation even if all buffers are empty + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + ArrowArrayViewSetLength(&array_view, 5); EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1); EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int32_t)); @@ -2416,21 +2430,48 @@ TEST(ArrayTest, ArrayViewTestList) { NANOARROW_OK); // Expect error for offsets that will cause bad access - struct ArrowError error; int32_t* offsets = const_cast(reinterpret_cast(array.buffers[1])); offsets[0] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); - offsets[0] = 0; + // For a sliced array, this can still pass validation + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + + // Check for negative element sizes + array.offset = 0; + array.length = array.length + 1; + offsets[0] = 0; offsets[1] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + // Sliced array should also fail validation because the first element is negative + array.offset = 0; + array.length = array_view.length + 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + + // Check sequential offsets whose diff causes overflow + array.offset = 0; + array.length = array.length + 1; + offsets[1] = 2080374784; + offsets[2] = INT_MIN; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, "[2] Expected element size >= 0"); + ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); } @@ -2503,6 +2544,7 @@ TEST(ArrayTest, ArrayViewTestLargeListGet) { TEST(ArrayTest, ArrayViewTestLargeList) { struct ArrowArrayView array_view; + struct ArrowError error; ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_LARGE_LIST); EXPECT_EQ(array_view.array, nullptr); @@ -2517,6 +2559,15 @@ TEST(ArrayTest, ArrayViewTestLargeList) { ArrowArrayViewInitFromType(array_view.children[0], NANOARROW_TYPE_INT32); EXPECT_EQ(array_view.children[0]->storage_type, NANOARROW_TYPE_INT32); + // Can't assume the offsets buffer exists for length == 0 + ArrowArrayViewSetLength(&array_view, 0); + EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0); + EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0); + + // This should pass validation even if all buffers are empty + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + ArrowArrayViewSetLength(&array_view, 5); EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1); EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int64_t)); @@ -2537,21 +2588,38 @@ TEST(ArrayTest, ArrayViewTestLargeList) { NANOARROW_OK); // Expect error for offsets that will cause bad access - struct ArrowError error; int64_t* offsets = const_cast(reinterpret_cast(array.buffers[1])); offsets[0] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); - offsets[0] = 0; + // For a sliced array, this can still pass validation + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + + // Check for negative element sizes + array.offset = 0; + array.length = array.length + 1; + offsets[0] = 0; offsets[1] = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + // Sliced array should also fail validation because the first element is negative + array.offset = 0; + array.length = array_view.length + 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); } From 6700f219339e0c23880d7e6cfdfb3c52c3933ba8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 16:23:18 -0500 Subject: [PATCH 08/13] remove non applicable section --- src/nanoarrow/common/array_test.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 970a9f41d..247ac5dbf 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2337,16 +2337,6 @@ TEST(ArrayTest, ArrayViewTestLargeString) { EINVAL); EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); - // Check sequential offsets whose diff causes overflow - array.offset = 0; - array.length = array.length + 1; - offsets[1] = 2080374784; - offsets[2] = INT_MIN; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); - EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), - EINVAL); - EXPECT_STREQ(error.message, "[2] Expected element size >= 0"); - ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); } From f11600d392e1952f94bb5b7b98b0b06a6d26d0e8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 16:57:15 -0500 Subject: [PATCH 09/13] maybe fix tests again --- src/nanoarrow/common/array.c | 22 ++++++++++ src/nanoarrow/common/array_test.cc | 70 +++++++++++++++--------------- 2 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 11892c927..5842712f8 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -998,6 +998,11 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { @@ -1029,6 +1034,11 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { @@ -1073,6 +1083,12 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + if (array_view->children[0]->length < last_offset) { ArrowErrorSet(error, "Expected child of %s array to have length >= %" PRId64 @@ -1095,6 +1111,12 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + if (array_view->children[0]->length < last_offset) { ArrowErrorSet(error, "Expected child of large list array to have length >= %" PRId64 diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 247ac5dbf..1d84abaa1 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2215,22 +2215,20 @@ TEST(ArrayTest, ArrayViewTestString) { EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); // Sliced array should also fail validation because the first element is negative - array.offset = 0; - array.length = array_view.length + 1; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); - EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), - EINVAL); - EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); // Check sequential offsets whose diff causes overflow array.offset = 0; array.length = array.length + 1; - offsets[1] = 2080374784; - offsets[2] = INT_MIN; + offsets[0] = 2080374784; + offsets[1] = INT_MIN; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); - EXPECT_STREQ(error.message, "[2] Expected element size >= 0"); + EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); @@ -2330,12 +2328,10 @@ TEST(ArrayTest, ArrayViewTestLargeString) { EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); // Sliced array should also fail validation because the first element is negative - array.offset = 0; - array.length = array_view.length + 1; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); - EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), - EINVAL); - EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); @@ -2404,7 +2400,7 @@ TEST(ArrayTest, ArrayViewTestList) { EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1); EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int32_t)); - // Build a valid array + // Build a valid array ([[1234], []]) struct ArrowArray array; ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_LIST), NANOARROW_OK); ASSERT_EQ(ArrowArrayAllocateChildren(&array, 1), NANOARROW_OK); @@ -2413,6 +2409,7 @@ TEST(ArrayTest, ArrayViewTestList) { ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK); ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 1234), NANOARROW_OK); ASSERT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK); + ASSERT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK); ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK); @@ -2445,22 +2442,18 @@ TEST(ArrayTest, ArrayViewTestList) { EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); // Sliced array should also fail validation because the first element is negative - array.offset = 0; - array.length = array_view.length + 1; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); - EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), - EINVAL); - EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); - // Check sequential offsets whose diff causes overflow + // Check for last element >= 0 array.offset = 0; array.length = array.length + 1; - offsets[1] = 2080374784; - offsets[2] = INT_MIN; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); - EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), - EINVAL); - EXPECT_STREQ(error.message, "[2] Expected element size >= 0"); + offsets[1] = 1; + offsets[2] = -1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected last offset >= 0 but found -1"); ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); @@ -2562,7 +2555,7 @@ TEST(ArrayTest, ArrayViewTestLargeList) { EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1); EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int64_t)); - // Build a valid array + // Build a valid array ([[1234], []]) struct ArrowArray array; ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_LARGE_LIST), NANOARROW_OK); ASSERT_EQ(ArrowArrayAllocateChildren(&array, 1), NANOARROW_OK); @@ -2571,6 +2564,7 @@ TEST(ArrayTest, ArrayViewTestLargeList) { ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK); ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 1234), NANOARROW_OK); ASSERT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK); + ASSERT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK); ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK); @@ -2603,12 +2597,18 @@ TEST(ArrayTest, ArrayViewTestLargeList) { EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); // Sliced array should also fail validation because the first element is negative + array.offset = 1; + array.length = array_view.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1"); + + // Check for last element >= 0 array.offset = 0; - array.length = array_view.length + 1; - EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); - EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), - EINVAL); - EXPECT_STREQ(error.message, "[1] Expected element size >= 0"); + array.length = array.length + 1; + offsets[1] = 1; + offsets[2] = -1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected last offset >= 0 but found -1"); ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); From b95bb3706850503dd339e7d72f27479f75305d48 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 21:39:07 -0500 Subject: [PATCH 10/13] Update src/nanoarrow/common/array.c Co-authored-by: Benjamin Kietzman --- src/nanoarrow/common/array.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 5842712f8..b033bb05b 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -1274,13 +1274,16 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, // Only validate the portion of the buffer that is strictly required, // which includes not validating the offset buffer of a zero-length array. case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - if (array_view->layout.element_size_bits[i] == 32 && array_view->length > 0) { + if (array_view->length == 0) { + continue; + } + if (array_view->layout.element_size_bits[i] == 32) { struct ArrowBufferView offset_minimal; offset_minimal.data.as_int32 = array_view->buffer_views[i].data.as_int32 + array_view->offset; offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int32_t); NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(offset_minimal, error)); - } else if (array_view->length > 0) { + } else { struct ArrowBufferView offset_minimal; offset_minimal.data.as_int64 = array_view->buffer_views[i].data.as_int64 + array_view->offset; From 87c4a97a676a9726565064ca65410eb9df9201cd Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 21:49:41 -0500 Subject: [PATCH 11/13] fix integration docker --- ci/docker/integration.dockerfile | 3 +-- docker-compose.yml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ci/docker/integration.dockerfile b/ci/docker/integration.dockerfile index d28e707c3..ac7c77164 100644 --- a/ci/docker/integration.dockerfile +++ b/ci/docker/integration.dockerfile @@ -42,8 +42,7 @@ ENV ARROW_RUST_EXE_PATH=/build/rust/debug ENV BUILD_DOCS_CPP=OFF # Clone the arrow monorepo -RUN git clone https://github.com/bkietz/arrow.git /arrow-integration --recurse-submodules && \ - cd /arrow-integration && git switch nanoarrow-integration-tests +RUN git clone https://github.com/apache/arrow.git /arrow-integration --recurse-submodules # Clone the arrow-rs repo RUN git clone https://github.com/apache/arrow-rs /arrow-integration/rust diff --git a/docker-compose.yml b/docker-compose.yml index 2fb86c935..bd651cb7f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -54,7 +54,7 @@ services: volumes: - ${NANOARROW_DOCKER_SOURCE_DIR}:/arrow-integration/nanoarrow environment: - ARCHERY_INTEGRATION_TARGET_LANGUAGES: "nanoarrow" + ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS: "nanoarrow" command: ["echo '::group::Build nanoarrow' && conda run --no-capture-output /arrow-integration/ci/scripts/nanoarrow_build.sh /arrow-integration /build && From 44141da07e99ba489555685bf66f422f34b76fc1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 23:42:07 -0500 Subject: [PATCH 12/13] test more offset things --- src/nanoarrow/common/array.c | 2 +- src/nanoarrow/common/array_test.cc | 56 ++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index b033bb05b..72cb52033 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -1034,7 +1034,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; - if (first_offset < 0) { + if (last_offset < 0) { ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, last_offset); return EINVAL; diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 1d84abaa1..39f1352bc 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2204,11 +2204,26 @@ TEST(ArrayTest, ArrayViewTestString) { EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), NANOARROW_OK); - // Check for negative element sizes + // Check for last element >= 0 array.offset = 0; array.length = array.length + 1; offsets[0] = 0; + offsets[1] = 1; + offsets[2] = -1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected last offset >= 0 but found -1"); + + // ...but the array should be valid if we make it one element shorter + array.length = array.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + + // Check for negative element sizes + array.length = array.length + 1; + offsets[0] = 0; offsets[1] = -1; + offsets[2] = 2; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); @@ -2287,20 +2302,22 @@ TEST(ArrayTest, ArrayViewTestLargeString) { EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0); EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0); - // Build non-zero length (the array ["abcd"]) + // Build non-zero length (the array ["abcd", "efg"]) ASSERT_EQ(ArrowBufferAppendInt64(ArrowArrayBuffer(&array, 1), 0), NANOARROW_OK); ASSERT_EQ(ArrowBufferAppendInt64(ArrowArrayBuffer(&array, 1), 4), NANOARROW_OK); + ASSERT_EQ(ArrowBufferAppendInt64(ArrowArrayBuffer(&array, 1), 7), NANOARROW_OK); ASSERT_EQ(ArrowBufferReserve(ArrowArrayBuffer(&array, 2), 4), NANOARROW_OK); ArrowBufferAppendUnsafe(ArrowArrayBuffer(&array, 2), "abcd", 4); - array.length = 1; + ArrowBufferAppendUnsafe(ArrowArrayBuffer(&array, 2), "efg", 3); + array.length = 2; ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), NANOARROW_OK); EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0); - EXPECT_EQ(array_view.buffer_views[1].size_bytes, (1 + 1) * sizeof(int64_t)); - EXPECT_EQ(array_view.buffer_views[2].size_bytes, 4); + EXPECT_EQ(array_view.buffer_views[1].size_bytes, (1 + 2) * sizeof(int64_t)); + EXPECT_EQ(array_view.buffer_views[2].size_bytes, 7); // Expect error for offsets that will cause bad access int64_t* offsets = @@ -2317,11 +2334,26 @@ TEST(ArrayTest, ArrayViewTestLargeString) { EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), NANOARROW_OK); - // Check for negative element sizes + // Check for last element >= 0 array.offset = 0; array.length = array.length + 1; offsets[0] = 0; + offsets[1] = 1; + offsets[2] = -1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); + EXPECT_STREQ(error.message, "Expected last offset >= 0 but found -1"); + + // ...but the array should be valid if we make it one element shorter + array.length = array.length - 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + + // Check for negative element sizes + array.length = array.length + 1; + offsets[0] = 0; offsets[1] = -1; + offsets[2] = 2; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), EINVAL); @@ -2455,6 +2487,12 @@ TEST(ArrayTest, ArrayViewTestList) { EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); EXPECT_STREQ(error.message, "Expected last offset >= 0 but found -1"); + // ...but the array should be valid if we make it one element shorter + array.length = 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); } @@ -2610,6 +2648,12 @@ TEST(ArrayTest, ArrayViewTestLargeList) { EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); EXPECT_STREQ(error.message, "Expected last offset >= 0 but found -1"); + // ...but the array should be valid if we make it one element shorter + array.length = 1; + EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + ArrowArrayRelease(&array); ArrowArrayViewReset(&array_view); } From 69ceefbc7d86fbadec033b4247ffc522b7599575 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Sep 2024 23:54:31 -0500 Subject: [PATCH 13/13] fix sanitizer --- src/nanoarrow/common/array_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 39f1352bc..eb770d4fd 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -2306,7 +2306,7 @@ TEST(ArrayTest, ArrayViewTestLargeString) { ASSERT_EQ(ArrowBufferAppendInt64(ArrowArrayBuffer(&array, 1), 0), NANOARROW_OK); ASSERT_EQ(ArrowBufferAppendInt64(ArrowArrayBuffer(&array, 1), 4), NANOARROW_OK); ASSERT_EQ(ArrowBufferAppendInt64(ArrowArrayBuffer(&array, 1), 7), NANOARROW_OK); - ASSERT_EQ(ArrowBufferReserve(ArrowArrayBuffer(&array, 2), 4), NANOARROW_OK); + ASSERT_EQ(ArrowBufferReserve(ArrowArrayBuffer(&array, 2), 7), NANOARROW_OK); ArrowBufferAppendUnsafe(ArrowArrayBuffer(&array, 2), "abcd", 4); ArrowBufferAppendUnsafe(ArrowArrayBuffer(&array, 2), "efg", 3); array.length = 2;