Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Improve validation of offset buffers for sliced arrays #626

Merged
merged 13 commits into from
Sep 20, 2024
1 change: 1 addition & 0 deletions .github/workflows/integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ on:
pull_request:
paths:
- .github/workflows/integration.yaml
- ci/docker/integration.dockerfile
- docker-compose.yml
schedule:
- cron: '5 0 * * 0'
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ services:
volumes:
- ${NANOARROW_DOCKER_SOURCE_DIR}:/arrow-integration/nanoarrow
environment:
ARCHERY_INTEGRATION_TARGET_LANGUAGES: "nanoarrow"
ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS: "nanoarrow"
command:
["echo '::group::Build nanoarrow' &&
conda run --no-capture-output /arrow-integration/ci/scripts/nanoarrow_build.sh /arrow-integration /build &&
Expand Down
49 changes: 41 additions & 8 deletions src/nanoarrow/common/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -990,14 +990,19 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_BINARY:
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int32[0];
first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
first_offset);
return EINVAL;
}

last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
if (last_offset < 0) {
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
last_offset);
return EINVAL;
}

// If the data buffer size is unknown, assign it; otherwise, check it
if (array_view->buffer_views[2].size_bytes == -1) {
Expand All @@ -1021,14 +1026,19 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
case NANOARROW_TYPE_LARGE_STRING:
case NANOARROW_TYPE_LARGE_BINARY:
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int64[0];
first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
first_offset);
return EINVAL;
}

last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
if (last_offset < 0) {
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
last_offset);
return EINVAL;
}

// If the data buffer size is unknown, assign it; otherwise, check it
if (array_view->buffer_views[2].size_bytes == -1) {
Expand Down Expand Up @@ -1065,14 +1075,20 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
case NANOARROW_TYPE_LIST:
case NANOARROW_TYPE_MAP:
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int32[0];
first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
first_offset);
return EINVAL;
}

last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
if (last_offset < 0) {
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
last_offset);
return EINVAL;
}

if (array_view->children[0]->length < last_offset) {
ArrowErrorSet(error,
"Expected child of %s array to have length >= %" PRId64
Expand All @@ -1087,14 +1103,20 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,

case NANOARROW_TYPE_LARGE_LIST:
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int64[0];
first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
first_offset);
return EINVAL;
}

last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
if (last_offset < 0) {
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
last_offset);
return EINVAL;
}

if (array_view->children[0]->length < last_offset) {
ArrowErrorSet(error,
"Expected child of large list array to have length >= %" PRId64
Expand Down Expand Up @@ -1249,13 +1271,24 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
struct ArrowError* error) {
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
switch (array_view->layout.buffer_type[i]) {
// Only validate the portion of the buffer that is strictly required,
// which includes not validating the offset buffer of a zero-length array.
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
if (array_view->length == 0) {
continue;
}
if (array_view->layout.element_size_bits[i] == 32) {
NANOARROW_RETURN_NOT_OK(
ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
struct ArrowBufferView offset_minimal;
offset_minimal.data.as_int32 =
array_view->buffer_views[i].data.as_int32 + array_view->offset;
offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int32_t);
NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(offset_minimal, error));
} else {
NANOARROW_RETURN_NOT_OK(
ArrowAssertIncreasingInt64(array_view->buffer_views[i], error));
struct ArrowBufferView offset_minimal;
offset_minimal.data.as_int64 =
array_view->buffer_views[i].data.as_int64 + array_view->offset;
offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int64_t);
NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt64(offset_minimal, error));
}
break;
default:
Expand Down
Loading
Loading