Skip to content

Commit

Permalink
feat: Add ArrowArrayView accessors to inspect buffer properties (#638)
Browse files Browse the repository at this point in the history
This PR adds accessors that abstract the buffer view, buffer type, buffer
data type, and element bit width of an `ArrowArrayView`. Before
string/binary view support was added, this was done by directly accessing the
`layout` and `buffer_views` members; however, supporting view types that way
required special-casing and some duplicated code for string/binary view arrays
in the R/Python bindings.

This PR also removes the dependence on the `ArrowArrayView::array`
member, since this member is optional (i.e., the data backing an
`ArrowArrayView` need not be related to an actual `ArrowArray`).
  • Loading branch information
paleolimbot authored Oct 1, 2024
1 parent 5b98b3d commit e52ff0d
Show file tree
Hide file tree
Showing 13 changed files with 381 additions and 167 deletions.
55 changes: 19 additions & 36 deletions python/src/nanoarrow/_array.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,13 @@ from nanoarrow_c cimport (
ArrowArrayViewComputeNullCount,
ArrowArrayViewInitFromSchema,
ArrowArrayViewIsNull,
ArrowArrayViewGetStringUnsafe,
ArrowArrayViewGetBytesUnsafe,
ArrowArrayViewGetBufferDataType,
ArrowArrayViewGetBufferElementSizeBits,
ArrowArrayViewGetBufferType,
ArrowArrayViewGetBufferView,
ArrowArrayViewGetNumBuffers,
ArrowArrayViewGetStringUnsafe,
ArrowArrayViewSetArray,
ArrowArrayViewSetArrayMinimal,
ArrowBitCountSet,
Expand All @@ -62,7 +67,8 @@ from nanoarrow_c cimport (
ArrowValidationLevel,
NANOARROW_BUFFER_TYPE_DATA,
NANOARROW_BUFFER_TYPE_DATA_OFFSET,
NANOARROW_BUFFER_TYPE_DATA_VIEW,
NANOARROW_BUFFER_TYPE_VARIADIC_DATA,
NANOARROW_BUFFER_TYPE_VARIADIC_SIZE,
NANOARROW_BUFFER_TYPE_TYPE_ID,
NANOARROW_BUFFER_TYPE_UNION_OFFSET,
NANOARROW_BUFFER_TYPE_VALIDITY,
Expand All @@ -84,7 +90,6 @@ from nanoarrow._device cimport Device, CSharedSyncEvent

from nanoarrow._buffer cimport CBuffer, CBufferView
from nanoarrow._schema cimport CSchema, CLayout
from nanoarrow cimport _types
from nanoarrow._utils cimport (
alloc_c_array,
alloc_c_device_array,
Expand Down Expand Up @@ -196,44 +201,20 @@ cdef class CArrayView:

@property
def n_buffers(self):
if _types.is_data_view(self._ptr.storage_type):
return 2 + self._ptr.n_variadic_buffers + 1

return self.layout.n_buffers
return ArrowArrayViewGetNumBuffers(self._ptr)

def _buffer_info(self, int64_t i):
if i < 0 or i >= self.n_buffers:
raise IndexError(f"{i} out of range [0, {self.n_buffers}]")

if (
_types.is_data_view(self._ptr.storage_type)
and i == (2 + self._ptr.n_variadic_buffers)
):
return (
NANOARROW_BUFFER_TYPE_DATA,
_types.INT64,
64,
<uintptr_t>self._ptr.array.buffers[i],
(self._ptr.n_variadic_buffers) * 8
)
elif (
_types.is_data_view(self._ptr.storage_type)
and i >= 2
):
return (
NANOARROW_BUFFER_TYPE_DATA,
_types.STRING if int(self._ptr.storage_type) == _types.STRING_VIEW else _types.BINARY,
0,
<uintptr_t>self._ptr.array.buffers[i],
(<int64_t*>self._ptr.array.buffers[2 + self._ptr.n_variadic_buffers])[i - 2]
)
cdef ArrowBufferView view = ArrowArrayViewGetBufferView(self._ptr, i)

return (
self._ptr.layout.buffer_type[i],
self._ptr.layout.buffer_data_type[i],
self._ptr.layout.element_size_bits[i],
<uintptr_t>self._ptr.buffer_views[i].data.data,
self._ptr.buffer_views[i].size_bytes
ArrowArrayViewGetBufferType(self._ptr, i),
ArrowArrayViewGetBufferDataType(self._ptr, i),
ArrowArrayViewGetBufferElementSizeBits(self._ptr, i),
<uintptr_t>view.data.data,
view.size_bytes
)

def buffer_type(self, int64_t i):
Expand All @@ -248,8 +229,10 @@ cdef class CArrayView:
return "data_offset"
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
return "data"
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_VIEW:
return "data_view"
elif buffer_type == NANOARROW_BUFFER_TYPE_VARIADIC_DATA:
return "variadic_data"
elif buffer_type == NANOARROW_BUFFER_TYPE_VARIADIC_SIZE:
return "variadic_size"
else:
return "none"

Expand Down
2 changes: 1 addition & 1 deletion python/tests/test_c_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def test_c_array_from_iterable_bytes():
na.c_array([buf_2d], na.binary())


def test_c_array_from_iterable__view():
def test_c_array_from_iterable_view():
string = na.c_array(
[b"abc", None, b"a string longer than 12 bytes"], na.binary_view()
)
Expand Down
36 changes: 4 additions & 32 deletions r/src/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -370,38 +370,10 @@ static SEXP borrow_buffer(struct ArrowArrayView* array_view, int64_t i, SEXP she
SEXP buffer_class = PROTECT(Rf_allocVector(STRSXP, 2));
SET_STRING_ELT(buffer_class, 1, Rf_mkChar("nanoarrow_buffer"));

struct ArrowBufferView view;
enum ArrowBufferType buffer_type;
enum ArrowType data_type;
int64_t element_size_bits;
if ((array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) &&
i >= NANOARROW_BINARY_VIEW_FIXED_BUFFERS) {
view.data.data = array_view->array->buffers[i];

if (i == (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) {
view.size_bytes = array_view->n_variadic_buffers * sizeof(int64_t);
buffer_type = NANOARROW_BUFFER_TYPE_DATA;
data_type = NANOARROW_TYPE_INT64;
element_size_bits = 64;
} else {
view.size_bytes =
array_view->variadic_buffer_sizes[i - NANOARROW_BINARY_VIEW_FIXED_BUFFERS];
buffer_type = NANOARROW_BUFFER_TYPE_DATA;

if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW) {
data_type = NANOARROW_TYPE_STRING;
} else {
data_type = NANOARROW_TYPE_BINARY;
}
element_size_bits = 0;
}
} else {
view = array_view->buffer_views[i];
buffer_type = array_view->layout.buffer_type[i];
data_type = array_view->layout.buffer_data_type[i];
element_size_bits = array_view->layout.element_size_bits[i];
}
struct ArrowBufferView view = ArrowArrayViewGetBufferView(array_view, i);
enum ArrowBufferType buffer_type = ArrowArrayViewGetBufferType(array_view, i);
enum ArrowType data_type = ArrowArrayViewGetBufferDataType(array_view, i);
int64_t element_size_bits = ArrowArrayViewGetBufferElementSizeBits(array_view, i);

SEXP buffer_xptr =
PROTECT(buffer_borrowed_xptr(view.data.data, view.size_bytes, shelter));
Expand Down
7 changes: 5 additions & 2 deletions r/src/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,11 @@ SEXP nanoarrow_c_buffer_info(SEXP buffer_xptr) {
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
buffer_type_string = "union_offset";
break;
case NANOARROW_BUFFER_TYPE_DATA_VIEW:
buffer_type_string = "data_view";
case NANOARROW_BUFFER_TYPE_VARIADIC_DATA:
buffer_type_string = "variadic_data";
break;
case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE:
buffer_type_string = "variadic_size";
break;
default:
buffer_type_string = "unknown";
Expand Down
10 changes: 5 additions & 5 deletions r/tests/testthat/_snaps/array.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
$ offset : int 0
$ buffers :List of 3
..$ :<nanoarrow_buffer validity<bool>[null] ``
..$ :<nanoarrow_buffer data_view<string_view>[26][416 b]>`
..$ :<nanoarrow_buffer data<int64>[null] ``
..$ :<nanoarrow_buffer data<string_view>[26][416 b]>`
..$ :<nanoarrow_buffer variadic_size<int64>[null] ``
$ dictionary: NULL
$ children : list()

Expand All @@ -25,9 +25,9 @@
$ offset : int 0
$ buffers :List of 4
..$ :<nanoarrow_buffer validity<bool>[null] ``
..$ :<nanoarrow_buffer data_view<string_view>[1][16 b]>`
..$ :<nanoarrow_buffer data<string>[35 b]> `this string is longer than 12 ...`
..$ :<nanoarrow_buffer data<int64>[1][8 b]> `35`
..$ :<nanoarrow_buffer data<string_view>[1][16 b]>`
..$ :<nanoarrow_buffer variadic_data<string>[35 b]> `this string is longer...`
..$ :<nanoarrow_buffer variadic_size<int64>[1][8 b]> `35`
$ dictionary: NULL
$ children : list()

8 changes: 6 additions & 2 deletions src/nanoarrow/common/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -696,11 +696,12 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length)
_ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) /
8;
continue;
case NANOARROW_BUFFER_TYPE_DATA_VIEW:
case NANOARROW_BUFFER_TYPE_TYPE_ID:
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
array_view->buffer_views[i].size_bytes = element_size_bytes * length;
continue;
case NANOARROW_BUFFER_TYPE_VARIADIC_DATA:
case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE:
case NANOARROW_BUFFER_TYPE_NONE:
array_view->buffer_views[i].size_bytes = 0;
continue;
Expand Down Expand Up @@ -734,6 +735,7 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
array_view->length = array->length;
array_view->null_count = array->null_count;
array_view->variadic_buffer_sizes = NULL;
array_view->variadic_buffers = NULL;
array_view->n_variadic_buffers = 0;

int64_t buffers_required = 0;
Expand Down Expand Up @@ -767,6 +769,7 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1);
array_view->n_variadic_buffers = nvariadic_buf;
buffers_required += nvariadic_buf + 1;
array_view->variadic_buffers = array->buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1];
}

Expand Down Expand Up @@ -863,9 +866,10 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
break;
case NANOARROW_BUFFER_TYPE_TYPE_ID:
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
case NANOARROW_BUFFER_TYPE_DATA_VIEW:
min_buffer_size_bytes = element_size_bytes * offset_plus_length;
break;
case NANOARROW_BUFFER_TYPE_VARIADIC_DATA:
case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE:
case NANOARROW_BUFFER_TYPE_NONE:
continue;
}
Expand Down
Loading

0 comments on commit e52ff0d

Please sign in to comment.