From b63e601e9bcc7ef600fed8a80fc11632f023589f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 18:32:41 +0200 Subject: [PATCH 1/4] ENH: add basic DataFrame.from_arrow class method for importing through Arrow PyCapsule interface --- pandas/core/frame.py | 35 +++++++++++++++++++ pandas/tests/frame/test_arrow_interface.py | 40 ++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f47acf579d79c..0f57e5d50190e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1746,6 +1746,41 @@ def __rmatmul__(self, other) -> DataFrame: # ---------------------------------------------------------------------- # IO methods (to / from other formats) + @classmethod + def from_arrow(cls, data): + """ + Construct a DataFrame from a tabular Arrow object. + + This function accepts any tabular Arrow object implementing + the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__`` + or ``__arrow_c_stream__`` method). + + This function currently relies on ``pyarrow`` to convert the tabular + object in Arrow format to pandas. + + .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + + .. versionadded:: 3.0 + + Parameters + ---------- + data : pyarrow.Table or Arrow-compatible table + Any tabular object implementing the Arrow PyCapsule Protocol + (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` + method). 
+ + Returns + ------- + DataFrame + + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if not isinstance(data, pa.Table): + data = pa.table(data) + + df = data.to_pandas() + return df + @classmethod def from_dict( cls, diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index dc163268f64b9..7bef9d81ad5bb 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -7,6 +7,7 @@ import pandas.util._test_decorators as td import pandas as pd +import pandas._testing as tm pa = pytest.importorskip("pyarrow") @@ -47,3 +48,42 @@ def test_dataframe_to_arrow(): table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all() expected = expected.cast(schema) assert table.equals(expected) + + +class ArrowArrayWrapper: + def __init__(self, batch): + self.array = batch + + def __arrow_c_array__(self, requested_schema=None): + return self.array.__arrow_c_array__(requested_schema) + + +class ArrowStreamWrapper: + def __init__(self, table): + self.stream = table + + def __arrow_c_stream__(self, requested_schema=None): + return self.stream.__arrow_c_stream__(requested_schema) + + +@td.skip_if_no("pyarrow", min_version="14.0") +def test_dataframe_from_arrow(): + # objects with __arrow_c_stream__ + table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + result = pd.DataFrame.from_arrow(table) + expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + tm.assert_frame_equal(result, expected) + + # not only pyarrow objects are supported + result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table)) + tm.assert_frame_equal(result, expected) + + # objects with __arrow_c_array__ + batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"]) + + result = pd.DataFrame.from_arrow(table) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch)) + tm.assert_frame_equal(result, expected) From
6901e6d8155dd640e63d57f864bc0b553b0ecfb1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 18:39:09 +0200 Subject: [PATCH 2/4] add validation --- pandas/core/frame.py | 11 +++++++++++ pandas/tests/frame/test_arrow_interface.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f57e5d50190e..241de5cfbde76 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1776,6 +1776,17 @@ def from_arrow(cls, data): """ pa = import_optional_dependency("pyarrow", min_version="14.0.0") if not isinstance(data, pa.Table): + if not ( + hasattr(data, "__arrow_c_array__") + or hasattr(data, "__arrow_c_stream__") + ): + # explicitly test this, because otherwise we would accept various other + # input types through the pa.table(..) call + raise TypeError( + "Expected an Arrow-compatible tabular object (i.e. having an " + "'_arrow_c_array__' or '__arrow_c_stream__' method), got " + f"'{type(data).__name__}' instead." + ) data = pa.table(data) df = data.to_pandas() diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index 7bef9d81ad5bb..9fd2e055421a1 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -87,3 +87,7 @@ def test_dataframe_from_arrow(): result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch)) tm.assert_frame_equal(result, expected) + + # only accept actual Arrow objects + with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"): + pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]}) From 6af237c623176068091672ec45a9dc0a9ec2ab5c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 18:40:01 +0200 Subject: [PATCH 3/4] add return type --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 241de5cfbde76..5f9cc1e0286dd 100644 ---
a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1747,7 +1747,7 @@ def __rmatmul__(self, other) -> DataFrame: # IO methods (to / from other formats) @classmethod - def from_arrow(cls, data): + def from_arrow(cls, data) -> DataFrame: """ Construct a DataFrame from a tabular Arrow object. From fad6bb12b4d7cfbd9ac62f727085a06571280f7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Sep 2024 10:19:58 +0200 Subject: [PATCH 4/4] add type hints and protocol definitions --- pandas/_typing.py | 39 +++++++++++++++++++++++++++++++++++++++ pandas/core/frame.py | 6 +++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index c1769126a5776..96b4d2bba8291 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -528,3 +528,42 @@ def closed(self) -> bool: SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable]) SliceType = Optional[Hashable] + + +# Arrow PyCapsule Interface +# from https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints + + +class ArrowArrayExportable(Protocol): + """ + An object with an ``__arrow_c_array__`` method. + + This method indicates the object is an Arrow-compatible object implementing + the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Data Interface`_ in + Python), enabling zero-copy Arrow data interchange across libraries. + + .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html + + """ + + def __arrow_c_array__( + self, requested_schema: object | None = None + ) -> tuple[object, object]: ... + + +class ArrowStreamExportable(Protocol): + """ + An object with an ``__arrow_c_stream__`` method. 
+ + This method indicates the object is an Arrow-compatible object implementing + the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Data Interface`_ + for streams in Python), enabling zero-copy Arrow data interchange across + libraries. + + .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html + + """ + + def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ... diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f9cc1e0286dd..ea39b91aa007f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -205,6 +205,8 @@ AnyAll, AnyArrayLike, ArrayLike, + ArrowArrayExportable, + ArrowStreamExportable, Axes, Axis, AxisInt, @@ -1747,7 +1749,9 @@ def __rmatmul__(self, other) -> DataFrame: # IO methods (to / from other formats) @classmethod - def from_arrow(cls, data) -> DataFrame: + def from_arrow( + cls, data: ArrowArrayExportable | ArrowStreamExportable + ) -> DataFrame: """ Construct a DataFrame from a tabular Arrow object.