From de36c6c54992c4052d5e55d499287a35794b063a Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 29 Oct 2023 16:03:05 +1300 Subject: [PATCH 01/12] Ensure that pyarrow backed pandas.Series can be read Install pyarrow as an optional dependency, and check that pandas.Series objects backed by pyarrow dtypes (e.g. 'uint8[pyarrow]') can be read by virtualfile_from_vectors. --- .github/workflows/ci_tests.yaml | 2 +- pygmt/tests/test_clib.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci_tests.yaml b/.github/workflows/ci_tests.yaml index 3f6ebbd92cd..2b7452cff7e 100644 --- a/.github/workflows/ci_tests.yaml +++ b/.github/workflows/ci_tests.yaml @@ -71,7 +71,7 @@ jobs: optional-packages: '' - python-version: '3.11' numpy-version: '1.26' - optional-packages: ' contextily geopandas ipython rioxarray sphinx-gallery' + optional-packages: ' contextily geopandas ipython pyarrow rioxarray sphinx-gallery' timeout-minutes: 30 defaults: diff --git a/pygmt/tests/test_clib.py b/pygmt/tests/test_clib.py index 9f2c73857cd..83e85935b92 100644 --- a/pygmt/tests/test_clib.py +++ b/pygmt/tests/test_clib.py @@ -639,16 +639,24 @@ def test_virtualfile_from_matrix_slice(dtypes): def test_virtualfile_from_vectors_pandas(dtypes): """ - Pass vectors to a dataset using pandas Series. + Pass vectors to a dataset using pandas Series, checking both numpy and + pyarrow dtypes. 
""" size = 13 + try: + pd.ArrowDtype(pyarrow_dtype="bool") # check is pyarrow is installed + + dtypes.extend([f"{dtype}[pyarrow]" for dtype in dtypes]) + except ImportError: + pass for dtype in dtypes: data = pd.DataFrame( data={ - "x": np.arange(size, dtype=dtype), - "y": np.arange(size, size * 2, 1, dtype=dtype), - "z": np.arange(size * 2, size * 3, 1, dtype=dtype), - } + "x": np.arange(size), + "y": np.arange(size, size * 2, 1), + "z": np.arange(size * 2, size * 3, 1), + }, + dtype=dtype, ) with clib.Session() as lib: with lib.virtualfile_from_vectors(data.x, data.y, data.z) as vfile: From 06739efa1d069ae6f9319b133357fd5896750d45 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:11:15 +1300 Subject: [PATCH 02/12] Fix check on whether pyarrow is installed Remove duck-typing that didn't work, and use a proper try-except `import pyarrow`. --- pygmt/tests/test_clib_virtualfiles.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pygmt/tests/test_clib_virtualfiles.py b/pygmt/tests/test_clib_virtualfiles.py index c77c7c557e1..ca978f01c18 100644 --- a/pygmt/tests/test_clib_virtualfiles.py +++ b/pygmt/tests/test_clib_virtualfiles.py @@ -13,6 +13,11 @@ from pygmt.helpers import GMTTempFile from pygmt.tests.test_clib import mock +try: + import pyarrow as pa +except ImportError: + pa = None + TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt") @@ -325,12 +330,9 @@ def test_virtualfile_from_vectors_pandas(dtypes): pyarrow dtypes. 
""" size = 13 - try: - pd.ArrowDtype(pyarrow_dtype="bool") # check is pyarrow is installed - + if pa is not None: dtypes.extend([f"{dtype}[pyarrow]" for dtype in dtypes]) - except ImportError: - pass + for dtype in dtypes: data = pd.DataFrame( data={ From 83ed44771ecec6087965259f0c85a42509202ad4 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Dec 2023 17:43:33 +1300 Subject: [PATCH 03/12] Ensure that pygmt.info can work with pyarrow int64/float64 dtypes Check that pandas.Series and pandas.DataFrame objects backed by pyarrow dtypes (e.g. 'int64[pyarrow]' and 'float64[pyarrow]') can be read by pygmt.info. Using pandas.util._test_decorators_skip_if_no pytest mark to simplify the pytest parametrize. --- pygmt/tests/test_info.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pygmt/tests/test_info.py b/pygmt/tests/test_info.py index 999965417a3..25f8cb142f1 100644 --- a/pygmt/tests/test_info.py +++ b/pygmt/tests/test_info.py @@ -8,6 +8,7 @@ import numpy as np import numpy.testing as npt import pandas as pd +import pandas.util._test_decorators as td import pytest import xarray as xr from pygmt import info @@ -74,16 +75,30 @@ def test_info_2d_list(): assert output == expected_output -def test_info_series(): +@pytest.mark.parametrize( + "dtype", + [ + "int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no(package="pyarrow")), + ], +) +def test_info_series(dtype): """ Make sure info works on a pandas.Series input. """ - output = info(pd.Series(data=[0, 4, 2, 8, 6])) + output = info(pd.Series(data=[0, 4, 2, 8, 6], dtype=dtype)) expected_output = ": N = 5 <0/8>\n" assert output == expected_output -def test_info_dataframe(): +@pytest.mark.parametrize( + "dtype", + [ + "float64", + pytest.param("float64[pyarrow]", marks=td.skip_if_no(package="pyarrow")), + ], +) +def test_info_dataframe(dtype): """ Make sure info works on pandas.DataFrame inputs. 
""" From ad99cabb2fc85e379f6c656e129823066492e28f Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Dec 2023 18:16:19 +1300 Subject: [PATCH 04/12] Add xfail test for test_geopandas_plot_int_dtypes casting to pyarrow int Geopandas doesn't support casting to pyarrow dtypes like 'int32[pyarrow]' and 'int64[pyarrow]' yet, but adding an xfail test so that we don't forget to test in the future. --- pygmt/tests/test_geopandas.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pygmt/tests/test_geopandas.py b/pygmt/tests/test_geopandas.py index adc7dd7f99c..e6aeb3bcc62 100644 --- a/pygmt/tests/test_geopandas.py +++ b/pygmt/tests/test_geopandas.py @@ -3,6 +3,7 @@ """ import numpy.testing as npt import pandas as pd +import pandas.util._test_decorators as td import pytest from pygmt import Figure, info, makecpt, which @@ -141,6 +142,24 @@ def test_geopandas_plot3d_non_default_circle(): "int64", pd.Int32Dtype(), pd.Int64Dtype(), + pytest.param( + "int32[pyarrow]", + marks=[ + td.skip_if_no(package="pyarrow"), + pytest.mark.xfail( + reason="geopandas doesn't support casting to pyarrow dtypes yet." + ), + ], + ), + pytest.param( + "int64[pyarrow]", + marks=[ + td.skip_if_no(package="pyarrow"), + pytest.mark.xfail( + reason="geopandas doesn't support casting to pyarrow dtypes yet." + ), + ], + ), ], ) @pytest.mark.mpl_image_compare(filename="test_geopandas_plot_int_dtypes.png") From a5284fc011b70fb60b0702d37cc3e2ff71fe4f15 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Dec 2023 18:32:11 +1300 Subject: [PATCH 05/12] Clarify reason for test_geopandas_plot_int_dtypes xfail Actually, casting to pyarrow integer dtypes work, but writing to the temporary OGR_GMT file doesn't. 
--- pygmt/tests/test_geopandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygmt/tests/test_geopandas.py b/pygmt/tests/test_geopandas.py index e6aeb3bcc62..6d56de9a2fd 100644 --- a/pygmt/tests/test_geopandas.py +++ b/pygmt/tests/test_geopandas.py @@ -147,7 +147,7 @@ def test_geopandas_plot3d_non_default_circle(): marks=[ td.skip_if_no(package="pyarrow"), pytest.mark.xfail( - reason="geopandas doesn't support casting to pyarrow dtypes yet." + reason="geopandas doesn't support writing columns with pyarrow dtypes to OGR_GMT yet." ), ], ), @@ -156,7 +156,7 @@ def test_geopandas_plot3d_non_default_circle(): marks=[ td.skip_if_no(package="pyarrow"), pytest.mark.xfail( - reason="geopandas doesn't support casting to pyarrow dtypes yet." + reason="geopandas doesn't support writing columns with pyarrow dtypes to OGR_GMT yet." ), ], ), From 8b28ba46a29f06290ae7e55e569e2b9eb0f6c4b5 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Dec 2023 18:42:41 +1300 Subject: [PATCH 06/12] Add optional pyarrow dependency to ci_tests_dev and ci_tests_legacy Ensure that previous and future versions of GMT are compatible with PyArrow too. 
--- .github/workflows/ci_tests_dev.yaml | 2 +- .github/workflows/ci_tests_legacy.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_tests_dev.yaml b/.github/workflows/ci_tests_dev.yaml index 20f382799d3..60178dcc375 100644 --- a/.github/workflows/ci_tests_dev.yaml +++ b/.github/workflows/ci_tests_dev.yaml @@ -153,7 +153,7 @@ jobs: python -m pip install --pre --prefer-binary \ --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ numpy pandas xarray netCDF4 packaging \ - build contextily dvc geopandas ipython rioxarray \ + build contextily dvc geopandas ipython pyarrow rioxarray \ 'pytest>=6.0' pytest-cov pytest-doctestplus pytest-mpl \ sphinx-gallery diff --git a/.github/workflows/ci_tests_legacy.yaml b/.github/workflows/ci_tests_legacy.yaml index 70d268cb083..9f579832e39 100644 --- a/.github/workflows/ci_tests_legacy.yaml +++ b/.github/workflows/ci_tests_legacy.yaml @@ -72,6 +72,7 @@ jobs: contextily geopandas ipython + pyarrow rioxarray sphinx-gallery build From 95d2b8a3d59931dc9d29a9789ef4efc4c7050c5c Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Dec 2023 19:04:23 +1300 Subject: [PATCH 07/12] Add note about support of PyArrow dtypes to doc/install.rst Mention that PyGMT does have some initial support of Pandas objects backed by PyArrow-dtype arrays, but only uint/int/float dtypes for now. --- doc/install.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/install.rst b/doc/install.rst index f55eb9ac4e1..f488a9deb90 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -107,6 +107,15 @@ The following are optional dependencies: * `GeoPandas `__: For using and plotting GeoDataFrame objects. * `RioXarray `__: For saving multi-band rasters to GeoTIFFs. +.. note:: + + If you have `PyArrow `__ + installed, PyGMT does have some initial support for ``pandas.Series`` and + ``pandas.DataFrame`` objects with Apache Arrow-backed arrays. 
Specifically, + only uint/int/float dtypes are supported for now. Support for datetime and + string Arrow dtypes are still works in progress. For more details, see + https://github.com/GenericMappingTools/pygmt/issues/2800. + Installing GMT and other dependencies ------------------------------------- From 5adcba6e1c6f16304a0460e2c66af64a293c0dbf Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Mon, 11 Dec 2023 11:39:19 +1300 Subject: [PATCH 08/12] Replace pandas.util._test_decorators with inline pytest.mark.skipif mark Implementing the td.skip_if_no pytest mark inline instead of using the private function from pandas. --- pygmt/tests/test_info.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pygmt/tests/test_info.py b/pygmt/tests/test_info.py index 25f8cb142f1..318a7051325 100644 --- a/pygmt/tests/test_info.py +++ b/pygmt/tests/test_info.py @@ -8,12 +8,18 @@ import numpy as np import numpy.testing as npt import pandas as pd -import pandas.util._test_decorators as td import pytest import xarray as xr from pygmt import info from pygmt.exceptions import GMTInvalidInput +try: + import pyarrow # noqa: F401 + + HAS_PYARROW = True +except ImportError: + HAS_PYARROW = False + TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt") @@ -79,7 +85,12 @@ def test_info_2d_list(): "dtype", [ "int64", - pytest.param("int64[pyarrow]", marks=td.skip_if_no(package="pyarrow")), + pytest.param( + "int64[pyarrow]", + marks=pytest.mark.skipif( + condition=not HAS_PYARROW, reason="Could not import 'pyarrow'" + ), + ), ], ) def test_info_series(dtype): @@ -95,7 +106,12 @@ def test_info_series(dtype): "dtype", [ "float64", - pytest.param("float64[pyarrow]", marks=td.skip_if_no(package="pyarrow")), + pytest.param( + "float64[pyarrow]", + marks=pytest.mark.skipif( + condition=not HAS_PYARROW, reason="Could not import 'pyarrow'" + ), + ), ], ) def 
test_info_dataframe(dtype): From 6d50a23d3e9caf6d0bfd4f929b6fef9b9dcd32cf Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sat, 16 Dec 2023 14:51:06 +1300 Subject: [PATCH 09/12] Apply suggestions from code review Co-Authored-By: Dongdong Tian --- doc/install.rst | 2 +- pygmt/tests/test_clib_virtualfiles.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/install.rst b/doc/install.rst index c1695a8311b..00d3989c054 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -114,7 +114,7 @@ The following are optional dependencies: ``pandas.DataFrame`` objects with Apache Arrow-backed arrays. Specifically, only uint/int/float dtypes are supported for now. Support for datetime and string Arrow dtypes are still works in progress. For more details, see - https://github.com/GenericMappingTools/pygmt/issues/2800. + `issue #2800 `__. Installing GMT and other dependencies ------------------------------------- diff --git a/pygmt/tests/test_clib_virtualfiles.py b/pygmt/tests/test_clib_virtualfiles.py index ca978f01c18..8140585df0e 100644 --- a/pygmt/tests/test_clib_virtualfiles.py +++ b/pygmt/tests/test_clib_virtualfiles.py @@ -326,7 +326,7 @@ def test_virtualfile_from_matrix_slice(dtypes): def test_virtualfile_from_vectors_pandas(dtypes): """ - Pass vectors to a dataset using pandas Series, checking both numpy and + Pass vectors to a dataset using pandas.Series, checking both numpy and pyarrow dtypes. 
""" size = 13 From 62d22cbbd52040df501e66b67ca21264a34b1d6a Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sat, 16 Dec 2023 16:56:04 +1300 Subject: [PATCH 10/12] Refactor to use pygmt.helpers.testing.skip_if_no --- pygmt/tests/test_geopandas.py | 6 +++--- pygmt/tests/test_info.py | 25 +++---------------------- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/pygmt/tests/test_geopandas.py b/pygmt/tests/test_geopandas.py index bed52cbea00..74d13cbd7fc 100644 --- a/pygmt/tests/test_geopandas.py +++ b/pygmt/tests/test_geopandas.py @@ -3,9 +3,9 @@ """ import numpy.testing as npt import pandas as pd -import pandas.util._test_decorators as td import pytest from pygmt import Figure, info, makecpt, which +from pygmt.helpers.testing import skip_if_no gpd = pytest.importorskip("geopandas") shapely = pytest.importorskip("shapely") @@ -165,7 +165,7 @@ def test_geopandas_plot3d_non_default_circle(): pytest.param( "int32[pyarrow]", marks=[ - td.skip_if_no(package="pyarrow"), + skip_if_no(package="pyarrow"), pytest.mark.xfail( reason="geopandas doesn't support writing columns with pyarrow dtypes to OGR_GMT yet." ), @@ -174,7 +174,7 @@ def test_geopandas_plot3d_non_default_circle(): pytest.param( "int64[pyarrow]", marks=[ - td.skip_if_no(package="pyarrow"), + skip_if_no(package="pyarrow"), pytest.mark.xfail( reason="geopandas doesn't support writing columns with pyarrow dtypes to OGR_GMT yet." 
), diff --git a/pygmt/tests/test_info.py b/pygmt/tests/test_info.py index 318a7051325..b1c247e5bd3 100644 --- a/pygmt/tests/test_info.py +++ b/pygmt/tests/test_info.py @@ -12,13 +12,7 @@ import xarray as xr from pygmt import info from pygmt.exceptions import GMTInvalidInput - -try: - import pyarrow # noqa: F401 - - HAS_PYARROW = True -except ImportError: - HAS_PYARROW = False +from pygmt.helpers.testing import skip_if_no TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt") @@ -83,15 +77,7 @@ def test_info_2d_list(): @pytest.mark.parametrize( "dtype", - [ - "int64", - pytest.param( - "int64[pyarrow]", - marks=pytest.mark.skipif( - condition=not HAS_PYARROW, reason="Could not import 'pyarrow'" - ), - ), - ], + ["int64", pytest.param("int64[pyarrow]", marks=skip_if_no(package="pyarrow"))], ) def test_info_series(dtype): """ @@ -106,12 +92,7 @@ def test_info_series(dtype): "dtype", [ "float64", - pytest.param( - "float64[pyarrow]", - marks=pytest.mark.skipif( - condition=not HAS_PYARROW, reason="Could not import 'pyarrow'" - ), - ), + pytest.param("float64[pyarrow]", marks=skip_if_no(package="pyarrow")), ], ) def test_info_dataframe(dtype): From a27582544c07b8d653cde206645a34c1d1f545e1 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sat, 16 Dec 2023 16:59:08 +1300 Subject: [PATCH 11/12] Use importlib.util.find_spec instead of try-except block Cleaner way to check if pyarrow is installed or not. --- pygmt/tests/test_clib_virtualfiles.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pygmt/tests/test_clib_virtualfiles.py b/pygmt/tests/test_clib_virtualfiles.py index 8140585df0e..133f154d081 100644 --- a/pygmt/tests/test_clib_virtualfiles.py +++ b/pygmt/tests/test_clib_virtualfiles.py @@ -2,6 +2,7 @@ Test the C API functions related to virtual files. 
""" import os +from importlib.util import find_spec from itertools import product import numpy as np @@ -13,11 +14,6 @@ from pygmt.helpers import GMTTempFile from pygmt.tests.test_clib import mock -try: - import pyarrow as pa -except ImportError: - pa = None - TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt") @@ -330,7 +326,7 @@ def test_virtualfile_from_vectors_pandas(dtypes): pyarrow dtypes. """ size = 13 - if pa is not None: + if find_spec("pyarrow") is not None: dtypes.extend([f"{dtype}[pyarrow]" for dtype in dtypes]) for dtype in dtypes: From 71946f81be081c6be9d3fb7cf9327202f8b5eef4 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sat, 16 Dec 2023 19:04:47 +1300 Subject: [PATCH 12/12] Update doc/install.rst Co-authored-by: Dongdong Tian --- doc/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/install.rst b/doc/install.rst index 00d3989c054..f3594e52521 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -113,7 +113,7 @@ The following are optional dependencies: installed, PyGMT does have some initial support for ``pandas.Series`` and ``pandas.DataFrame`` objects with Apache Arrow-backed arrays. Specifically, only uint/int/float dtypes are supported for now. Support for datetime and - string Arrow dtypes are still works in progress. For more details, see + string Arrow dtypes are still working in progress. For more details, see `issue #2800 `__. Installing GMT and other dependencies