Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor the virtualfile_in function to accept more 1-D arrays #2744

Draft
wants to merge 30 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
66c4b97
Refactor the data_kind and the virtualfile_to_data functions
seisman Oct 13, 2023
78c28cd
Update more functions
seisman Oct 14, 2023
f849e5a
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 15, 2023
f37413b
Change ncols to names
seisman Oct 15, 2023
3de7666
Fix more tests
seisman Oct 15, 2023
93b91d0
Fix project
seisman Oct 15, 2023
2eecf48
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 16, 2023
1d6e568
Fix more tests
seisman Oct 16, 2023
6f9fc19
Fixes
seisman Oct 16, 2023
68034ed
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 17, 2023
0db21bc
Fix triangulate
seisman Oct 17, 2023
7cf5290
Fix text
seisman Oct 17, 2023
b0b6d2a
Fix more failing tests
seisman Oct 17, 2023
fa875ef
More fixes
seisman Oct 17, 2023
2ee0df2
Fix linting issues
seisman Oct 17, 2023
d5c8340
Fix linting issues
seisman Oct 17, 2023
30bacb1
Fix linting issues
seisman Oct 18, 2023
4465f9b
Merge branch 'main' into refactor/virtualfile-to-data
seisman Oct 20, 2023
593f252
Update pygmt/clib/session.py
seisman Oct 20, 2023
409337f
Apply suggestions from code review
seisman Oct 25, 2023
872fd59
Merge branch 'main' into refactor/virtualfile-to-data
seisman Dec 25, 2023
3ed0eb2
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jan 16, 2024
efa7a11
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jan 18, 2024
23fc3ea
Merge branch 'main' into refactor/virtualfile-to-data
seisman Mar 1, 2024
aa05333
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jul 11, 2024
5c10fc4
Fix plot and plot3d
seisman Jul 11, 2024
525a353
Fix errors in merging the main branch
seisman Jul 11, 2024
2f3fcc4
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jul 20, 2024
b55a9ad
Fix merging issue
seisman Jul 20, 2024
46be0fa
Merge branch 'main' into refactor/virtualfile-to-data
seisman Jul 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 16 additions & 19 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
fmt_docstring,
tempfile_from_geojson,
tempfile_from_image,
validate_data_input,
)

FAMILIES = [
Expand Down Expand Up @@ -1474,11 +1475,8 @@ def virtualfile_from_data(
self,
check_kind=None,
data=None,
x=None,
y=None,
z=None,
extra_arrays=None,
required_z=False,
vectors=None,
ncols=2,
required_data=True,
):
"""
Expand All @@ -1497,13 +1495,11 @@ def virtualfile_from_data(
Any raster or vector data format. This could be a file name or
path, a raster grid, a vector matrix/arrays, or other supported
data input.
x/y/z : 1-D arrays or None
x, y, and z columns as numpy arrays.
extra_arrays : list of 1-D arrays
Optional. A list of numpy arrays in addition to x, y, and z.
All of these arrays must be of the same size as the x/y/z arrays.
required_z : bool
State whether the 'z' column is required.
vectors : list of 1-D arrays or None
A list of 1-D arrays. Each array will be a column in the table.
All of these arrays must be of the same size.
ncols : int
The minimum number of columns required for the data.
required_data : bool
Set to True when 'data' is required, or False when dealing with
optional virtual files. [Default is True].
Expand Down Expand Up @@ -1537,8 +1533,13 @@ def virtualfile_from_data(
...
<vector memory>: N = 3 <7/9> <4/6> <1/3>
"""
kind = data_kind(
data, x, y, z, required_z=required_z, required_data=required_data
kind = data_kind(data, required=required_data)
validate_data_input(
data=data,
vectors=vectors,
ncols=ncols,
required_data=required_data,
kind=kind,
)

if check_kind:
Expand Down Expand Up @@ -1579,11 +1580,7 @@ def virtualfile_from_data(
warnings.warn(message=msg, category=RuntimeWarning, stacklevel=2)
_data = (data,) if not isinstance(data, pathlib.PurePath) else (str(data),)
elif kind == "vectors":
_data = [np.atleast_1d(x), np.atleast_1d(y)]
if z is not None:
_data.append(np.atleast_1d(z))
if extra_arrays:
_data.extend(extra_arrays)
_data = [np.atleast_1d(v) for v in vectors]
elif kind == "matrix": # turn 2-D arrays into list of vectors
try:
# pandas.Series will be handled below like a 1-D numpy.ndarray
Expand Down
1 change: 1 addition & 0 deletions pygmt/helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@
is_nonstr_iter,
launch_external_viewer,
non_ascii_to_octal,
validate_data_input,
)
166 changes: 81 additions & 85 deletions pygmt/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,127 +15,133 @@
from pygmt.exceptions import GMTInvalidInput


def _validate_data_input(
data=None, x=None, y=None, z=None, required_z=False, required_data=True, kind=None
def validate_data_input(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's more useful to pass the list of column names instead, i.e., replacing ncols=2 with names=["x", "y"].

So, for most modules, vectors=[x, y] and names=["x", "y"] or vectors=[x, y, z] and names=["x", "y", "z"].

For more complicated modules like plot or plot3d, the names can be
names=["x", "y", "direction_arg1", "direction_arg2", "fill", "size", "symbol", "transparency"].

The column names will be very useful when the GMTInvalidInput exception is raised.
For example, instead of "Column 5 can't be None.", we can say "Column 5 ('size') can't be None." And instead of "data must have at least 8 columns.", we can say:

data must have at least 8 columns:
x y direction_arg1 direction_arg2 fill size symbol transparency

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in f37413b

seisman marked this conversation as resolved.
Show resolved Hide resolved
data=None, vectors=None, ncols=2, required_data=True, kind=None
):
"""
Check if the combination of data/x/y/z is valid.
Check if the data input is valid.

Examples
--------
>>> _validate_data_input(data="infile")
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6])
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], z=[7, 8, 9])
>>> _validate_data_input(data=None, required_data=False)
>>> _validate_data_input()
>>> validate_data_input(data="infile")
>>> validate_data_input(vectors=[[1, 2, 3], [4, 5, 6]], ncols=2)
>>> validate_data_input(vectors=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], ncols=3)
>>> validate_data_input(data=None, required_data=False)
>>> validate_data_input()
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: No input data provided.
>>> _validate_data_input(x=[1, 2, 3])
>>> validate_data_input(vectors=[[1, 2, 3], None], ncols=2)
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(y=[4, 5, 6])
pygmt.exceptions.GMTInvalidInput: The 'y' column can't be None.
>>> validate_data_input(vectors=[None, [4, 5, 6]], ncols=2)
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], required_z=True)
pygmt.exceptions.GMTInvalidInput: The 'x' column can't be None.
>>> validate_data_input(vectors=[[1, 2, 3], [4, 5, 6], None], ncols=3)
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide x, y, and z.
pygmt.exceptions.GMTInvalidInput: The 'z' column can't be None.
>>> import numpy as np
>>> import pandas as pd
>>> import xarray as xr
>>> data = np.arange(8).reshape((4, 2))
>>> _validate_data_input(data=data, required_z=True, kind="matrix")
>>> validate_data_input(data=data, ncols=3, kind="matrix")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(
pygmt.exceptions.GMTInvalidInput: data must have at least 3 columns.
>>> validate_data_input(
... data=pd.DataFrame(data, columns=["x", "y"]),
... required_z=True,
... ncols=3,
... kind="matrix",
... )
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(
pygmt.exceptions.GMTInvalidInput: data must have at least 3 columns.
>>> validate_data_input(
... data=xr.Dataset(pd.DataFrame(data, columns=["x", "y"])),
... required_z=True,
... ncols=3,
... kind="matrix",
... )
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(data="infile", x=[1, 2, 3])
pygmt.exceptions.GMTInvalidInput: data must have at least 3 columns.
>>> validate_data_input(data="infile", vectors=[[1, 2, 3], None])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", y=[4, 5, 6])
pygmt.exceptions.GMTInvalidInput: Too much data. Pass in either 'data' or 1-D arrays. # noqa: W505
>>> validate_data_input(data="infile", vectors=[None, [4, 5, 6]])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", z=[7, 8, 9])
pygmt.exceptions.GMTInvalidInput: Too much data. Pass in either 'data' or 1-D arrays. # noqa: W505
>>> validate_data_input(data="infile", vectors=[None, None, [7, 8, 9]])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
pygmt.exceptions.GMTInvalidInput: Too much data. Pass in either 'data' or 1-D arrays. # noqa: W505

Raises
------
GMTInvalidInput
If the data input is not valid.
"""
if data is None: # data is None
if x is None and y is None: # both x and y are None
if required_data: # data is not optional
raise GMTInvalidInput("No input data provided.")
elif x is None or y is None: # either x or y is None
raise GMTInvalidInput("Must provide both x and y.")
if required_z and z is None: # both x and y are not None, now check z
raise GMTInvalidInput("Must provide x, y, and z.")
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
# For 'matrix' kind, check if data has the required z column
if kind == "matrix" and required_z:
if kind is None:
kind = data_kind(data=data, required=required_data)

if kind == "vectors": # From data_kind, we know that data is None
if vectors is None:
raise GMTInvalidInput("No input data provided.")
if len(vectors) < ncols:
raise GMTInvalidInput(
f"Requires {ncols} 1-D arrays but got {len(vectors)}."
)
for i, v in enumerate(vectors[:ncols]):
if v is None:
if i < 3:
msg = f"The '{'xyz'[i]}' column can't be None."
else:
msg = "Column {i} can't be None."
raise GMTInvalidInput(msg)
else:
if vectors is not None and any(v is not None for v in vectors):
raise GMTInvalidInput("Too much data. Pass in either 'data' or 1-D arrays.")
if kind == "matrix": # check number of columns for matrix-like data
if hasattr(data, "shape"): # np.ndarray or pd.DataFrame
if len(data.shape) == 1 and data.shape[0] < 3:
raise GMTInvalidInput("data must provide x, y, and z columns.")
if len(data.shape) > 1 and data.shape[1] < 3:
raise GMTInvalidInput("data must provide x, y, and z columns.")
if hasattr(data, "data_vars") and len(data.data_vars) < 3: # xr.Dataset
raise GMTInvalidInput("data must provide x, y, and z columns.")
if len(data.shape) == 1 and data.shape[0] < ncols:
raise GMTInvalidInput(f"data must have at least {ncols} columns.")
if len(data.shape) > 1 and data.shape[1] < ncols:
raise GMTInvalidInput(f"data must have at least {ncols} columns.")
if hasattr(data, "data_vars") and len(data.data_vars) < ncols: # xr.Dataset
raise GMTInvalidInput(f"data must have at least {ncols} columns.")


def data_kind(data=None, x=None, y=None, z=None, required_z=False, required_data=True):
def data_kind(data=None, required=True):
"""
Check what kind of data is provided to a module.
Determine the kind of data that will be passed to a module.

Possible types:
It checks the type of the ``data`` argument and determines the kind of
data. Falls back to ``"vectors"`` if ``data`` is None but required.

* a file name provided as 'data'
* a pathlib.PurePath object provided as 'data'
* an xarray.DataArray object provided as 'data'
* a 2-D matrix provided as 'data'
* 1-D arrays x and y (and z, optionally)
* an optional argument (None, bool, int or float) provided as 'data'
Possible data kinds:

Arguments should be ``None`` if not used. If doesn't fit any of these
categories (or fits more than one), will raise an exception.
- ``'file'``: a file name or a pathlib.PurePath object provided as 'data'
seisman marked this conversation as resolved.
Show resolved Hide resolved
- ``'arg'``: an optional argument (None, bool, int or float) provided
as 'data'
- ``'grid'``: an xarray.DataArray with 2 dimensions provided as 'data'
- ``'image'``: an xarray.DataArray with 3 dimensions provided as 'data'
- ``'geojson'``: a geo-like Python object that implements
``__geo_interface__`` (geopandas.GeoDataFrame or shapely.geometry)
provided as 'data'
- ``'matrix'``: a 2-D array provided as 'data'
- ``'vectors'``: a list of 1-D arrays provided as 'vectors'

Parameters
----------
data : str, pathlib.PurePath, None, bool, xarray.DataArray or {table-like}
Pass in either a file name or :class:`pathlib.Path` to an ASCII data
table, an :class:`xarray.DataArray`, a 1-D/2-D
{table-classes} or an option argument.
x/y : 1-D arrays or None
x and y columns as numpy arrays.
z : 1-D array or None
z column as numpy array. To be used optionally when x and y are given.
required_z : bool
State whether the 'z' column is required.
required_data : bool
required : bool
Set to True when 'data' is required, or False when dealing with
optional virtual files. [Default is True].

Expand All @@ -151,49 +157,39 @@ def data_kind(data=None, x=None, y=None, z=None, required_z=False, required_data
>>> import numpy as np
>>> import xarray as xr
>>> import pathlib
>>> data_kind(data=None, x=np.array([1, 2, 3]), y=np.array([4, 5, 6]))
>>> data_kind(data=None)
'vectors'
>>> data_kind(data=np.arange(10).reshape((5, 2)), x=None, y=None)
>>> data_kind(data=np.arange(10).reshape((5, 2)))
'matrix'
>>> data_kind(data="my-data-file.txt", x=None, y=None)
>>> data_kind(data="my-data-file.txt")
'file'
>>> data_kind(data=pathlib.Path("my-data-file.txt"), x=None, y=None)
>>> data_kind(data=pathlib.Path("my-data-file.txt"))
'file'
>>> data_kind(data=None, x=None, y=None, required_data=False)
>>> data_kind(data=None, required=False)
'arg'
>>> data_kind(data=2.0, x=None, y=None, required_data=False)
>>> data_kind(data=2.0, required=False)
'arg'
>>> data_kind(data=True, x=None, y=None, required_data=False)
>>> data_kind(data=True, required=False)
'arg'
>>> data_kind(data=xr.DataArray(np.random.rand(4, 3)))
'grid'
>>> data_kind(data=xr.DataArray(np.random.rand(3, 4, 5)))
'image'
"""
# determine the data kind
if isinstance(data, (str, pathlib.PurePath)):
kind = "file"
elif isinstance(data, (bool, int, float)) or (data is None and not required_data):
elif isinstance(data, (bool, int, float)) or (data is None and not required):
kind = "arg"
elif isinstance(data, xr.DataArray):
kind = "image" if len(data.dims) == 3 else "grid"
elif hasattr(data, "__geo_interface__"):
# geo-like Python object that implements ``__geo_interface__``
# (geopandas.GeoDataFrame or shapely.geometry)
kind = "geojson"
elif data is not None:
elif data is not None: # anything but None is taken as a matrix
kind = "matrix"
else:
else: # fallback to vectors if data is None but required
kind = "vectors"
_validate_data_input(
data=data,
x=x,
y=y,
z=z,
required_z=required_z,
required_data=required_data,
kind=kind,
)
return kind


Expand Down
2 changes: 1 addition & 1 deletion pygmt/src/blockm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _blockm(block_method, data, x, y, z, outfile, **kwargs):
with GMTTempFile(suffix=".csv") as tmpfile:
with Session() as lib:
table_context = lib.virtualfile_from_data(
check_kind="vector", data=data, x=x, y=y, z=z, required_z=True
check_kind="vector", data=data, vectors=[x, y, z], ncols=3
)
# Run blockm* on data table
with table_context as infile:
Expand Down
2 changes: 1 addition & 1 deletion pygmt/src/contour.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def contour(self, data=None, x=None, y=None, z=None, **kwargs):

with Session() as lib:
file_context = lib.virtualfile_from_data(
check_kind="vector", data=data, x=x, y=y, z=z, required_z=True
check_kind="vector", data=data, vectors=[x, y, z], ncols=3
)
with file_context as fname:
lib.call_module(
Expand Down
2 changes: 1 addition & 1 deletion pygmt/src/nearneighbor.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def nearneighbor(data=None, x=None, y=None, z=None, **kwargs):
with GMTTempFile(suffix=".nc") as tmpfile:
with Session() as lib:
table_context = lib.virtualfile_from_data(
check_kind="vector", data=data, x=x, y=y, z=z, required_z=True
check_kind="vector", data=data, vectors=[x, y, z], ncols=3
)
with table_context as infile:
if (outgrid := kwargs.get("G")) is None:
Expand Down
Loading
Loading