Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(geopandas support): return GeoDataFrame if geopandas is installed #143

Merged
merged 3 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions dataretrieval/nldi.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

# Base URL of the USGS NLDI linked-data API; request URLs are built by
# appending path segments to it (e.g. f'{NLDI_API_BASE_URL}/{data_source}').
NLDI_API_BASE_URL = 'https://labs.waterdata.usgs.gov/api/nldi/linked-data'
# Starts as None; presumably filled in later with the list of valid data
# sources -- TODO confirm against the code that assigns it.
_AVAILABLE_DATA_SOURCES = None
# CRS identifier passed as ``crs=`` to ``gpd.GeoDataFrame.from_features`` so
# returned GeoDataFrames carry an explicit coordinate reference system.
_CRS = "EPSG:4326"


def _query_nldi(url, query_params, error_message):
Expand Down Expand Up @@ -101,7 +102,7 @@ def get_flowlines(
feature_collection = _query_nldi(url, query_params, err_msg)
if as_json:
return feature_collection
gdf = gpd.GeoDataFrame.from_features(feature_collection)
gdf = gpd.GeoDataFrame.from_features(feature_collection, crs=_CRS)
return gdf


Expand Down Expand Up @@ -154,7 +155,7 @@ def get_basin(
feature_collection = _query_nldi(url, query_params, err_msg)
if as_json:
return feature_collection
gdf = gpd.GeoDataFrame.from_features(feature_collection)
gdf = gpd.GeoDataFrame.from_features(feature_collection, crs=_CRS)
return gdf


Expand Down Expand Up @@ -291,7 +292,7 @@ def get_features(
feature_collection = _query_nldi(url, query_params, err_msg)
if as_json:
return feature_collection
gdf = gpd.GeoDataFrame.from_features(feature_collection)
gdf = gpd.GeoDataFrame.from_features(feature_collection, crs=_CRS)
return gdf


Expand Down Expand Up @@ -322,7 +323,7 @@ def get_features_by_data_source(data_source: str) -> gpd.GeoDataFrame:
url = f'{NLDI_API_BASE_URL}/{data_source}'
err_msg = f"Error getting features for data source '{data_source}'"
feature_collection = _query_nldi(url, {}, err_msg)
gdf = gpd.GeoDataFrame.from_features(feature_collection)
gdf = gpd.GeoDataFrame.from_features(feature_collection, crs=_CRS)
return gdf


Expand Down
14 changes: 14 additions & 0 deletions dataretrieval/nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@

from .utils import query

try:
import geopandas as gpd
except ImportError:
gpd = None

WATERDATA_BASE_URL = 'https://nwis.waterdata.usgs.gov/'
WATERDATA_URL = WATERDATA_BASE_URL + 'nwis/'
WATERSERVICE_URL = 'https://waterservices.usgs.gov/nwis/'
Expand All @@ -38,6 +43,7 @@
'water_use',
'ratings',
]
_CRS = "EPSG:4236"


def format_response(
Expand Down Expand Up @@ -71,6 +77,14 @@ def format_response(
if service == 'peaks':
df = preformat_peaks_response(df)

if gpd is not None:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this remove the original lat/lon column?
If so (and perhaps either way), we should make this conversion optional so that we don't break existing code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This maintains the original lat/long columns. This PR just builds a geometry field from that data and creates a GeoDataFrame. I can add a unit test that checks the return type.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated tests for geopandas support

if "dec_lat_va" in list(df):
geoms = gpd.points_from_xy(
df.dec_long_va.values,
df.dec_lat_va.values
)
df = gpd.GeoDataFrame(df, geometry=geoms, crs=_CRS)

# check for multiple sites:
if 'datetime' not in df.columns:
# XXX: consider making site_no index
Expand Down
123 changes: 98 additions & 25 deletions tests/waterservices_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
)
from dataretrieval.utils import NoSitesError

try:
import geopandas as gpd
except ImportError:
gpd = None

def test_query_waterdata_validation():
"""Tests the validation parameters of the query_waterservices method"""
Expand Down Expand Up @@ -80,7 +84,10 @@ def test_get_dv(requests_mock):
response_file_path = 'data/waterservices_dv.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_dv(sites=["01491000", "01645000"], start='2020-02-14', end='2020-02-15')
assert type(df) is DataFrame

if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 8
assert_metadata(requests_mock, request_url, md, site, None, format)

Expand All @@ -99,7 +106,9 @@ def test_get_dv_site_value_types(requests_mock, site_input_type_list):
else:
sites = site
df, md = get_dv(sites=sites, start='2020-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 8


Expand All @@ -112,7 +121,9 @@ def test_get_iv(requests_mock):
response_file_path = 'data/waterservices_iv.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_iv(sites=["01491000", "01645000"], start='2019-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 563380
assert md.url == request_url
assert_metadata(requests_mock, request_url, md, site, None, format)
Expand All @@ -132,7 +143,8 @@ def test_get_iv_site_value_types(requests_mock, site_input_type_list):
else:
sites = site
df, md = get_iv(sites=sites, start='2019-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 563380
assert md.url == request_url

Expand All @@ -142,15 +154,26 @@ def test_get_info(requests_mock):
Tests get_info method correctly generates the request url and returns the result in a DataFrame.
Note that only sites and format are passed as query params
"""
size = 24
format = "rdb"
site = '01491000%2C01645000'
parameter_cd = "00618"
request_url = 'https://waterservices.usgs.gov/nwis/site?sites={}&parameterCd={}&siteOutput=Expanded&format={}'.format(site, parameter_cd, format)
response_file_path = 'data/waterservices_site.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_info(sites=["01491000", "01645000"], parameterCd="00618")
assert type(df) is DataFrame
assert df.size == 24
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

if "geometry" in list(df):
geom_type = df.geom_type.unique()
if len(geom_type) > 1 or geom_type[0] != "Point":
raise AssertionError(
f"Geometry type {geom_type} not valid, expecting Point"
)
size += len(df)

assert df.size == size
assert md.url == request_url
assert_metadata(requests_mock, request_url, md, site, [parameter_cd], format)

Expand All @@ -167,7 +190,19 @@ def test_get_qwdata(requests_mock):
mock_request(requests_mock, request_url, response_file_path)
with pytest.warns(DeprecationWarning):
df, md = get_qwdata(sites=["01491000", "01645000"])
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

if "geometry" in list(df):
if not isinstance(df, gpd.GeoDataFrame):
raise AssertionError(f"{type(df)} is not a GeoDataFrame")

geom_type = df.geom_type.unique()
if len(geom_type) > 1 or geom_type[0] != "Point":
raise AssertionError(
f"Geometry type {geom_type} not valid, expecting Point"
)

assert df.size == 1821472
assert_metadata(requests_mock, request_url, md, site, None, format)

Expand Down Expand Up @@ -202,7 +237,9 @@ def test_get_gwlevels(requests_mock):
response_file_path = 'data/waterservices_gwlevels.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_gwlevels(sites=[site])
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 16
assert_metadata(requests_mock, request_url, md, site, None, format)

Expand All @@ -221,7 +258,8 @@ def test_get_gwlevels_site_value_types(requests_mock, site_input_type_list):
else:
sites = site
df, md = get_gwlevels(sites=sites)
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 16


Expand All @@ -234,7 +272,9 @@ def test_get_discharge_peaks(requests_mock):
response_file_path = 'data/waterservices_peaks.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_discharge_peaks(sites=[site], start='2000-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 240
assert_metadata(requests_mock, request_url, md, site, None, format)

Expand All @@ -255,7 +295,9 @@ def test_get_discharge_peaks_sites_value_types(requests_mock, site_input_type_li
sites = site

df, md = get_discharge_peaks(sites=sites, start='2000-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 240


Expand All @@ -269,7 +311,9 @@ def test_get_discharge_measurements(requests_mock):
response_file_path = 'data/waterdata_measurements.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_discharge_measurements(sites=[site], start='2000-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 2130
assert_metadata(requests_mock, request_url, md, site, None, format)

Expand All @@ -288,7 +332,8 @@ def test_get_discharge_measurements_sites_value_types(requests_mock, site_input_
else:
sites = site
df, md = get_discharge_measurements(sites=sites, start='2000-02-14', end='2020-02-15')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 2130


Expand All @@ -300,7 +345,8 @@ def test_get_pmcodes(requests_mock):
response_file_path = 'data/waterdata_pmcodes.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_pmcodes(parameterCd='00618')
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 13
assert_metadata(requests_mock, request_url, md, None, None, format)

Expand All @@ -319,7 +365,8 @@ def test_get_pmcodes_parameterCd_value_types(requests_mock, parameterCd_input_ty
else:
parameterCd = parameterCd
df, md = get_pmcodes(parameterCd=parameterCd)
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 13


Expand All @@ -332,7 +379,8 @@ def test_get_water_use_national(requests_mock):
response_file_path = 'data/water_use_national.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_water_use()
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 225
assert_metadata(requests_mock, request_url, md, None, None, format)

Expand Down Expand Up @@ -369,7 +417,8 @@ def test_get_water_use_national_county_value_types(requests_mock, county_input_t
else:
counties = county
df, md = get_water_use(counties=counties)
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 225


Expand All @@ -387,7 +436,8 @@ def test_get_water_use_national_county_value_types(requests_mock, category_input
else:
categories = category
df, md = get_water_use(categories=categories)
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 225


Expand All @@ -400,7 +450,8 @@ def test_get_water_use_allegheny(requests_mock):
response_file_path = 'data/water_use_allegheny.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_water_use(state="PA", counties="003")
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 1981
assert_metadata(requests_mock, request_url, md, None, None, format)

Expand All @@ -421,13 +472,16 @@ def test_get_ratings(requests_mock):
response_file_path = 'data/waterservices_ratings.txt'
mock_request(requests_mock, request_url, response_file_path)
df, md = get_ratings(site_no=site)
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

assert df.size == 33
assert_metadata(requests_mock, request_url, md, site, None, format)


def test_what_sites(requests_mock):
"""Tests what_sites method correctly generates the request url and returns the result in a DataFrame"""
size = 2472
format = "rdb"
parameter_cd = '00010%2C00060'
parameter_cd_list = ["00010","00060"]
Expand All @@ -437,8 +491,22 @@ def test_what_sites(requests_mock):
mock_request(requests_mock, request_url, response_file_path)

df, md = what_sites(bBox=[-83.0,36.5,-81.0,38.5], parameterCd=parameter_cd_list, hasDataTypeCd="dv")
assert type(df) is DataFrame
assert df.size == 2472
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")

if gpd is not None:
if not isinstance(df, gpd.GeoDataFrame):
raise AssertionError(f"{type(df)} is not a GeoDataFrame")

geom_type = df.geom_type.unique()
if len(geom_type) > 1 or geom_type[0] != "Point":
raise AssertionError(
f"Geometry type {geom_type} not valid, expecting Point"
)

size += len(df)

assert df.size == size
assert_metadata(requests_mock, request_url, md, None, parameter_cd_list, format)


Expand All @@ -450,7 +518,8 @@ def test_get_stats(requests_mock):
mock_request(requests_mock, request_url, response_file_path)

df, md = get_stats(sites=["01491000", "01645000"])
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 51936
assert_metadata(requests_mock, request_url, md, None, None, format)

Expand All @@ -468,7 +537,8 @@ def test_get_stats_site_value_types(requests_mock, site_input_type_list):
else:
sites = site
df, md = get_stats(sites=sites)
assert type(df) is DataFrame
if not isinstance(df, DataFrame):
raise AssertionError(f"{type(df)} is not DataFrame base class type")
assert df.size == 51936


Expand All @@ -486,7 +556,10 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format):
with open('data/waterservices_site.txt') as text:
requests_mock.get(site_request_url, text=text.read())
site_info, _ = md.site_info
assert type(site_info) is DataFrame
if not isinstance(site_info, DataFrame):
raise AssertionError(
f"{type(site_info)} is not DataFrame base class type"
)
if parameter_cd is None:
assert md.variable_info is None
else:
Expand Down
Loading