Commit

adding updates based on comments, support for geojson to dictionary

j-gillam committed Oct 30, 2023
1 parent 5c34618 commit fc7cd54
Showing 4 changed files with 131 additions and 28 deletions.
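For orientation, the round trip this commit enables looks roughly like the sketch below (the bucket name and key are hypothetical; the calls mirror the tests further down):

import geopandas as gpd
from shapely.geometry import Point
from nesta_ds_utils.loading_saving import S3

# Upload a GeoDataFrame as GeoJSON, then read it back as a GeoDataFrame.
gdf = gpd.GeoDataFrame({"col1": ["name1"], "geometry": [Point(1, 2)]})
S3.upload_obj(gdf, "my-bucket", "data/points.geojson")  # hypothetical bucket/key
gdf_back = S3.download_obj("my-bucket", "data/points.geojson", download_as="geodataframe")
assert gdf_back.geometry[0] == Point(1, 2)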
95 changes: 81 additions & 14 deletions nesta_ds_utils/loading_saving/S3.py
@@ -1,6 +1,5 @@
import io
from typing import List
from xmlrpc.client import Boolean
import boto3
from fnmatch import fnmatch
import pandas as pd
@@ -51,11 +50,30 @@ def _df_to_fileobj(df_data: pd.DataFrame, path_to: str, **kwargs) -> io.BytesIO:
df_data.to_excel(buffer, **kwargs)
elif fnmatch(path_to, "*.xlsm"):
df_data.to_excel(buffer, **kwargs)
elif fnmatch(path_to, "*.geojson"):
else:
raise Exception(
"Uploading dataframe currently supported only for 'csv', 'parquet', 'xlsx' and xlsm'."
)
buffer.seek(0)
return buffer


def _gdf_to_fileobj(df_data: gpd.GeoDataFrame, path_to: str, **kwargs) -> io.BytesIO:
"""Convert GeoDataFrame into bytes file object.
Args:
df_data (gpd.GeoDataFrame): GeoDataFrame to convert.
path_to (str): Saving file name.
Returns:
io.BytesIO: Bytes file object.
"""
buffer = io.BytesIO()
if fnmatch(path_to, "*.geojson"):
df_data.to_file(buffer, driver="GeoJSON", **kwargs)
else:
raise Exception(
"Uploading dataframe currently supported only for 'csv', 'parquet', 'xlsx', xlsm' and 'geojson'."
"Uploading geodataframe currently supported only for 'geojson'."
)
buffer.seek(0)
return buffer
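The helper's core behaviour can be reproduced standalone, as a minimal sketch (it is the same to_file call with the GeoJSON driver that the function wraps, written to an in-memory buffer):

import io

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame({"geometry": [Point(1, 2)]})
buffer = io.BytesIO()
gdf.to_file(buffer, driver="GeoJSON")  # serialise to GeoJSON in memory
buffer.seek(0)
print(buffer.getvalue()[:40])  # b'{"type": "FeatureCollection", ...'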
@@ -74,8 +92,30 @@ def _dict_to_fileobj(dict_data: dict, path_to: str, **kwargs) -> io.BytesIO:
buffer = io.BytesIO()
if fnmatch(path_to, "*.json"):
buffer.write(json.dumps(dict_data, **kwargs).encode())
elif fnmatch(path_to, "*.geojson"):
if "type" in dict_data:
if dict_data["type"] in [
"Point",
"MultiPoint",
"LineString",
"MultiLineString",
"Polygon",
"MultiPolygon",
"GeometryCollection",
"Feature",
"FeatureCollection",
]:
buffer.write(json.dumps(dict_data, **kwargs).encode())
else:
raise Exception(
"GeoJSONS must have a member with the name 'type', the value of the member must "
"be one of the following: 'Point', 'MultiPoint', 'LineString', 'MultiLineString',"
"'Polygon', 'MultiPolygon','GeometryCollection', 'Feature' or 'FeatureCollection'."
)
else:
raise Exception("Uploading dictionary currently supported only for 'json'.")
raise Exception(
"Uploading dictionary currently supported only for 'json' and 'geojson'."
)
buffer.seek(0)
return buffer
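Note that the geojson branch validates only the top-level 'type' member before serialising. A dictionary shaped like the test artifact further down passes the check (a sketch; bucket and key are hypothetical):

from nesta_ds_utils.loading_saving import S3

geo_dict = {
    "type": "FeatureCollection",  # must be one of the nine GeoJSON types
    "features": [
        {"type": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [0, 0]}}
    ],
}
S3.upload_obj(geo_dict, "my-bucket", "data/points.geojson")  # raises if 'type' is missing or invalid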

@@ -187,7 +227,9 @@ def upload_obj(
kwargs_writing (dict, optional): Dictionary of kwargs for writing data.
"""
if isinstance(obj, pd.DataFrame):
if isinstance(obj, gpd.GeoDataFrame):
obj = _gdf_to_fileobj(obj, path_to, **kwargs_writing)
elif isinstance(obj, pd.DataFrame):
obj = _df_to_fileobj(obj, path_to, **kwargs_writing)
elif isinstance(obj, dict):
obj = _dict_to_fileobj(obj, path_to, **kwargs_writing)
@@ -201,7 +243,7 @@
obj = _unsupp_data_to_fileobj(obj, path_to, **kwargs_writing)
warnings.warn(
"Data uploaded as pickle. Please consider other accessible "
"file types among the suppoted ones."
"file types among the supported ones."
)

s3 = boto3.client("s3")
@@ -226,7 +268,19 @@ def _fileobj_to_df(fileobj: io.BytesIO, path_from: str, **kwargs) -> pd.DataFram
return pd.read_excel(fileobj, **kwargs)
elif fnmatch(path_from, "*.xlsm"):
return pd.read_excel(fileobj, **kwargs)
elif fnmatch(path_from, "*.geojson"):


def _fileobj_to_gdf(fileobj: io.BytesIO, path_from: str, **kwargs) -> gpd.GeoDataFrame:
"""Convert bytes file object into geodataframe.
Args:
fileobj (io.BytesIO): Bytes file object.
path_from (str): Path of loaded data.
Returns:
gpd.GeoDataFrame: Data as geodataframe.
"""
if fnmatch(path_from, "*.geojson"):
return gpd.GeoDataFrame.from_features(
json.loads(fileobj.getvalue().decode())["features"]
)
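Standalone, the parsing step looks like the sketch below. Worth noting: from_features reads only the 'features' member, so a top-level 'crs' entry in the file is not applied to the resulting GeoDataFrame.

import json

import geopandas as gpd

raw = (
    b'{"type": "FeatureCollection", "features": [{"type": "Feature", '
    b'"properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}]}'
)
gdf = gpd.GeoDataFrame.from_features(json.loads(raw.decode())["features"])
print(gdf.geometry[0])  # POINT (0 0)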
@@ -309,12 +363,12 @@ def download_obj(
bucket (str): Bucket's name.
path_from (str): Path to data in S3.
download_as (str, optional): Type of object to download. Choose between
('dataframe', 'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.
('dataframe', 'geodataframe', 'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.
kwargs_boto (dict, optional): Dictionary of kwargs for boto3 function 'download_fileobj'.
kwargs_reading (dict, optional): Dictionary of kwargs for reading data.
Returns:
any: Donwloaded data.
any: Downloaded data.
"""
if not path_from.endswith(
tuple(
@@ -329,21 +383,34 @@
s3.download_fileobj(bucket, path_from, fileobj, **kwargs_boto)
fileobj.seek(0)
if download_as == "dataframe":
if path_from.endswith(
tuple([".csv", ".parquet", ".xlsx", ".xlsm", ".geojson"])
):
if path_from.endswith(tuple([".csv", ".parquet", ".xlsx", ".xlsm"])):
return _fileobj_to_df(fileobj, path_from, **kwargs_reading)
else:
raise Exception(
"Download as dataframe currently supported only "
"for 'csv' and 'parquet'."
"for 'csv','parquet','xlsx' and 'xlsm'."
)
elif download_as == "geodataframe":
if path_from.endswith(tuple([".geojson"])):
return _fileobj_to_gdf(fileobj, path_from, **kwargs_reading)
else:
raise Exception(
"Download as geodataframe currently supported only " "for 'geojson'."
)
elif download_as == "dict":
if path_from.endswith(tuple([".json"])):
return _fileobj_to_dict(fileobj, path_from, **kwargs_reading)
elif path_from.endswith(tuple([".geojson"])):
warnings.warn(
"Please check geojson has a member with the name 'type', the value of the member must be one of the following:"
"'Point', 'MultiPoint', 'LineString', 'MultiLineString', 'Polygon', 'MultiPolygon', 'GeometryCollection',"
"'Feature' and 'FeatureCollection'. Else downloaded dictionary will not be valid geojson."
)
return _fileobj_to_dict(fileobj, path_from, **kwargs_reading)
else:
raise Exception(
"Download as dictionary currently supported only " "for 'json'."
"Download as dictionary currently supported only "
"for 'json' and 'geojson'."
)
elif download_as == "list":
if path_from.endswith(tuple([".csv", ".txt", ".json"])):
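Downloading a '.geojson' key with download_as="dict" now takes the warning branch above and returns the parsed dictionary (a sketch; bucket and key are hypothetical):

import warnings

from nesta_ds_utils.loading_saving import S3

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    geo = S3.download_obj("my-bucket", "data/points.geojson", download_as="dict")
print(geo["type"])  # e.g. "FeatureCollection"
print(len(caught))  # 1 -- the GeoJSON validity reminder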
11 changes: 9 additions & 2 deletions setup.cfg
@@ -20,9 +20,10 @@ install_requires =
[options.extras_require]
s3 =
boto3==1.24.93
openpyxl==3.0.9
shapely==2.0.2
gis =
geopandas==0.13.2
io_extras =
openpyxl==3.0.9
viz =
altair==4.2.0
altair-saver==0.5.0
@@ -37,6 +38,8 @@ test =
pytest==7.1.3
moto[s3]==4.0.7
%(s3)s
%(gis)s
%(io_extras)s
%(viz)s
%(networks)s
%(nlp)s
@@ -54,11 +57,15 @@ dev =
pre-commit-hooks==4.3.0
black==22.10.0
%(s3)s
%(gis)s
%(io_extras)s
%(viz)s
%(networks)s
%(nlp)s
all =
%(s3)s
%(gis)s
%(io_extras)s
%(viz)s
%(networks)s
%(nlp)s
1 change: 1 addition & 0 deletions tests/artifacts/dummy_dict.geojson
@@ -0,0 +1 @@
{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}}
52 changes: 40 additions & 12 deletions tests/loading_saving/test_S3.py
@@ -13,6 +13,12 @@

TEST_GEODATAFRAME = gpd.GeoDataFrame({'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]})
TEST_DATAFRAME = pd.DataFrame({"test": [0, 0]})
TEST_DICT_GEO = {"type": "FeatureCollection",
"features": [
{"id": "0", "type": "Feature", "properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}
],
"crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}
}
TEST_DICT = {"test": [0, 0]}
TEST_LIST = [0, "test"]
TEST_STR = "test"
@@ -88,6 +94,14 @@ def test_upload_obj_dict_json():
mock_client = boto3.client("s3")
S3.upload_obj(TEST_DICT, "test-bucket", "dummy.json")

@mock_s3
def test_upload_obj_dict_geojson():
"""Tests that upload_obj does not return an exception."""
conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket="test-bucket")
mock_client = boto3.client("s3")
S3.upload_obj(TEST_DICT_GEO, "test-bucket", "dummy.geojson")


@mock_s3
def test_upload_obj_list_csv():
@@ -167,7 +181,7 @@ def test_upload_obj_unsup_data():


@mock_s3
def test_dowload_obj_dataframe_csv():
def test_download_obj_dataframe_csv():
"""Tests that download_obj returns the correct dataframe
from csv file.
"""
@@ -226,12 +240,12 @@ def test_download_obj_dataframe_geojson():
"tests/artifacts/dummy_dataframe.geojson", "test-bucket", "dummy.geojson"
)
assert (
S3.download_obj("test-bucket", "dummy.geojson", download_as="dataframe").geometry[0]
S3.download_obj("test-bucket", "dummy.geojson", download_as="geodataframe").geometry[0]
== Point(1, 2)
)

@mock_s3
def test_dowload_obj_dataframe_parquet():
def test_download_obj_dataframe_parquet():
"""Tests that download_obj returns the correct dataframe
from parquet file.
"""
@@ -248,7 +262,7 @@ def test_dowload_obj_dataframe_parquet():


@mock_s3
def test_dowload_obj_dict_json():
def test_download_obj_dict_json():
"""Tests that download_obj returns the correct dictionary
from json file.
"""
@@ -262,9 +276,23 @@ def test_dowload_obj_dict_json():
S3.download_obj("test-bucket", "dummy.json", download_as="dict")["test"][0] == 0
)

@mock_s3
def test_download_obj_dict_geojson():
"""Tests that download_obj returns the correct dictionary
from json file.
"""
conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket="test-bucket")
mock_client = boto3.client("s3")
mock_client.upload_file(
"tests/artifacts/dummy_dict.geojson", "test-bucket", "dummy.geojson"
)
assert (
S3.download_obj("test-bucket", "dummy.geojson", download_as="dict")["type"]=="FeatureCollection"
)

@mock_s3
def test_dowload_obj_list_csv():
def test_download_obj_list_csv():
"""Tests that download_obj returns the correct list
from csv file.
"""
@@ -278,7 +306,7 @@ def test_dowload_obj_list_csv():


@mock_s3
def test_dowload_obj_list_txt():
def test_download_obj_list_txt():
"""Tests that download_obj returns the correct list
from txt file.
"""
@@ -292,7 +320,7 @@ def test_dowload_obj_list_txt():


@mock_s3
def test_dowload_obj_list_json():
def test_download_obj_list_json():
"""Tests that download_obj returns the correct dataframe
from json file.
"""
@@ -309,7 +337,7 @@ def test_dowload_obj_list_json():


@mock_s3
def test_dowload_obj_str_txt():
def test_download_obj_str_txt():
"""Tests that download_obj returns the correct string
from txt file.
"""
@@ -321,7 +349,7 @@ def test_dowload_obj_str_txt():


@mock_s3
def test_dowload_obj_array_csv():
def test_download_obj_array_csv():
"""Tests that download_obj returns the correct numpy array
from csv file.
"""
@@ -335,7 +363,7 @@ def test_dowload_obj_array_csv():


@mock_s3
def test_dowload_obj_array_parquet():
def test_download_obj_array_parquet():
"""Tests that download_obj returns the correct numpy array
from parquet file.
"""
@@ -351,7 +379,7 @@ def test_dowload_obj_array_parquet():


@mock_s3
def test_dowload_obj_unsup_data():
def test_download_obj_unsup_data():
"""Tests that download_obj returns the correct integer
from pkl file.
"""
@@ -365,7 +393,7 @@


@mock_s3
def test_dowload_obj_exeption():
def test_download_obj_exception():
"""Tests that download_obj returns an exception for unsupported file type."""
conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket="test-bucket")
