Commit

adding updates based on comments, support for geojson to dictionary

j-gillam committed Oct 30, 2023
1 parent 5c34618 commit fc7cd54
Showing 4 changed files with 131 additions and 28 deletions.
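For orientation, the round trip this commit enables looks roughly like the sketch below (the bucket name and key are hypothetical; the calls mirror the tests further down):

import geopandas as gpd
from shapely.geometry import Point
from nesta_ds_utils.loading_saving import S3

# Upload a GeoDataFrame as GeoJSON, then read it back as a GeoDataFrame.
gdf = gpd.GeoDataFrame({"col1": ["name1"], "geometry": [Point(1, 2)]})
S3.upload_obj(gdf, "my-bucket", "data/points.geojson")  # hypothetical bucket/key
gdf_back = S3.download_obj("my-bucket", "data/points.geojson", download_as="geodataframe")
assert gdf_back.geometry[0] == Point(1, 2)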
95 changes: 81 additions & 14 deletions nesta_ds_utils/loading_saving/S3.py
@@ -1,6 +1,5 @@
import io
from typing import List
from xmlrpc.client import Boolean
import boto3
from fnmatch import fnmatch
import pandas as pd
@@ -51,11 +50,30 @@ def _df_to_fileobj(df_data: pd.DataFrame, path_to: str, **kwargs) -> io.BytesIO:
df_data.to_excel(buffer, **kwargs)
elif fnmatch(path_to, "*.xlsm"):
df_data.to_excel(buffer, **kwargs)
elif fnmatch(path_to, "*.geojson"):
else:
raise Exception(
"Uploading dataframe currently supported only for 'csv', 'parquet', 'xlsx' and xlsm'."
)
buffer.seek(0)
return buffer


def _gdf_to_fileobj(df_data: gpd.GeoDataFrame, path_to: str, **kwargs) -> io.BytesIO:
"""Convert GeoDataFrame into bytes file object.
Args:
df_data (gpd.GeoDataFrame): GeoDataFrame to convert.
path_to (str): Saving file name.
Returns:
io.BytesIO: Bytes file object.
"""
buffer = io.BytesIO()
if fnmatch(path_to, "*.geojson"):
df_data.to_file(buffer, driver="GeoJSON", **kwargs)
else:
raise Exception(
"Uploading dataframe currently supported only for 'csv', 'parquet', 'xlsx', xlsm' and 'geojson'."
"Uploading geodataframe currently supported only for 'geojson'."
)
buffer.seek(0)
return buffer
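The helper's core behaviour can be reproduced standalone, as a minimal sketch (it is the same to_file call with the GeoJSON driver that the function wraps, written to an in-memory buffer):

import io

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame({"geometry": [Point(1, 2)]})
buffer = io.BytesIO()
gdf.to_file(buffer, driver="GeoJSON")  # serialise to GeoJSON in memory
buffer.seek(0)
print(buffer.getvalue()[:40])  # b'{"type": "FeatureCollection", ...'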
@@ -74,8 +92,30 @@ def _dict_to_fileobj(dict_data: dict, path_to: str, **kwargs) -> io.BytesIO:
buffer = io.BytesIO()
if fnmatch(path_to, "*.json"):
buffer.write(json.dumps(dict_data, **kwargs).encode())
elif fnmatch(path_to, "*.geojson"):
if "type" in dict_data:
if dict_data["type"] in [
"Point",
"MultiPoint",
"LineString",
"MultiLineString",
"Polygon",
"MultiPolygon",
"GeometryCollection",
"Feature",
"FeatureCollection",
]:
buffer.write(json.dumps(dict_data, **kwargs).encode())
else:
raise Exception(
"GeoJSONS must have a member with the name 'type', the value of the member must "
"be one of the following: 'Point', 'MultiPoint', 'LineString', 'MultiLineString',"
"'Polygon', 'MultiPolygon','GeometryCollection', 'Feature' or 'FeatureCollection'."
)
else:
raise Exception("Uploading dictionary currently supported only for 'json'.")
raise Exception(
"Uploading dictionary currently supported only for 'json' and 'geojson'."
)
buffer.seek(0)
return buffer
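Note that the geojson branch validates only the top-level 'type' member before serialising. A dictionary shaped like the test artifact further down passes the check (a sketch; bucket and key are hypothetical):

from nesta_ds_utils.loading_saving import S3

geo_dict = {
    "type": "FeatureCollection",  # must be one of the nine GeoJSON types
    "features": [
        {"type": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [0, 0]}}
    ],
}
S3.upload_obj(geo_dict, "my-bucket", "data/points.geojson")  # raises if 'type' is missing or invalid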

@@ -187,7 +227,9 @@ def upload_obj(
kwargs_writing (dict, optional): Dictionary of kwargs for writing data.
"""
if isinstance(obj, pd.DataFrame):
if isinstance(obj, gpd.GeoDataFrame):
obj = _gdf_to_fileobj(obj, path_to, **kwargs_writing)
elif isinstance(obj, pd.DataFrame):
obj = _df_to_fileobj(obj, path_to, **kwargs_writing)
elif isinstance(obj, dict):
obj = _dict_to_fileobj(obj, path_to, **kwargs_writing)
@@ -201,7 +243,7 @@
obj = _unsupp_data_to_fileobj(obj, path_to, **kwargs_writing)
warnings.warn(
"Data uploaded as pickle. Please consider other accessible "
"file types among the suppoted ones."
"file types among the supported ones."
)

s3 = boto3.client("s3")
@@ -226,7 +268,19 @@ def _fileobj_to_df(fileobj: io.BytesIO, path_from: str, **kwargs) -> pd.DataFram
return pd.read_excel(fileobj, **kwargs)
elif fnmatch(path_from, "*.xlsm"):
return pd.read_excel(fileobj, **kwargs)
elif fnmatch(path_from, "*.geojson"):


def _fileobj_to_gdf(fileobj: io.BytesIO, path_from: str, **kwargs) -> gpd.GeoDataFrame:
"""Convert bytes file object into geodataframe.
Args:
fileobj (io.BytesIO): Bytes file object.
path_from (str): Path of loaded data.
Returns:
gpd.GeoDataFrame: Data as geodataframe.
"""
if fnmatch(path_from, "*.geojson"):
return gpd.GeoDataFrame.from_features(
json.loads(fileobj.getvalue().decode())["features"]
)
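Standalone, the parsing step looks like the sketch below. Worth noting: from_features reads only the 'features' member, so a top-level 'crs' entry in the file is not applied to the resulting GeoDataFrame.

import json

import geopandas as gpd

raw = (
    b'{"type": "FeatureCollection", "features": [{"type": "Feature", '
    b'"properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}]}'
)
gdf = gpd.GeoDataFrame.from_features(json.loads(raw.decode())["features"])
print(gdf.geometry[0])  # POINT (0 0)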
@@ -309,12 +363,12 @@ def download_obj(
bucket (str): Bucket's name.
path_from (str): Path to data in S3.
download_as (str, optional): Type of object to download. Choose between
('dataframe', 'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.
('dataframe', 'geodataframe', 'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.
kwargs_boto (dict, optional): Dictionary of kwargs for boto3 function 'download_fileobj'.
kwargs_reading (dict, optional): Dictionary of kwargs for reading data.
Returns:
any: Donwloaded data.
any: Downloaded data.
"""
if not path_from.endswith(
tuple(
@@ -329,21 +383,34 @@
s3.download_fileobj(bucket, path_from, fileobj, **kwargs_boto)
fileobj.seek(0)
if download_as == "dataframe":
if path_from.endswith(
tuple([".csv", ".parquet", ".xlsx", ".xlsm", ".geojson"])
):
if path_from.endswith(tuple([".csv", ".parquet", ".xlsx", ".xlsm"])):
return _fileobj_to_df(fileobj, path_from, **kwargs_reading)
else:
raise Exception(
"Download as dataframe currently supported only "
"for 'csv' and 'parquet'."
"for 'csv','parquet','xlsx' and 'xlsm'."
)
elif download_as == "geodataframe":
if path_from.endswith(tuple([".geojson"])):
return _fileobj_to_gdf(fileobj, path_from, **kwargs_reading)
else:
raise Exception(
"Download as geodataframe currently supported only " "for 'geojson'."
)
elif download_as == "dict":
if path_from.endswith(tuple([".json"])):
return _fileobj_to_dict(fileobj, path_from, **kwargs_reading)
elif path_from.endswith(tuple([".geojson"])):
warnings.warn(
"Please check geojson has a member with the name 'type', the value of the member must be one of the following:"
"'Point', 'MultiPoint', 'LineString', 'MultiLineString', 'Polygon', 'MultiPolygon', 'GeometryCollection',"
"'Feature' and 'FeatureCollection'. Else downloaded dictionary will not be valid geojson."
)
return _fileobj_to_dict(fileobj, path_from, **kwargs_reading)
else:
raise Exception(
"Download as dictionary currently supported only " "for 'json'."
"Download as dictionary currently supported only "
"for 'json' and 'geojson'."
)
elif download_as == "list":
if path_from.endswith(tuple([".csv", ".txt", ".json"])):
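Downloading a '.geojson' key with download_as="dict" now takes the warning branch above and returns the parsed dictionary (a sketch; bucket and key are hypothetical):

import warnings

from nesta_ds_utils.loading_saving import S3

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    geo = S3.download_obj("my-bucket", "data/points.geojson", download_as="dict")
print(geo["type"])  # e.g. "FeatureCollection"
print(len(caught))  # 1 -- the GeoJSON validity reminder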
11 changes: 9 additions & 2 deletions setup.cfg
@@ -20,9 +20,10 @@ install_requires =
[options.extras_require]
s3 =
boto3==1.24.93
openpyxl==3.0.9
shapely==2.0.2
gis =
geopandas==0.13.2
io_extras =
openpyxl==3.0.9
viz =
altair==4.2.0
altair-saver==0.5.0
@@ -37,6 +38,8 @@ test =
pytest==7.1.3
moto[s3]==4.0.7
%(s3)s
%(gis)s
%(io_extras)s
%(viz)s
%(networks)s
%(nlp)s
@@ -54,11 +57,15 @@ dev =
pre-commit-hooks==4.3.0
black==22.10.0
%(s3)s
%(gis)s
%(io_extras)s
%(viz)s
%(networks)s
%(nlp)s
all =
%(s3)s
%(gis)s
%(io_extras)s
%(viz)s
%(networks)s
%(nlp)s
1 change: 1 addition & 0 deletions tests/artifacts/dummy_dict.geojson
@@ -0,0 +1 @@
{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}}
52 changes: 40 additions & 12 deletions tests/loading_saving/test_S3.py
@@ -13,6 +13,12 @@

TEST_GEODATAFRAME = gpd.GeoDataFrame({'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]})
TEST_DATAFRAME = pd.DataFrame({"test": [0, 0]})
TEST_DICT_GEO = {"type": "FeatureCollection",
"features": [
{"id": "0", "type": "Feature", "properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}
],
"crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}
}
TEST_DICT = {"test": [0, 0]}
TEST_LIST = [0, "test"]
TEST_STR = "test"
@@ -88,6 +94,14 @@ def test_upload_obj_dict_json():
mock_client = boto3.client("s3")
S3.upload_obj(TEST_DICT, "test-bucket", "dummy.json")

@mock_s3
def test_upload_obj_dict_geojson():
"""Tests that upload_obj does not return an exception."""
conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket="test-bucket")
mock_client = boto3.client("s3")
S3.upload_obj(TEST_DICT_GEO, "test-bucket", "dummy.geojson")


@mock_s3
def test_upload_obj_list_csv():
@@ -167,7 +181,7 @@ def test_upload_obj_unsup_data():


@mock_s3
def test_dowload_obj_dataframe_csv():
def test_download_obj_dataframe_csv():
"""Tests that download_obj returns the correct dataframe
from csv file.
"""
@@ -226,12 +240,12 @@ def test_download_obj_dataframe_geojson():
"tests/artifacts/dummy_dataframe.geojson", "test-bucket", "dummy.geojson"
)
assert (
S3.download_obj("test-bucket", "dummy.geojson", download_as="dataframe").geometry[0]
S3.download_obj("test-bucket", "dummy.geojson", download_as="geodataframe").geometry[0]
== Point(1, 2)
)

@mock_s3
def test_dowload_obj_dataframe_parquet():
def test_download_obj_dataframe_parquet():
"""Tests that download_obj returns the correct dataframe
from parquet file.
"""
@@ -248,7 +262,7 @@ def test_dowload_obj_dataframe_parquet():


@mock_s3
def test_dowload_obj_dict_json():
def test_download_obj_dict_json():
"""Tests that download_obj returns the correct dictionary
from json file.
"""
@@ -262,9 +276,23 @@ def test_dowload_obj_dict_json():
S3.download_obj("test-bucket", "dummy.json", download_as="dict")["test"][0] == 0
)

@mock_s3
def test_download_obj_dict_geojson():
"""Tests that download_obj returns the correct dictionary
from json file.
"""
conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket="test-bucket")
mock_client = boto3.client("s3")
mock_client.upload_file(
"tests/artifacts/dummy_dict.geojson", "test-bucket", "dummy.geojson"
)
assert (
S3.download_obj("test-bucket", "dummy.geojson", download_as="dict")["type"]=="FeatureCollection"
)

@mock_s3
def test_dowload_obj_list_csv():
def test_download_obj_list_csv():
"""Tests that download_obj returns the correct list
from csv file.
"""
@@ -278,7 +306,7 @@ def test_dowload_obj_list_csv():


@mock_s3
def test_dowload_obj_list_txt():
def test_download_obj_list_txt():
"""Tests that download_obj returns the correct list
from txt file.
"""
@@ -292,7 +320,7 @@ def test_dowload_obj_list_txt():


@mock_s3
def test_dowload_obj_list_json():
def test_download_obj_list_json():
"""Tests that download_obj returns the correct dataframe
from json file.
"""
@@ -309,7 +337,7 @@ def test_dowload_obj_list_json():


@mock_s3
def test_dowload_obj_str_txt():
def test_download_obj_str_txt():
"""Tests that download_obj returns the correct string
from txt file.
"""
@@ -321,7 +349,7 @@ def test_dowload_obj_str_txt():


@mock_s3
def test_dowload_obj_array_csv():
def test_download_obj_array_csv():
"""Tests that download_obj returns the correct numpy array
from csv file.
"""
@@ -335,7 +363,7 @@ def test_dowload_obj_array_csv():


@mock_s3
def test_dowload_obj_array_parquet():
def test_download_obj_array_parquet():
"""Tests that download_obj returns the correct numpy array
from parquet file.
"""
@@ -351,7 +379,7 @@ def test_dowload_obj_array_parquet():


@mock_s3
def test_dowload_obj_unsup_data():
def test_download_obj_unsup_data():
"""Tests that download_obj returns the correct integer
from pkl file.
"""
@@ -365,7 +393,7 @@


@mock_s3
def test_dowload_obj_exeption():
def test_download_obj_exception():
"""Tests that download_obj returns an exception for unsupported file type."""
conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket="test-bucket")
