Switch pnnl-buildingid to ssh

SEED-platform · Jun 27, 2024 · 07d87fd · 07d87fd
1 parent 8236768
commit 07d87fd
Show file tree

Hide file tree

Showing 20 changed files with 4,682 additions and 86 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ packages = [{include = "utils"}]
 [tool.poetry.dependencies]
 python = ">=3.9, <3.13"
 
-pnnl-buildingid = {git = "https://github.com/seed-platform/buildingid.git", rev = "master"}
+pnnl-buildingid = {git = "git@github.com:SEED-platform/buildingid.git", rev = "master"}
 usaddress = "0.5.10"
 street-address = "0.4.0"
 geopandas = "^0.14.3"

diff --git a/tmp/data/OUO_Santa Monica All.xlsx b/tmp/data/OUO_Santa Monica All.xlsx
diff --git a/tmp/data/Santa Monica Covered Buildings with CoStar.xlsx b/tmp/data/Santa Monica Covered Buildings with CoStar.xlsx
diff --git a/tmp/data/Santa Monica Covered Buildings with Missing Data.xlsx b/tmp/data/Santa Monica Covered Buildings with Missing Data.xlsx
diff --git a/tmp/data/Santa Monica Covered Buildings.csv b/tmp/data/Santa Monica Covered Buildings.csv
diff --git a/tmp/data/Santa Monica Covered Buildings.xlsx b/tmp/data/Santa Monica Covered Buildings.xlsx
diff --git a/tmp/data/Santa Monica Footprints.xlsx b/tmp/data/Santa Monica Footprints.xlsx
diff --git a/tmp/lebanon.py b/tmp/lebanon.py
@@ -0,0 +1,3 @@
+from utils.shp_to_geojson import shp_to_geojson
+
+shp_to_geojson('lebanon_data/FootprintsUNH.shp')
diff --git a/tmp/lebanon_data/FootprintsUNH.cpg b/tmp/lebanon_data/FootprintsUNH.cpg
@@ -0,0 +1 @@
+UTF-8
diff --git a/tmp/lebanon_data/FootprintsUNH.dbf b/tmp/lebanon_data/FootprintsUNH.dbf
diff --git a/tmp/lebanon_data/FootprintsUNH.geojson b/tmp/lebanon_data/FootprintsUNH.geojson
diff --git a/tmp/lebanon_data/FootprintsUNH.prj b/tmp/lebanon_data/FootprintsUNH.prj
@@ -0,0 +1 @@
+PROJCS["NAD_1983_StatePlane_New_Hampshire_FIPS_2800_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-71.66666666666667],PARAMETER["Scale_Factor",0.9999666666666667],PARAMETER["Latitude_Of_Origin",42.5],UNIT["Foot_US",0.3048006096012192]]
diff --git a/tmp/lebanon_data/FootprintsUNH.sbn b/tmp/lebanon_data/FootprintsUNH.sbn
diff --git a/tmp/lebanon_data/FootprintsUNH.sbx b/tmp/lebanon_data/FootprintsUNH.sbx
diff --git a/tmp/lebanon_data/FootprintsUNH.shp b/tmp/lebanon_data/FootprintsUNH.shp
diff --git a/tmp/lebanon_data/FootprintsUNH.shp.xml b/tmp/lebanon_data/FootprintsUNH.shp.xml
diff --git a/tmp/lebanon_data/FootprintsUNH.shx b/tmp/lebanon_data/FootprintsUNH.shx
diff --git a/tmp/santa-monica-2.py b/tmp/santa-monica-2.py
@@ -0,0 +1,73 @@
+import contextlib
+import math
+import warnings
+
+import geopandas as gpd
+import pandas as pd
+import usaddress
+from shapely import wkt, Point
+
+warnings.filterwarnings("ignore", category=UserWarning)
+
+santa_monica = pd.read_csv('data/Santa Monica Covered Buildings.csv')
+costar = pd.read_excel('data/OUO_Santa Monica All.xlsx')
+
+footprints = [wkt.loads(x) for x in list(pd.read_excel('data/Santa Monica Footprints.xlsx')['geometry'])]
+footprints_gdf = gpd.GeoDataFrame(crs="epsg:4326", geometry=footprints)
+
+existing_costar_ids = set(map(int, santa_monica['CoStar ID'].dropna()))
+
+costar_50k = costar[costar['RBA'] >= 50000]
+missing_properties = costar_50k[~costar_50k['Building Status'].isin(['Demolished']) & ~costar_50k['PropertyID'].isin(set(costar_50k['PropertyID']) - existing_costar_ids)]
+
+for i, row in missing_properties.iterrows():
+    with contextlib.suppress(TypeError):
+        owner_address, _ = usaddress.tag(row['Owner City State Zip'])
+
+    new_row = pd.DataFrame([{
+        'Assessor Gross Floor Area': row['RBA'],
+        'Primary Property Type EPA Calculated': row['PropertyType'],
+        'Address Type': 'building',
+        'Street Address': row['Property Address'],
+        'City': row['City'],
+        'State Abbreviation': row['State'],
+        'Postal Code': row['Zip'],
+        'Address Type.1': 'owner',
+        'Name': row['Owner Name'],
+        'Street': row['Owner Address'],
+        'City.1': owner_address.get('PlaceName', ''),
+        'State Abbreviation.1': owner_address.get('StateName', ''),
+        'Postal Code.1': owner_address.get('ZipCode', ''),
+        'CoStar ID': row['PropertyID'],
+        'CoStar Address': row['Property Address'],
+        'Notes': 'Added missing costar address',
+    }], columns=santa_monica.columns)
+    santa_monica = pd.concat([santa_monica, new_row], ignore_index=True)
+
+# Add lat/long
+santa_monica['Latitude'] = None
+santa_monica['Longitude'] = None
+santa_monica['Footprint Match'] = None
+santa_monica['Footprint'] = None
+
+projected_crs = 'EPSG:32610'
+
+for i, row in santa_monica.iterrows():
+    costar_id = santa_monica['CoStar ID'][i]
+    if not math.isnan(costar_id):
+        costar_id = int(costar_id)
+        costar_property = costar[costar['PropertyID'] == costar_id].iloc[0]
+        santa_monica.at[i, 'Latitude'] = costar_property['Latitude']
+        santa_monica.at[i, 'Longitude'] = costar_property['Longitude']
+
+        point = Point(costar_property['Longitude'], costar_property['Latitude'])
+        point_gdf = gpd.GeoDataFrame(crs="epsg:4326", geometry=[point])
+        intersections = gpd.sjoin(point_gdf, footprints_gdf)
+        if len(intersections) >= 1:
+            santa_monica.at[i, 'Footprint'] = footprints_gdf.iloc[intersections.iloc[0].index_right].iloc[0].wkt
+            santa_monica.at[i, 'Footprint Match'] = "Intersection"
+        else:
+            santa_monica.at[i, 'Footprint'] = footprints_gdf.iloc[footprints_gdf.distance(point).idxmin()].iloc[0].wkt
+            santa_monica.at[i, 'Footprint Match'] = "Closest"
+
+santa_monica.to_excel('data/Santa Monica Covered Buildings with Missing Data.xlsx', index=False)
diff --git a/tmp/santa-monica.py b/tmp/santa-monica.py
@@ -0,0 +1,45 @@
+import sys
+from operator import itemgetter
+
+import pandas as pd
+from jarowinkler import jarowinkler_similarity
+
+from utils.normalize_address import normalize_address
+
+santa_monica = pd.read_excel('data/Santa Monica Covered Buildings.xlsx')
+costar = pd.read_excel('data/OUO_Santa Monica All.xlsx')
+
+santa_monica['CoStar Match'] = None
+santa_monica['CoStar ID'] = None
+santa_monica['CoStar Address'] = None
+
+normalized_addresses = list(map(normalize_address, santa_monica['Street Address']))
+normalized_costar_addresses = list(map(normalize_address, costar['Property Address']))
+
+exact_matches = 0
+for i, address in enumerate(normalized_addresses):
+    print('==========', address)
+    costar_matches = normalized_costar_addresses.count(address)
+    if costar_matches == 1:
+        print('  Found exact costar address:', address)
+        exact_matches += 1
+
+        costar_index = normalized_costar_addresses.index(address)
+        santa_monica.at[i, 'CoStar Match'] = 'Exact'
+        santa_monica.at[i, 'CoStar ID'] = costar.at[costar_index, 'PropertyID']
+        santa_monica.at[i, 'CoStar Address'] = costar.at[costar_index, 'Property Address']
+    elif costar_matches > 1:
+        print('  !!! Found multiple exact costar addresses')
+        santa_monica.at[i, 'CoStar Match'] = 'Multiple'
+    else:
+        closest_matches = sorted([(jarowinkler_similarity(address, costar_address), costar_address) for costar_address in normalized_costar_addresses], key=itemgetter(0), reverse=True)
+        print('  Found closest costar address:', closest_matches[0][1])
+
+        costar_index = normalized_costar_addresses.index(closest_matches[0][1])
+        santa_monica.at[i, 'CoStar Match'] = 'Closest'
+        santa_monica.at[i, 'CoStar ID'] = costar.at[costar_index, 'PropertyID']
+        santa_monica.at[i, 'CoStar Address'] = costar.at[costar_index, 'Property Address']
+
+print('Total:', len(normalized_addresses))
+print('Exact matches:', exact_matches)
+santa_monica.to_excel('data/Santa Monica Covered Buildings with CoStar.xlsx', index=False)