Skip to content

Commit

Permalink
Switch pnnl-buildingid to ssh
Browse files Browse the repository at this point in the history
  • Loading branch information
axelstudios committed Jun 27, 2024
1 parent 8236768 commit 07d87fd
Show file tree
Hide file tree
Showing 20 changed files with 4,682 additions and 86 deletions.
187 changes: 102 additions & 85 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ packages = [{include = "utils"}]
[tool.poetry.dependencies]
python = ">=3.9, <3.13"

pnnl-buildingid = {git = "https://github.com/seed-platform/buildingid.git", rev = "master"}
pnnl-buildingid = {git = "git@github.com:SEED-platform/buildingid.git", rev = "master"}
usaddress = "0.5.10"
street-address = "0.4.0"
geopandas = "^0.14.3"
Expand Down
Binary file added tmp/data/OUO_Santa Monica All.xlsx
Binary file not shown.
Binary file not shown.
Binary file not shown.
265 changes: 265 additions & 0 deletions tmp/data/Santa Monica Covered Buildings.csv

Large diffs are not rendered by default.

Binary file added tmp/data/Santa Monica Covered Buildings.xlsx
Binary file not shown.
Binary file added tmp/data/Santa Monica Footprints.xlsx
Binary file not shown.
3 changes: 3 additions & 0 deletions tmp/lebanon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from utils.shp_to_geojson import shp_to_geojson

# Convert the Lebanon building-footprint shapefile; the path is relative to
# the current working directory.
footprints_shapefile = 'lebanon_data/FootprintsUNH.shp'
shp_to_geojson(footprints_shapefile)
1 change: 1 addition & 0 deletions tmp/lebanon_data/FootprintsUNH.cpg
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
UTF-8
Binary file added tmp/lebanon_data/FootprintsUNH.dbf
Binary file not shown.
4,190 changes: 4,190 additions & 0 deletions tmp/lebanon_data/FootprintsUNH.geojson

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tmp/lebanon_data/FootprintsUNH.prj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PROJCS["NAD_1983_StatePlane_New_Hampshire_FIPS_2800_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-71.66666666666667],PARAMETER["Scale_Factor",0.9999666666666667],PARAMETER["Latitude_Of_Origin",42.5],UNIT["Foot_US",0.3048006096012192]]
Binary file added tmp/lebanon_data/FootprintsUNH.sbn
Binary file not shown.
Binary file added tmp/lebanon_data/FootprintsUNH.sbx
Binary file not shown.
Binary file added tmp/lebanon_data/FootprintsUNH.shp
Binary file not shown.
1 change: 1 addition & 0 deletions tmp/lebanon_data/FootprintsUNH.shp.xml

Large diffs are not rendered by default.

Binary file added tmp/lebanon_data/FootprintsUNH.shx
Binary file not shown.
73 changes: 73 additions & 0 deletions tmp/santa-monica-2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import contextlib
import math
import warnings

import geopandas as gpd
import pandas as pd
import usaddress
from shapely import wkt, Point

# Suppress every UserWarning for the remainder of the script.
warnings.filterwarnings("ignore", category=UserWarning)

# Input tables (paths are relative to the current working directory).
santa_monica = pd.read_csv('data/Santa Monica Covered Buildings.csv')
costar = pd.read_excel('data/OUO_Santa Monica All.xlsx')

# The footprint workbook stores one WKT string per row in its 'geometry' column.
footprint_wkts = pd.read_excel('data/Santa Monica Footprints.xlsx')['geometry']
footprints = [wkt.loads(footprint_wkt) for footprint_wkt in footprint_wkts]
footprints_gdf = gpd.GeoDataFrame(crs="epsg:4326", geometry=footprints)

# CoStar IDs already present in the covered-buildings list (blank cells dropped).
existing_costar_ids = set(map(int, santa_monica['CoStar ID'].dropna()))

# CoStar properties with RBA >= 50,000 that are not demolished and are NOT yet
# in the covered-buildings list. The previous expression,
# ~isin(all_ids - existing_ids), selected the already-listed properties instead.
costar_50k = costar[costar['RBA'] >= 50000]
missing_properties = costar_50k[
    ~costar_50k['Building Status'].isin(['Demolished'])
    & ~costar_50k['PropertyID'].isin(existing_costar_ids)
]

# Build one covered-buildings row per missing CoStar property, then append them
# all in a single concat (concatenating inside the loop copies the whole frame
# on every iteration).
new_rows = []
for i, row in missing_properties.iterrows():
    # Reset for every property: when the owner "City State Zip" cell is not a
    # string, usaddress.tag raises TypeError (suppressed below) and the owner
    # fields must come out blank rather than reuse the previous row's values.
    owner_address = {}
    with contextlib.suppress(TypeError):
        owner_address, _address_type = usaddress.tag(row['Owner City State Zip'])

    new_rows.append({
        'Assessor Gross Floor Area': row['RBA'],
        'Primary Property Type EPA Calculated': row['PropertyType'],
        'Address Type': 'building',
        'Street Address': row['Property Address'],
        'City': row['City'],
        'State Abbreviation': row['State'],
        'Postal Code': row['Zip'],
        'Address Type.1': 'owner',
        'Name': row['Owner Name'],
        'Street': row['Owner Address'],
        'City.1': owner_address.get('PlaceName', ''),
        'State Abbreviation.1': owner_address.get('StateName', ''),
        'Postal Code.1': owner_address.get('ZipCode', ''),
        'CoStar ID': row['PropertyID'],
        'CoStar Address': row['Property Address'],
        'Notes': 'Added missing costar address',
    })

# Skip the concat entirely when there is nothing to add so the original frame
# is left untouched.
if new_rows:
    santa_monica = pd.concat(
        [santa_monica, pd.DataFrame(new_rows, columns=santa_monica.columns)],
        ignore_index=True,
    )

# Add lat/long and the matching building footprint for every row that has a
# CoStar ID, then write the enriched table to Excel.
santa_monica['Latitude'] = None
santa_monica['Longitude'] = None
santa_monica['Footprint Match'] = None
santa_monica['Footprint'] = None

# Distances must be measured in a projected (metric) CRS; distances taken in
# EPSG:4326 are in degrees and can rank the wrong footprint as nearest.
# NOTE(review): EPSG:32610 is UTM zone 10N, while Santa Monica (~118.5 W)
# falls in zone 11N (EPSG:32611) - confirm which zone was intended.
projected_crs = 'EPSG:32610'
footprints_projected = footprints_gdf.to_crs(projected_crs)

for i in santa_monica.index:
    costar_id = santa_monica.at[i, 'CoStar ID']
    if pd.isna(costar_id):
        continue

    matching_properties = costar[costar['PropertyID'] == int(costar_id)]
    if matching_properties.empty:
        # The ID is not in the CoStar export: leave the new columns blank
        # instead of crashing on .iloc[0].
        continue
    costar_property = matching_properties.iloc[0]
    santa_monica.at[i, 'Latitude'] = costar_property['Latitude']
    santa_monica.at[i, 'Longitude'] = costar_property['Longitude']

    point = Point(costar_property['Longitude'], costar_property['Latitude'])
    point_gdf = gpd.GeoDataFrame(crs="epsg:4326", geometry=[point])
    intersections = gpd.sjoin(point_gdf, footprints_gdf)
    if len(intersections) >= 1:
        # Prefer a footprint that actually contains/touches the point.
        footprint_label = intersections['index_right'].iloc[0]
        santa_monica.at[i, 'Footprint Match'] = "Intersection"
    else:
        # Otherwise fall back to the nearest footprint, measured in the
        # projected CRS; to_crs keeps the index, so the label is shared.
        projected_point = point_gdf.to_crs(projected_crs).geometry.iloc[0]
        footprint_label = footprints_projected.distance(projected_point).idxmin()
        santa_monica.at[i, 'Footprint Match'] = "Closest"
    # The stored WKT always comes from the original EPSG:4326 geometry.
    santa_monica.at[i, 'Footprint'] = footprints_gdf.geometry.loc[footprint_label].wkt

santa_monica.to_excel('data/Santa Monica Covered Buildings with Missing Data.xlsx', index=False)
45 changes: 45 additions & 0 deletions tmp/santa-monica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import sys
from operator import itemgetter

import pandas as pd
from jarowinkler import jarowinkler_similarity

from utils.normalize_address import normalize_address

# Input workbooks (paths are relative to the current working directory).
santa_monica = pd.read_excel('data/Santa Monica Covered Buildings.xlsx')
costar = pd.read_excel('data/OUO_Santa Monica All.xlsx')

# Result columns start out empty and are filled in by the matching loop.
for result_column in ('CoStar Match', 'CoStar ID', 'CoStar Address'):
    santa_monica[result_column] = None

# Normalize both address lists once so they can be compared directly.
normalized_addresses = [
    normalize_address(street_address)
    for street_address in santa_monica['Street Address']
]
normalized_costar_addresses = [
    normalize_address(property_address)
    for property_address in costar['Property Address']
]

# Match every covered-building address to a CoStar address:
#   'Exact'    - exactly one CoStar row has the same normalized address
#   'Multiple' - several CoStar rows share it (no ID is recorded)
#   'Closest'  - no equal address; the highest Jaro-Winkler similarity is used

# Row positions of each normalized CoStar address, built once so the loop does
# not rescan the whole list with count()/index() for every address.
costar_positions = {}
for position, costar_address in enumerate(normalized_costar_addresses):
    costar_positions.setdefault(costar_address, []).append(position)

exact_matches = 0
for i, address in enumerate(normalized_addresses):
    print('==========', address)
    positions = costar_positions.get(address, [])
    if len(positions) == 1:
        print(' Found exact costar address:', address)
        exact_matches += 1

        costar_index = positions[0]
        santa_monica.at[i, 'CoStar Match'] = 'Exact'
        santa_monica.at[i, 'CoStar ID'] = costar.at[costar_index, 'PropertyID']
        santa_monica.at[i, 'CoStar Address'] = costar.at[costar_index, 'Property Address']
    elif len(positions) > 1:
        print(' !!! Found multiple exact costar addresses')
        santa_monica.at[i, 'CoStar Match'] = 'Multiple'
    else:
        # max() returns the first position with the best score, which is the
        # same row a stable descending sort would have put first.
        costar_index = max(
            range(len(normalized_costar_addresses)),
            key=lambda position: jarowinkler_similarity(address, normalized_costar_addresses[position]),
        )
        print(' Found closest costar address:', normalized_costar_addresses[costar_index])

        santa_monica.at[i, 'CoStar Match'] = 'Closest'
        santa_monica.at[i, 'CoStar ID'] = costar.at[costar_index, 'PropertyID']
        santa_monica.at[i, 'CoStar Address'] = costar.at[costar_index, 'Property Address']

print('Total:', len(normalized_addresses))
print('Exact matches:', exact_matches)
santa_monica.to_excel('data/Santa Monica Covered Buildings with CoStar.xlsx', index=False)

0 comments on commit 07d87fd

Please sign in to comment.