Skip to content
This repository has been archived by the owner on Jun 30, 2023. It is now read-only.

Integrate geocode into build #599

Draft
wants to merge 38 commits into
base: 563-Add-Devcontainer
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d1b46a1
quite stuck, df.to_sql isn't working
SashaWeinstein Oct 13, 2022
fe46188
geocoding during build seems to work. Next implement similar process …
SashaWeinstein Oct 14, 2022
70d1e92
HNY_geo populates, but HNY_devdb does not. I suspect some sort of typ…
SashaWeinstein Oct 17, 2022
e5f07f8
format with black
SashaWeinstein Oct 17, 2022
e0bdd28
create HNY_devdb with inner join
SashaWeinstein Oct 18, 2022
3631916
clean print statements and format
SashaWeinstein Oct 18, 2022
9cb2a58
install minio
SashaWeinstein Oct 21, 2022
a568df7
this docker compose doesn't work yet but it's a good start
SashaWeinstein Oct 21, 2022
c7a4690
fix minio installation and add poetry to path
td928 Nov 9, 2022
229fe15
add poetry path to make sure poetry command works
td928 Nov 9, 2022
133cb38
remove import geocoding files and add wait geocoding process
td928 Nov 10, 2022
6415523
dockerfile for actions for poetry path
td928 Nov 10, 2022
a5d1587
add different poetry path
td928 Nov 10, 2022
5a95902
use path that works locally
td928 Nov 10, 2022
f656df5
specify poetry install location
td928 Nov 10, 2022
bcc575d
print path
td928 Nov 10, 2022
32ded07
add path to binary
td928 Nov 10, 2022
b591cda
add poetry installation in test.yml
td928 Nov 10, 2022
df8def1
poetry install
td928 Nov 10, 2022
b12c85e
update python
td928 Nov 10, 2022
00b2438
docker compose status
td928 Nov 10, 2022
7d5ab84
add geosupport container to actions
td928 Nov 10, 2022
6eaedad
remove sudo and break up steps
td928 Nov 10, 2022
bf526bb
remove docker command
td928 Nov 10, 2022
ef8c361
add postgresql install
td928 Nov 10, 2022
316356f
change to port 5432
td928 Nov 10, 2022
d263ba0
remove space and change it back to 25060
td928 Nov 10, 2022
e3ae085
give build engine a different port
td928 Nov 14, 2022
fb1316e
use postgres not local host
td928 Nov 14, 2022
9b19b5b
remove apt update
td928 Nov 14, 2022
2017132
add jq
td928 Nov 14, 2022
27bcf8d
add localhost to arg for postgis image
td928 Nov 14, 2022
4a5ee46
remove docker command
td928 Nov 14, 2022
64e94a4
specify different build engine
td928 Dec 2, 2022
569f843
use localhost
td928 Dec 2, 2022
62b8c4c
use ubuntu latest
td928 Dec 2, 2022
78026ec
specify user and db
td928 Dec 2, 2022
0574380
replace localhost
td928 Dec 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion bash/02_build_devdb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ psql $BUILD_ENGINE -f sql/now/_init.sql
psql $BUILD_ENGINE -f sql/_init.sql
count _INIT_devdb

display "Assign geoms to _GEO_devdb and create GEO_devdb"
display "Geocoding DOB records"
poetry run python3 -m python.geocode

# display "Assign geoms to _GEO_devdb and create GEO_devdb"
psql $BUILD_ENGINE -f sql/_geo.sql
psql $BUILD_ENGINE -f sql/_geo_corrections.sql
count GEO_devdb
Expand Down Expand Up @@ -85,6 +88,9 @@ display "Combining _MID_devdb with STATUS_devdb to create MID_devdb,
psql $BUILD_ENGINE -f sql/mid.sql
count MID_devdb

display "Geocoding HNY records"
poetry run python3 -m python.geocode_hny

display "Creating HNY fields:
hny_id,
classa_hnyaff,
Expand Down
73 changes: 29 additions & 44 deletions python/geocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
from python.utils import psql_insert_copy
import pandas as pd
import os
from tqdm import tqdm
from dotenv import main

main.load_dotenv()

g = Geosupport()

OUTPUT_TABLE_NAME = "_init_geocoded"


def geocode(input):
# collect inputs
Expand Down Expand Up @@ -84,74 +85,58 @@ def parse_output(geo):
)


def load_applications(engine):
def load_init_devdb(engine):
df = pd.read_sql(
"""
SELECT
uid,
regexp_replace(
trim(house_number),
'(^|)0*', '', ''
) as house_number,
REGEXP_REPLACE(street_name, '[\s]{2,}' ,' ' , 'g') as street_name,
borough,
source
FROM (
SELECT
distinct ogc_fid as uid,
housenumber as house_number,
streetname as street_name,
borough,
'bis' as source
FROM dob_jobapplications UNION
SELECT
distinct ogc_fid as uid,
house_no as house_number,
street_name as street_name,
borough,
'now' as source
FROM dob_now_applications
) a LIMIT 400000
job_number,
address_numbr as house_number,
REGEXP_REPLACE(address_street, '[\s]{2,}' ,' ' , 'g') as street_name,
boro as borough
FROM _INIT_devdb
""",
engine,
)

print("loaded df from database")
return df

def geocode_insert_sql(df):
records = df.to_dict("records")

def geocode_insert_sql(records, engine):

# Multiprocess
with Pool(processes=cpu_count()) as pool:
it = tqdm(pool.map(geocode, records, 1000))
# it = tqdm(list(map(geocode, records)))
it = pool.map(geocode, records, len(records) // 4)

df = pd.DataFrame(it)
df.replace({"latitude": {"": None}, "longitude": {"": None}}, inplace=True)
df.to_sql(
"dob_geocode_results",
OUTPUT_TABLE_NAME,
con=engine,
if_exists="append",
index=False,
method=psql_insert_copy,
)


def clear_dob_geocode_results(engine):
engine.execute("DROP TABLE IF EXISTS dob_geocode_results")
engine.execute(f"DROP TABLE IF EXISTS {OUTPUT_TABLE_NAME}")


if __name__ == "__main__":
# connect to BUILD_ENGINE
engine = create_engine(os.environ["BUILD_ENGINE"])

df = load_applications(engine)
clear_dob_geocode_results(engine)
# df = df.iloc[:2000,:]
start =0
chunk_size = 50000
end = chunk_size
while end <= df.shape[0]:
print(f"geocoding records {start} through {end}")
geocode_insert_sql(df.iloc[start:end,:])
start = end
end = min(end+chunk_size, df.shape[0])


df = load_init_devdb(engine)
records = df.to_dict("records")

del df
start = 0
chunk_size = 10**4
end = min(chunk_size, len(records))
while start < len(records):
print(f"geocoding records {start} through {end}")
geocode_insert_sql(records[start:end], engine)
start = end
end = min(end + chunk_size, len(records))
32 changes: 18 additions & 14 deletions python/geocode_hny.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,37 @@
g = Geosupport()


def geocode(input):
def geocode(hny):
# collect inputs
uid = str(input.pop("ogc_fid"))
hnum = input.pop("number")
sname = input.pop("street")
borough = input.pop("borough")
uid = str(hny.get("ogc_fid"))
hnum = hny.get("number")
sname = hny.get("street")
borough = hny.get("borough")

try:
geo = g["1B"](
street_name=sname, house_number=hnum, borough=borough, mode="regular"
)
geo = parse_output(geo)
geo = add_geocode(hny, geo)
geo.update(dict(uid=uid, mode="regular", func="1B", status="success"))
return geo
except GeosupportError:
try:
geo = g["1B"](
street_name=sname, house_number=hnum, borough=borough, mode="tpad"
)
geo = parse_output(geo)
geo = add_geocode(hny, geo)
geo.update(dict(uid=uid, mode="tpad", func="1B", status="success"))
return geo
except GeosupportError as e:
geo = parse_output(e.result)
geo = add_geocode(hny, e.result)
geo.update(uid=uid, mode="tpad", func="1B", status="failure")
return geo


def parse_output(geo):
return dict(
def add_geocode(hny, geo):

new_fields = dict(
# Normalized address:
geo_sname=geo.get("First Street Name Normalized", ""),
geo_hnum=geo.get("House Number - Display Format", ""),
Expand All @@ -53,10 +54,13 @@ def parse_output(geo):
"Building Identification Number (BIN) of Input Address or NAP", ""
),
geo_bbl=geo.get("BOROUGH BLOCK LOT (BBL)", {}).get(
"BOROUGH BLOCK LOT (BBL)", "",
"BOROUGH BLOCK LOT (BBL)",
"",
),
)

return hny | new_fields


if __name__ == "__main__":
# connect to postgres db
Expand Down Expand Up @@ -89,11 +93,11 @@ def parse_output(geo):
it = pool.map(geocode, records, 1000)

print("Geocoding finished, dumping to postgres ...")
df=pd.DataFrame(it)
df = pd.DataFrame(it)
df.to_sql(
'hny_geocode_results',
"hny_geocode_results",
con=engine,
if_exists="replace",
index=False,
method=psql_insert_copy,
)
)
9 changes: 3 additions & 6 deletions sql/_geo.sql
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,8 @@ DRAFT as (
b.longitude::double precision as geo_longitude,
b.mode
FROM _INIT_devdb a
LEFT JOIN _GEO_devdb b
ON (CASE
WHEN source = 'bis' THEN b.uid::text
ELSE (b.uid::integer + (SELECT MAX(_INIT_BIS_devdb.uid::integer) FROM _INIT_BIS_devdb))::text
END)::text = a.uid::text
LEFT JOIN _init_geocoded b
ON a.uid = b.uid
),
GEOM_dob_bin_bldgfootprints as (
SELECT distinct
Expand Down Expand Up @@ -253,7 +250,7 @@ SELECT
INTO GEO_devdb
FROM DRAFT a
LEFT JOIN GEOM_dob_latlon b
ON a.uid = b.uid;
ON a.uid::text = b.uid::text;

-- Create index
CREATE INDEX GEO_devdb_geom_idx ON GEO_devdb
Expand Down
45 changes: 19 additions & 26 deletions sql/_hny.sql
Original file line number Diff line number Diff line change
Expand Up @@ -121,24 +121,17 @@ CREATE TABLE IF NOT EXISTS CORR_hny_matches (
DROP TABLE IF EXISTS HNY_geo;
-- 1) Merge with geocoding results and create a unique ID
WITH hny AS (
SELECT a.project_id||'/'||COALESCE(LPAD(a.building_id, 6, '0'), '') as hny_id,
a.project_id as hny_project_id,
a.*,
b.geo_bbl,
b.geo_bin,
b.geo_latitude,
b.geo_longitude,
(CASE WHEN b.geo_longitude IS NOT NULL
AND b.geo_latitude IS NOT NULL
THEN ST_SetSRID(ST_MakePoint(b.geo_longitude::NUMERIC,
b.geo_latitude::NUMERIC),4326)
SELECT project_id||'/'||COALESCE(LPAD(building_id, 6, '0'), '') as hny_id,
project_id as hny_project_id,
*,

(CASE WHEN geo_longitude IS NOT NULL
AND geo_latitude IS NOT NULL
THEN ST_SetSRID(ST_MakePoint(geo_longitude::NUMERIC,
geo_latitude::NUMERIC),4326)
ELSE NULL
END) AS geom
FROM hpd_hny_units_by_building a
JOIN hny_geocode_results b
ON a.ogc_fid::text = b.uid
WHERE a.reporting_construction_type = 'New Construction'
AND a.project_name <> 'CONFIDENTIAL')
FROM hny_geocode_results)

SELECT *
INTO HNY_geo
Expand Down Expand Up @@ -420,17 +413,17 @@ WITH
one_dev_to_many_hny,
one_hny_to_many_dev
FROM RELATEFLAGS_hny_matches a
WHERE one_hny_to_many_dev = 1),
WHERE one_hny_to_many_dev = 1)

-- Combine into a single look-up table
HNY_lookup AS(
SELECT * FROM one_to_one
UNION
SELECT * FROM one_to_many
-- Many-to-many cases are further resolved in many_to_one table, so don't include
WHERE job_number||hny_id NOT IN (SELECT job_number||hny_id FROM many_to_one)
UNION
SELECT * FROM many_to_one)
SELECT * INTO HNY_lookup FROM (
SELECT * FROM one_to_one
UNION
SELECT * FROM one_to_many
-- Many-to-many cases are further resolved in many_to_one table, so don't include
WHERE job_number||hny_id NOT IN (SELECT job_number||hny_id FROM many_to_one)
UNION
SELECT * FROM many_to_one) as tmp;


-- 7) MERGE WITH devdb
Expand All @@ -447,7 +440,7 @@ SELECT a.job_number,
END) AS hny_jobrelate
INTO HNY_devdb
FROM MID_devdb a
LEFT JOIN HNY_lookup b
INNER JOIN HNY_lookup b
ON a.job_number = b.job_number;