Skip to content

Commit

Permalink
Update portal data and some additional one time updates
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasyu888 committed Oct 17, 2023
1 parent cd2b841 commit b8ad52c
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ The GENIE public releases are loaded into snowflake via this [script](admin/geni
snowflake, synapseclient and dotenv must be installed as dependencies.

```
pip install pip install "snowflake-connector-python[pandas]" "synapseclient[pandas]" python-dotenv
pip install "snowflake-connector-python[pandas]" "synapseclient[pandas]" python-dotenv
```
2 changes: 2 additions & 0 deletions admin/sage_setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ USE DATABASE sage;

CREATE SCHEMA IF NOT EXISTS portal_raw
WITH MANAGED ACCESS;
-- ! One time addition
-- ALTER TABLE portal_raw.NF ADD COLUMN "tissue" STRING;

USE ROLE securityadmin;

Expand Down
12 changes: 12 additions & 0 deletions elt/portal_elt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
import synapseclient
import pandas as pd


syn = synapseclient.login()
Expand Down Expand Up @@ -40,6 +41,10 @@
"""
cs.execute(find_table_query)
opt = cs.fetch_pandas_all()
# * Get the existing table to get column names for future column
# updates
# cursor = cs.execute(f"SELECT * from PORTAL_RAW.{portal_name} limit 5")
# df = pd.DataFrame(cursor.description)
# If the table is empty, auto create it, otherwise, truncate and overwrite
# The rationale for this is that some tables have "grant" and "group" as
# column headers, and those are reserved keywords.
Expand All @@ -59,6 +64,8 @@
table_type="transient",
overwrite=True
)

# TODO account for schema changes
# Upsert into non-temporary tables
update_set = [f'"{portal_name}"."{col}" = "{target_table}"."{col}"' for col in portal_df.columns]
update_set_str = ",".join(update_set)
Expand All @@ -80,3 +87,8 @@
query = f"select * from {portal_name} limit 10;"
cs.execute(query)
opt = cs.fetch_pandas_all()

# ! One time port of HTAN
# htan_ent = syn.get("syn52677746")
# htan_df = pd.read_csv(htan_ent.path)
# write_pandas(ctx, htan_df, "HTAN", auto_create_table=True)

0 comments on commit b8ad52c

Please sign in to comment.