Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

temp fixes for malformed csv #8

Merged
merged 1 commit into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ This is an data-stack-in-a-box based data from [NSW Education Data Hub](https://
> [!IMPORTANT]
> Click below 👇🏼 to setup your own free data stack packed with [NSW Department of Education](https://education.nsw.gov.au/) data.

> [!WARNING]
> Only 100 schools will appear. There is currently a bug where the master dataset CSV is malformed, so not all schools can be loaded.
> To remove this limit once the bug is fixed, delete `df = df.head(100)` in the master dataset asset.

[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/wisemuffin/nsw-doe-data-stack-in-a-box?quickstart=1)


Expand Down
12 changes: 6 additions & 6 deletions orchestration/pipeline_nsw_doe/assets/raw/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@
asset,
)

from pipeline_nsw_doe.factory import pandera_schema_to_dagster_type
from dagster_pandera import pandera_schema_to_dagster_type
# from pipeline_nsw_doe.factory import pandera_schema_to_dagster_type

from .schema_masterdataset import schema as schema_masterdataset
from .schema_ram import schema as schema_ram

DatahubMasterDatasetDagsterType = pandera_schema_to_dagster_type(
schema=schema_masterdataset,
name="DatahubMasterDatasetDagsterType",
description="data frame DagsterType type for this dummy asset.",
schema=schema_masterdataset
)

NSW_DOE_DATA_STACK_IN_A_BOX_TARGET_SCHEMA: str = os.getenv(
Expand Down Expand Up @@ -47,6 +46,7 @@ def raw__nsw_doe_datahub__master_dataset():
url = "https://data.cese.nsw.gov.au/data/dataset/027493b2-33ad-3f5b-8ed9-37cdca2b8650/resource/2ac19870-44f6-443d-a0c3-4c867f04c305/download/master_dataset.csv"
df = pd.read_csv(
url,
on_bad_lines="skip", # 🚧 TODO Temp workaround due to malformed csv
)

df["_load_timestamp"] = pd.Timestamp("now")
Expand All @@ -56,6 +56,8 @@ def raw__nsw_doe_datahub__master_dataset():
print(df.shape)
print(df.dtypes)

df = df.head(100) # 🚧 TODO - temp fix to skip errors with malformed csv

# schema = pa.infer_schema(df)
# schema_script = schema.to_script('schema_template.py')
# print(schema_script)
Expand All @@ -69,8 +71,6 @@ def raw__nsw_doe_datahub__master_dataset():

DatahubRamDagsterType = pandera_schema_to_dagster_type(
schema=schema_ram,
name="DatahubRamDagsterType",
description="data frame DagsterType type for this dummy asset.",
)


Expand Down
11 changes: 0 additions & 11 deletions orchestration/pipeline_nsw_doe/assets/raw/schema_masterdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,17 +486,6 @@
description=None,
title=None,
),
"Healthy canteen": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=True,
regex=False,
description=None,
title=None,
),
"FOEI_Value": Column(
dtype="object",
checks=None,
Expand Down
33 changes: 0 additions & 33 deletions orchestration/pipeline_nsw_doe/factory.py

This file was deleted.

1 change: 1 addition & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dagster-cloud
# data schema validation
pandera
pandera[io]
dagster-pandera

# required for pandas to read excel for acara data
openpyxl
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ dagster==1.7.6
# dagster-graphql
# dagster-msteams
# dagster-openai
# dagster-pandera
# dagster-webserver
# dagstermill
dagster-cloud==1.7.6
Expand All @@ -135,6 +136,7 @@ dagster-graphql==1.7.6
# via dagster-webserver
dagster-msteams==0.23.6
dagster-openai==0.23.6
dagster-pandera==0.23.6
dagster-pipes==1.7.6
# via dagster
dagster-webserver==1.7.6
Expand Down Expand Up @@ -548,6 +550,7 @@ pandas==1.5.3
# via
# cmdstanpy
# dagster-duckdb-pandas
# dagster-pandera
# metricflow
# pandera
# phik
Expand All @@ -558,6 +561,7 @@ pandas==1.5.3
# visions
# ydata-profiling
pandera==0.19.3
# via dagster-pandera
pandocfilters==1.5.1
# via nbconvert
papermill==2.6.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ final as (
--Foreign Keys
----Conformed Dimensions
{{ get_keyed_nulls('dim__school._meta__dim__school__sk') }} as _meta__dim__school__sk,
prep__resource_allocation.year || '-01-01' as _meta__dim__date__sk, -- dont love this. 🚧 TODO - if only one date in fact this works...also doesnt force
prep__resource_allocation.year || '-01-01' as _meta__dim__date__sk, -- dont love this. 🚧 TODO - if only one date in fact this works...also doesnt force

----Local Dimensions

Expand All @@ -37,7 +37,7 @@ final as (
from prep__resource_allocation
left join
dim__school
on prep__resource_allocation.school_code = dim__school.school_code
on cast(prep__resource_allocation.school_code as varchar) = cast(dim__school.school_code as varchar)
{# left join dim__date on prep__resource_allocation.year || '-01-01' = dim__date. #}
)

Expand Down
Loading