wisemuffin · wisemuffin · Jun 7, 2024 · Jun 7, 2024
diff --git a/README.md b/README.md
@@ -15,6 +15,10 @@ This is an data-stack-in-a-box based data from [NSW Education Data Hub](https://
 > [!IMPORTANT]
 > Click below 👇🏼 to setup your own free data stack packed with [NSW Department of Education](https://education.nsw.gov.au/) data.
 
+> [!WARNING]
+> Only 100 schools will appear. There is currently a bug where the master dataset is malformed, so not all schools can be loaded.
+> Once this is fixed, remove the `df = df.head(100)` filter in the master dataset asset.
+
 [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/wisemuffin/nsw-doe-data-stack-in-a-box?quickstart=1)
 
 

diff --git a/orchestration/pipeline_nsw_doe/assets/raw/assets.py b/orchestration/pipeline_nsw_doe/assets/raw/assets.py
@@ -9,15 +9,14 @@
     asset,
 )
 
-from pipeline_nsw_doe.factory import pandera_schema_to_dagster_type
+from dagster_pandera import pandera_schema_to_dagster_type
+# from pipeline_nsw_doe.factory import pandera_schema_to_dagster_type
 
 from .schema_masterdataset import schema as schema_masterdataset
 from .schema_ram import schema as schema_ram
 
 DatahubMasterDatasetDagsterType = pandera_schema_to_dagster_type(
-    schema=schema_masterdataset,
-    name="DatahubMasterDatasetDagsterType",
-    description="data frame DagsterType type for this dummy asset.",
+    schema=schema_masterdataset
 )
 
 NSW_DOE_DATA_STACK_IN_A_BOX_TARGET_SCHEMA: str = os.getenv(
@@ -47,6 +46,7 @@ def raw__nsw_doe_datahub__master_dataset():
     url = "https://data.cese.nsw.gov.au/data/dataset/027493b2-33ad-3f5b-8ed9-37cdca2b8650/resource/2ac19870-44f6-443d-a0c3-4c867f04c305/download/master_dataset.csv"
     df = pd.read_csv(
         url,
+        on_bad_lines="skip",  # 🚧 TODO Temp workaround due to malformed csv
     )
 
     df["_load_timestamp"] = pd.Timestamp("now")
@@ -56,6 +56,8 @@ def raw__nsw_doe_datahub__master_dataset():
     print(df.shape)
     print(df.dtypes)
 
+    df = df.head(100)  # 🚧 TODO - temp fix to skip errors with malformed csv
+
     # schema = pa.infer_schema(df)
     # schema_script = schema.to_script('schema_template.py')
     # print(schema_script)
@@ -69,8 +71,6 @@ def raw__nsw_doe_datahub__master_dataset():
 
 DatahubRamDagsterType = pandera_schema_to_dagster_type(
     schema=schema_ram,
-    name="DatahubRamDagsterType",
-    description="data frame DagsterType type for this dummy asset.",
 )
 
 

diff --git a/orchestration/pipeline_nsw_doe/assets/raw/schema_masterdataset.py b/orchestration/pipeline_nsw_doe/assets/raw/schema_masterdataset.py
@@ -486,17 +486,6 @@
             description=None,
             title=None,
         ),
-        "Healthy canteen": Column(
-            dtype="object",
-            checks=None,
-            nullable=True,
-            unique=False,
-            coerce=False,
-            required=True,
-            regex=False,
-            description=None,
-            title=None,
-        ),
         "FOEI_Value": Column(
             dtype="object",
             checks=None,

diff --git a/orchestration/pipeline_nsw_doe/factory.py b/orchestration/pipeline_nsw_doe/factory.py
diff --git a/requirements.in b/requirements.in
@@ -17,6 +17,7 @@ dagster-cloud
 # data schema validation
 pandera
 pandera[io]
+dagster-pandera
 
 # required for pandas to read excel for acara data
 openpyxl

diff --git a/requirements.txt b/requirements.txt
@@ -121,6 +121,7 @@ dagster==1.7.6
     #   dagster-graphql
     #   dagster-msteams
     #   dagster-openai
+    #   dagster-pandera
     #   dagster-webserver
     #   dagstermill
 dagster-cloud==1.7.6
@@ -135,6 +136,7 @@ dagster-graphql==1.7.6
     # via dagster-webserver
 dagster-msteams==0.23.6
 dagster-openai==0.23.6
+dagster-pandera==0.23.6
 dagster-pipes==1.7.6
     # via dagster
 dagster-webserver==1.7.6
@@ -548,6 +550,7 @@ pandas==1.5.3
     # via
     #   cmdstanpy
     #   dagster-duckdb-pandas
+    #   dagster-pandera
     #   metricflow
     #   pandera
     #   phik
@@ -558,6 +561,7 @@ pandas==1.5.3
     #   visions
     #   ydata-profiling
 pandera==0.19.3
+    # via dagster-pandera
 pandocfilters==1.5.1
     # via nbconvert
 papermill==2.6.0

diff --git a/transformation/transformation_nsw_doe/models/dimensional/facts/fct__resource_allocation.sql b/transformation/transformation_nsw_doe/models/dimensional/facts/fct__resource_allocation.sql
@@ -19,7 +19,7 @@ final as (
         --Foreign Keys
         ----Conformed Dimensions
         {{ get_keyed_nulls('dim__school._meta__dim__school__sk') }} as _meta__dim__school__sk,
-        prep__resource_allocation.year || '-01-01' as _meta__dim__date__sk, -- dont love this. 🚧 TODO - if only one date in fact this works...also doesnt force 
+        prep__resource_allocation.year || '-01-01' as _meta__dim__date__sk, -- dont love this. 🚧 TODO - if only one date in fact this works...also doesnt force
 
         ----Local Dimensions
 
@@ -37,7 +37,7 @@ final as (
     from prep__resource_allocation
     left join
         dim__school
-        on prep__resource_allocation.school_code = dim__school.school_code
+        on cast(prep__resource_allocation.school_code as varchar) = cast(dim__school.school_code as varchar)
     {# left join dim__date on prep__resource_allocation.year || '-01-01' = dim__date. #}
 )