[ETL-289] Remove deleted and duplicated samples #41

Merged · 2 commits · Apr 17, 2023
66 changes: 63 additions & 3 deletions src/glue/jobs/json_to_parquet.py
@@ -33,7 +33,6 @@
     "fitbitsleeplogs": ["LogId"],
     "healthkitv2characteristics": ["HealthKitCharacteristicKey"],
     "healthkitv2samples": ["HealthKitSampleKey"],
-    "healthkitv2samples_deleted": ["HealthKitSampleKey"],
     "healthkitv2heartbeat": ["HealthKitHeartbeatSampleKey"],
     "healthkitv2statistics": ["HealthKitStatisticKey"],
     "healthkitv2clinicalrecords": ["HealthKitClinicalRecordKey"],
@@ -101,7 +100,8 @@ def get_table(
         after the duplicates have been dropped by descending
         export date.
     """
-    _, table_data_type = table_name.split("_")
+    table_name_components = table_name.split("_")
+    table_data_type = table_name_components[1]
     table = (
         glue_context.create_dynamic_frame.from_catalog(
             database=database_name,
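
A note on the hunk above: get_table is now also called with *_deleted table names (see drop_deleted_healthkit_data below), and those names carry a third underscore-separated component, which breaks two-element tuple unpacking. A minimal illustration, not from the PR; the table names are hypothetical, and the real prefix is whatever the Glue catalog uses:

old_style = "dataset_healthkitv2samples"              # hypothetical name
deleted_style = "dataset_healthkitv2samples_deleted"  # hypothetical name

# Two-element unpacking requires split() to yield exactly two parts:
_, table_data_type = old_style.split("_")     # fine: "healthkitv2samples"

try:
    _, table_data_type = deleted_style.split("_")  # three parts
except ValueError as error:
    print(error)  # too many values to unpack (expected 2)

# Indexing tolerates trailing components such as "_deleted":
print(deleted_style.split("_")[1])            # healthkitv2samples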
@@ -128,12 +128,66 @@
         )
     return table
 
+def drop_deleted_healthkit_data(
+    glue_context: GlueContext,
+    table: DynamicFrame,
+    glue_database: str
+) -> DynamicFrame:
+    """
+    Drop deleted records from a HealthKit table.
+
+    This function attempts to fetch the respective *_deleted table
+    for a table containing HealthKit data. If no *_deleted table is found,
+    the HealthKit table is returned unmodified. Otherwise, records whose
+    index appears in the *_deleted table are dropped from the HealthKit
+    table, using the index field specified in `INDEX_FIELD_MAP`.
+
+    Args:
+        glue_context (GlueContext): The glue context
+        table (DynamicFrame): A DynamicFrame containing HealthKit data
+        glue_database (str): The name of the Glue database containing
+            the *_deleted table
+
+    Returns:
+        DynamicFrame: A DynamicFrame with the respective *_deleted table's
+        samples removed.
+    """
+    glue_client = boto3.client("glue")
+    deleted_table_name = f"{table.name}_deleted"
+    table_data_type = table.name.split("_")[1]
+    try:
+        glue_client.get_table(
+            DatabaseName=glue_database,
+            Name=deleted_table_name
+        )
+    except glue_client.exceptions.EntityNotFoundException:
+        return table
+    deleted_table = get_table(
+        table_name=deleted_table_name,
+        database_name=glue_database,
+        glue_context=glue_context
+    )
+    table_df = table.toDF()
+    deleted_table_df = deleted_table.toDF()
+    table_with_deleted_samples_removed = DynamicFrame.fromDF(
+        dataframe=(
+            table_df.join(
+                other=deleted_table_df,
+                on=INDEX_FIELD_MAP[table_data_type],
+                how="left_anti"
+            )
+        ),
+        glue_ctx=glue_context,
+        name=table.name
+    )
+    return table_with_deleted_samples_removed
+
 def write_table_to_s3(
     dynamic_frame: DynamicFrame,
     bucket: str,
     key: str,
     glue_context: GlueContext,
-    records_per_partition: int = 1e6
+    records_per_partition: int = int(1e6)
 ) -> None:
     """
     Write a DynamicFrame to S3 as a parquet dataset.
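
A note on the one-line default change in this hunk: `1e6` is a float literal, so the old default silently contradicted the `int` annotation. A quick check, with the practical concern stated as an assumption (the parameter name suggests the value ends up sizing Spark partitions):

print(type(1e6))       # <class 'float'>
print(type(int(1e6)))  # <class 'int'>
# Spark APIs that take a partition count, such as DataFrame.repartition(),
# expect an integer, so a float default would fail (or behave surprisingly)
# as soon as records_per_partition feeds into partition arithmetic.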
@@ -269,6 +323,12 @@ def main() -> None:
         glue_context=glue_context
     )
     table_schema = table.schema()
+    if "healthkit" in table_name:
+        table = drop_deleted_healthkit_data(
+            glue_context=glue_context,
+            table=table,
+            glue_database=workflow_run_properties["glue_database"]
+        )
 
     # Export new table records to parquet
     if has_nested_fields(table_schema) and table.count() > 0:
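The core of the new function is the `how="left_anti"` join. Here is a self-contained sketch of its semantics on toy data; only the `HealthKitSampleKey` column name comes from `INDEX_FIELD_MAP`, the rows are invented, and it runs against a local PySpark installation rather than Glue:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

samples = spark.createDataFrame(
    [("key-1", 61.0), ("key-2", 58.0), ("key-3", 64.5)],
    ["HealthKitSampleKey", "Value"],
)
deleted = spark.createDataFrame(
    [("key-2",)],
    ["HealthKitSampleKey"],
)

# A left anti join keeps only the rows of `samples` whose key has no match
# in `deleted`; drop_deleted_healthkit_data applies the same operation
# before the parquet export.
remaining = samples.join(deleted, on="HealthKitSampleKey", how="left_anti")
remaining.show()  # key-1 and key-3 remain; key-2 has been dropped

Note that `main()` only routes tables through this cleanup when the table name contains "healthkit", matching the HealthKit-specific index fields in `INDEX_FIELD_MAP`.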
8 changes: 4 additions & 4 deletions templates/glue-job-role.yaml
@@ -48,7 +48,7 @@ Resources:
                   - iam:GetRolePolicy
                 Resource:
                   - "*"
-        - PolicyName: ReadWriteInternalS3
+        - PolicyName: ReadWriteS3
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
@@ -65,9 +65,7 @@
                   - !Sub arn:aws:s3:::${S3IntermediateBucketName}/*
                   - !Sub arn:aws:s3:::${S3ParquetBucketName}
                   - !Sub arn:aws:s3:::${S3ParquetBucketName}/*
-                  - !Sub arn:aws:s3:::${S3ArtifactBucketName}
-                  - !Sub arn:aws:s3:::${S3ArtifactBucketName}/*
-        - PolicyName: ReadExternalS3
+        - PolicyName: ReadS3
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
@@ -78,6 +76,8 @@
                 Resource:
                   - !Sub arn:aws:s3:::${S3SourceBucketName}
                   - !Sub arn:aws:s3:::${S3SourceBucketName}/*
+                  - !Sub arn:aws:s3:::${S3ArtifactBucketName}
+                  - !Sub arn:aws:s3:::${S3ArtifactBucketName}/*
         - PolicyName: EC2
          PolicyDocument:
            Version: '2012-10-17'
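The policy reshuffle above renames ReadWriteInternalS3 and ReadExternalS3 to ReadWriteS3 and ReadS3, and moves the artifact bucket from the read-write policy to the read-only one. A sketch, not part of the PR, of how the tightened access could be verified with IAM's policy simulator; the role ARN and bucket name are placeholders, and it assumes ReadS3 grants object reads as its name suggests:

import boto3

iam = boto3.client("iam")
response = iam.simulate_principal_policy(
    # Placeholder ARN; substitute the deployed Glue job role.
    PolicySourceArn="arn:aws:iam::123456789012:role/glue-job-role",
    ActionNames=["s3:GetObject", "s3:PutObject"],
    # Placeholder bucket standing in for ${S3ArtifactBucketName}.
    ResourceArns=["arn:aws:s3:::example-artifact-bucket/scripts/json_to_parquet.py"],
)
for result in response["EvaluationResults"]:
    # Expect "allowed" for s3:GetObject and "implicitDeny" for s3:PutObject
    # now that the artifact bucket sits under the read-only policy.
    print(result["EvalActionName"], result["EvalDecision"])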
2 changes: 1 addition & 1 deletion templates/glue-workflow.j2
@@ -76,7 +76,7 @@ Resources:
     Properties:
       Actions:
       {% for dataset in datasets %}
-        - JobName: {{ dataset["stackname_prefix"]}}-Job
+        - JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
          Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
       {% endfor %}
     Description: This trigger runs after completion of the S3 to JSON job.
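The template fix prefixes each trigger's JobName with the CloudFormation Namespace parameter, presumably so the trigger resolves to the namespaced Glue job of the same deployment. A quick render of the amended line; the dataset entry is invented for the example, since real values come from the datasets config:

from jinja2 import Template

line = Template(
    '      - JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"] }}-Job'
)
# Hypothetical dataset entry standing in for one element of `datasets`.
print(line.render(dataset={"stackname_prefix": "healthkitv2samples"}))
# Output: "      - JobName: !Sub ${Namespace}-healthkitv2samples-Job"
# Jinja2 leaves ${Namespace} untouched, so the reference survives template
# expansion and CloudFormation's !Sub fills in Namespace at deploy time.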