From 33b12551d75a47d0db4f446115850c23adaee655 Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Fri, 14 Apr 2023 10:05:17 -0700 Subject: [PATCH 01/18] add initial compare parquet datasets pythob job code, and tests --- Pipfile | 2 + src/glue/jobs/compare_parquet_datasets.py | 360 ++++++++++++++++++++++ tests/conftest.py | 227 ++++++++++++++ tests/test_compare_parquet_datasets.py | 355 +++++++++++++++++++++ 4 files changed, 944 insertions(+) create mode 100644 src/glue/jobs/compare_parquet_datasets.py create mode 100644 tests/conftest.py create mode 100644 tests/test_compare_parquet_datasets.py diff --git a/Pipfile b/Pipfile index 627ee0eb..3416a50b 100644 --- a/Pipfile +++ b/Pipfile @@ -13,3 +13,5 @@ pre-commit = "*" sceptre = ">=3.2.0" sceptre-sam-handler = "*" synapseclient = "~=2.7" +pandas = "*" +moto = "*" diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py new file mode 100644 index 00000000..e4f42ced --- /dev/null +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -0,0 +1,360 @@ +import os +import logging +import argparse + +import boto3 +import pandas as pd +import synapseclient +from pyarrow import fs +import pyarrow.parquet as pq + + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +INDEX_FIELD_MAP = { + "dataset_enrolledparticipants": ["ParticipantIdentifier"], + "dataset_fitbitprofiles": ["ParticipantIdentifier", "ModifiedDate"], + "dataset_fitbitdevices": ["ParticipantIdentifier", "Date"], + "dataset_fitbitactivitylogs": ["LogId"], + "dataset_fitbitdailydata": ["ParticipantIdentifier", "Date"], + "dataset_fitbitintradaycombined": ["ParticipantIdentifier", "Type", "DateTime"], + "dataset_fitbitrestingheartrates": ["ParticipantIdentifier", "Date"], + "dataset_fitbitsleeplogs": ["LogId"], + "dataset_healthkitv2characteristics": ["HealthKitCharacteristicKey"], + "dataset_healthkitv2samples": ["HealthKitSampleKey"], + "dataset_healthkitv2samples_deleted": ["HealthKitSampleKey"], + "dataset_healthkitv2heartbeat": ["HealthKitHeartbeatSampleKey"], + "dataset_healthkitv2statistics": ["HealthKitStatisticKey"], + "dataset_healthkitv2clinicalrecords": ["HealthKitClinicalRecordKey"], + "dataset_healthkitv2electrocardiogram": ["HealthKitECGSampleKey"], + "dataset_healthkitv2workouts": ["HealthKitWorkoutKey"], + "dataset_healthkitv2activitysummaries": ["HealthKitActivitySummaryKey"], + "dataset_googlefitsamples": ["GoogleFitSampleKey"], + "dataset_symptomlog": ["DataPointKey"], +} + + +def read_args(): + parser = argparse.ArgumentParser( + description=( + "Compare parquet datasets between two namespaced S3 bucket locations" + ) + ) + parser.add_argument( + "--staging-namespace", + required=True, + help="The name of the staging namespace to use", + ) + parser.add_argument( + "--main-namespace", + required=True, + help=("The name of the main namespace to use"), + ) + parser.add_argument( + "--parquet-bucket", + required=True, + help=("The name of the S3 bucket containing the S3 files to compare"), + ) + args = parser.parse_args() + return args + + +def get_duplicated_index_fields(data_type: str, dataset: pd.DataFrame) -> pd.DataFrame: + index_cols = INDEX_FIELD_MAP[data_type] + return dataset[dataset.duplicated(subset=index_cols)] + + +def get_duplicated_columns(dataset: pd.DataFrame) -> list: + return dataset.columns[dataset.columns.duplicated()].tolist() + + +def get_common_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: + common_cols = staging_dataset.columns.intersection(main_dataset.columns).tolist() + 
return common_cols
+
+
+def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list:
+    missing_cols = main_dataset.columns.difference(staging_dataset.columns).tolist()
+    return missing_cols
+
+
+def get_additional_cols(
+    staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame
+) -> list:
+    add_cols = staging_dataset.columns.difference(main_dataset.columns).tolist()
+    return add_cols
+
+
+def get_S3FileSystem_from_session(
+    aws_session: boto3.session.Session,
+) -> fs.S3FileSystem:
+    session_credentials = aws_session.get_credentials()
+    s3_fs = fs.S3FileSystem(
+        access_key=session_credentials.access_key,
+        secret_key=session_credentials.secret_key,
+        session_token=session_credentials.token,
+    )
+    return s3_fs
+
+
+def get_parquet_dataset(
+    dataset_key: str, s3_filesystem: fs.S3FileSystem
+) -> pd.DataFrame:
+    """
+    Returns a Parquet dataset on S3 as a pandas dataframe
+
+    Args:
+        dataset_key (str): The URI of the parquet dataset.
+        s3_filesystem (S3FileSystem): A fs.S3FileSystem object
+
+    Returns:
+        pandas.DataFrame
+    """
+    table_source = dataset_key.split("s3://")[-1]
+    parquet_dataset = pq.read_table(source=table_source, filesystem=s3_filesystem)
+    return parquet_dataset.to_pandas()
+
+
+def get_folders_in_s3_bucket(
+    s3: boto3.client, bucket_name: str, namespace: str
+) -> list:
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=f"{namespace}/parquet/")
+    if "Contents" in response.keys():
+        contents = response["Contents"]
+        folders = [
+            content["Key"].split("/")[-1]
+            for content in contents
+            if content["Key"].split("/")[-1] != "owner.txt"
+        ]
+    else:
+        folders = []
+    return folders
+
+
+def keep_common_rows_cols(
+    data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame
+) -> dict:
+    index_cols = INDEX_FIELD_MAP[data_type]
+    common_cols = get_common_cols(staging_dataset, main_dataset)
+    # convert to having same columns
+    staging_dataset_subset = staging_dataset[common_cols].add_suffix("_staging")
+    main_dataset_subset = main_dataset[common_cols].add_suffix("_main")
+
+    # merging on index to get rid of extra rows
+    merged_dataset = staging_dataset_subset.merge(
+        main_dataset_subset,
+        left_on=[f"{col}_staging" for col in index_cols],
+        right_on=[f"{col}_main" for col in index_cols],
+        how="inner",
+    )
+    staging_dataset_common = merged_dataset[staging_dataset_subset.columns]
+    main_dataset_common = merged_dataset[main_dataset_subset.columns]
+
+    staging_dataset_common.columns = staging_dataset_common.columns.str.removesuffix(
+        "_staging"
+    )
+    main_dataset_common.columns = main_dataset_common.columns.str.removesuffix("_main")
+    return {"staging": staging_dataset_common, "main": main_dataset_common}
+
+
+def compare_column_data_types(
+    data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame
+) -> list:
+    compare_msg = []
+    common_cols = get_common_cols(staging_dataset, main_dataset)
+    for common_col in common_cols:
+        if staging_dataset[common_col].dtype != main_dataset[common_col].dtype:
+            compare_msg.append(
+                (
+                    f"{data_type}: Staging dataset's {common_col} has data type {staging_dataset[common_col].dtype}.\n"
+                    f"Main dataset's {common_col} has data type {main_dataset[common_col].dtype}."
+ ) + ) + return compare_msg + + +def compare_column_names( + data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame +) -> list: + compare_msg = [] + missing_cols = get_missing_cols(staging_dataset, main_dataset) + add_cols = get_additional_cols(staging_dataset, main_dataset) + if missing_cols: + compare_msg.append( + f"{data_type}: Staging dataset has the following missing columns:\n{str(missing_cols)}" + ) + if add_cols: + compare_msg.append( + f"{data_type}: Staging dataset has the following additional columns:\n{str(add_cols)}" + ) + return compare_msg + + +def compare_column_vals( + data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame +) -> list: + compare_msg = [] + dataset_dict = keep_common_rows_cols(data_type, staging_dataset, main_dataset) + dataset_diff = dataset_dict["staging"].compare( + other=dataset_dict["main"], align_axis="columns", keep_shape=True + ) + dataset_diff_cnt = dataset_diff.isna().sum() + dataset_diff_cnt = dataset_diff_cnt[dataset_diff_cnt == 0].to_dict() + if dataset_diff_cnt: + compare_msg.append( + f"{data_type}: Staging dataset has column(s) with value differences with the main dataset:\n" + f"{str(list(dataset_diff_cnt.keys()))}" + ) + return compare_msg + + +def compare_dataset_data_types( + s3: boto3.client, bucket_name: str, staging_namespace: str, main_namespace: str +) -> list: + compare_msg = [] + staging_datatype_folders = get_folders_in_s3_bucket( + s3, bucket_name, namespace=staging_namespace + ) + main_datatype_folders = get_folders_in_s3_bucket( + s3, bucket_name, namespace=main_namespace + ) + missing_datatypes = list(set(main_datatype_folders) - set(staging_datatype_folders)) + add_datatypes = list(set(staging_datatype_folders) - set(main_datatype_folders)) + + if missing_datatypes: + compare_msg.append( + f"Staging dataset has the following missing data types: {str(missing_datatypes)}" + ) + + if add_datatypes: + compare_msg.append( + f"Staging dataset has the following additional data types: {str(add_datatypes)}" + ) + return compare_msg + + +def compare_num_of_rows( + data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame +) -> list: + compare_msg = [] + if staging_dataset.shape[0] != main_dataset.shape[0]: + compare_msg.append( + f"{data_type}: Staging dataset has {staging_dataset.shape[0]} rows of data.\n" + f"Main dataset has {main_dataset.shape[0]} rows of data." + ) + return compare_msg + + +def compare_dataset_row_vals( + data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame +) -> list: + compare_msg = [] + dataset_dict = keep_common_rows_cols(data_type, staging_dataset, main_dataset) + dataset_diff = dataset_dict["staging"].compare( + other=dataset_dict["main"], align_axis="columns", keep_equal=False + ) + if not dataset_diff.empty: + compare_msg.append( + f"{data_type}: Staging dataset has value difference(s) with the main dataset." 
+ f"Here is an example:\n{dataset_diff.head(1)}" + ) + return compare_msg + + +def get_data_types_to_compare( + s3: boto3.client, bucket_name: str, staging_namespace: str, main_namespace: str +) -> list: + staging_datatype_folders = get_folders_in_s3_bucket( + s3, bucket_name, namespace=staging_namespace + ) + main_datatype_folders = get_folders_in_s3_bucket( + s3, bucket_name, namespace=main_namespace + ) + return list(set(staging_datatype_folders + main_datatype_folders)) + + +def print_comparison_result(comparison_result: dict) -> None: + for msg in comparison_result: + logger.info(comparison_result[msg]) + logger.info("Comparison results complete!") + + +def compare_datasets_by_data_type( + args, + s3_filesystem: fs.S3FileSystem, + data_type: str, + comparison_result: dict, +) -> dict: + data_type = "dataset_fitbitactivitylogs" + staging_dataset = get_parquet_dataset( + dataset_key=f"s3://{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/", + s3_filesystem=s3_filesystem, + ) + + main_dataset = get_parquet_dataset( + dataset_key=f"s3://{args.parquet_bucket}/{args.main_namespace}/parquet/{data_type}/", + s3_filesystem=s3_filesystem, + ) + + if staging_dataset.empty or main_dataset.empty: + comparison_result[ + data_type + ] = f"One of {args.staging_namespace} or {args.main_namespace} has no data. Comparison cannot continue." + else: + # check that the dataset has no dup cols or dup rows and that they have cols in common + comparison_result[data_type] = [] + # check if one or both of the datasets have no data + if staging_dataset.empty or main_dataset.empty: + comparison_result["empty"][ + data_type + ] = f"One of {args.staging_namespace} or {args.main_namespace} has no data. Comparison cannot continue." + else: + comparison_result[data_type].append( + compare_column_data_types(data_type, staging_dataset, main_dataset) + ) + comparison_result[data_type].append( + compare_column_names(data_type, staging_dataset, main_dataset) + ) + comparison_result[data_type].append( + compare_column_vals(data_type, staging_dataset, main_dataset) + ) + comparison_result[data_type].append( + compare_num_of_rows(data_type, staging_dataset, main_dataset) + ) + comparison_result[data_type].append( + compare_dataset_row_vals(data_type, staging_dataset, main_dataset) + ) + return comparison_result + + +def main(): + args = read_args() + comparison_result = {} + s3 = boto3.client("s3") + aws_session = boto3.session.Session(profile_name="default", region_name="us-east-1") + fs = get_S3FileSystem_from_session(aws_session) + + # check if one or both of the datasets have no data + comparison_result["missing_data_types"] = compare_dataset_data_types( + s3, + args.parquet_bucket, + main_namespace=args.main_namespace, + staging_namespace=args.staging_namespace, + ) + data_types_to_compare = get_data_types_to_compare( + s3, + args.parquet_bucket, + main_namespace=args.main_namespace, + staging_namespace=args.staging_namespace, + ) + for data_type in data_types_to_compare: + comparison_result = compare_datasets_by_data_type( + args, fs, data_type, comparison_result + ) + print_comparison_result(comparison_result) + return + + +if __name__ == "__main__": + main() diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..faa4ac4b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,227 @@ +import os +from unittest import mock + +import boto3 +import pytest +import pandas as pd +from pyarrow import fs +from moto import mock_s3 + + +@pytest.fixture(scope="function") +def 
mock_aws_credentials(): + """Mocked AWS Credentials for moto.""" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURITY_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + + +@pytest.fixture(scope="function") +def s3(mock_aws_credentials): + with mock_s3(): + yield boto3.client("s3", region_name="us-east-1") + + +@pytest.fixture() +def parquet_bucket_name(): + yield f"test-parquet-bucket" + + +@pytest.fixture +def s3_test_bucket(s3, parquet_bucket_name): + with mock_s3(): + s3.create_bucket(Bucket=parquet_bucket_name) + yield + + +@pytest.fixture(scope="function") +def mock_s3_filesystem(mock_aws_credentials): + with mock_s3(): + yield fs.S3FileSystem(region="us-east-1") + + +@pytest.fixture() +def valid_staging_dataset(): + yield pd.DataFrame( + { + "LogId": [ + "44984262767", + "46096730542", + "51739302864", + ], + "StartDate": [ + "2021-12-24T14:27:39+00:00", + "2022-02-18T08:26:54+00:00", + "2022-10-28T11:58:50+00:00", + ], + "EndDate": [ + "2021-12-24T14:40:27+00:00", + "2022-02-18T09:04:30+00:00", + "2022-10-28T12:35:38+00:00", + ], + "ActiveDuration": ["768000", "2256000", "2208000"], + "Calories": ["89", "473", "478"], + } + ) + + +@pytest.fixture() +def valid_main_dataset(): + yield pd.DataFrame( + { + "LogId": [ + "44984262767", + "46096730542", + "51739302864", + ], + "StartDate": [ + "2021-12-24T14:27:39+00:00", + "2022-02-18T08:26:54+00:00", + "2022-10-28T11:58:50+00:00", + ], + "EndDate": [ + "2021-12-24T14:40:27+00:00", + "2022-02-18T09:04:30+00:00", + "2022-10-28T12:35:38+00:00", + ], + "ActiveDuration": ["768000", "2256000", "2208000"], + "Calories": ["89", "473", "478"], + } + ) + + +@pytest.fixture() +def staging_dataset_with_missing_cols(): + yield pd.DataFrame( + { + "LogId": [ + "44984262767", + "46096730542", + "51739302864", + ], + "ActiveDuration": ["768000", "2256000", "2208000"], + "Calories": ["89", "473", "478"], + } + ) + + +@pytest.fixture() +def staging_dataset_with_add_cols(): + yield pd.DataFrame( + { + "LogId": [ + "44984262767", + "46096730542", + "51739302864", + ], + "StartDate": [ + "2021-12-24T14:27:39+00:00", + "2022-02-18T08:26:54+00:00", + "2022-10-28T11:58:50+00:00", + ], + "EndDate": [ + "2021-12-24T14:40:27+00:00", + "2022-02-18T09:04:30+00:00", + "2022-10-28T12:35:38+00:00", + ], + "ActiveDuration": ["768000", "2256000", "2208000"], + "Calories": ["89", "473", "478"], + "AverageHeartRate": ["108", "151", "157"], + } + ) + + +@pytest.fixture() +def staging_dataset_with_no_common_cols(): + yield pd.DataFrame( + { + "ParticipantIdentifier": [ + "MDH-9352-3209", + "MDH-9352-3209", + "MDH-9352-3209", + ], + "Steps": ["866", "6074", "5744"], + "OriginalDuration": ["768000", "2256000", "2208000"], + } + ) + + +@pytest.fixture() +def staging_dataset_with_diff_data_type_cols(): + yield pd.DataFrame( + { + "LogId": [ + "44984262767", + "46096730542", + "51739302864", + ], + "StartDate": [ + "2021-12-24T14:27:39+00:00", + "2022-02-18T08:26:54+00:00", + "2022-10-28T11:58:50+00:00", + ], + "EndDate": [ + "2021-12-24T14:40:27+00:00", + "2022-02-18T09:04:30+00:00", + "2022-10-28T12:35:38+00:00", + ], + "ActiveDuration": [768000, 2256000, 2208000], + "Calories": [89.0, 473.0, 478.0], + } + ) + + +@pytest.fixture() +def staging_dataset_with_diff_num_of_rows(): + yield pd.DataFrame( + { + "LogId": ["44984262767"], + "StartDate": ["2021-12-24T14:27:39+00:00"], + "EndDate": ["2021-12-24T14:40:27+00:00"], + "ActiveDuration": 
["768000"], + "Calories": ["89"], + } + ) + + +@pytest.fixture() +def staging_dataset_with_dup_cols(): + dup_df = pd.DataFrame( + { + "LogId": ["44984262767"], + "StartDate": ["2021-12-24T14:27:39+00:00"], + "EndDate": ["2021-12-24T14:40:27+00:00"], + "ActiveDuration": [768000], + "Calories": [89.0], + } + ) + dup_df = dup_df.rename({"StartDate": "EndDate"}, axis=1) + yield dup_df + + +@pytest.fixture() +def staging_dataset_with_dup_indexes(): + yield pd.DataFrame( + { + "LogId": ["44984262767", "44984262767"], + "StartDate": ["2021-12-24T14:27:39+00:00", "2021-12-24T14:27:39+00:00"], + "EndDate": ["2021-12-24T14:40:27+00:00", "2021-12-24T14:40:27+00:00"], + } + ) + + +@pytest.fixture() +def staging_dataset_with_all_col_val_diff(): + yield pd.DataFrame( + { + "LogId": ["44984262767", "44984262767"], + "StartDate": ["2021-12-24T14:27:39+00:00", "2021-12-24T14:27:39+00:00"], + "EndDate": ["TESTING1", "TESTING2"], + } + ) + +@pytest.fixture() +def empty_dataset(): + pass diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py new file mode 100644 index 00000000..408cd9cf --- /dev/null +++ b/tests/test_compare_parquet_datasets.py @@ -0,0 +1,355 @@ +from unittest import mock + +import pandas as pd +from pyarrow import fs +from moto import mock_s3 +from pandas.testing import assert_frame_equal + +from glue.jobs import compare_parquet_datasets as compare_parquet + + +def test_that_get_duplicated_index_fields_returns_empty_df_if_no_dup_exist( + valid_staging_dataset, +): + assert ( + compare_parquet.get_duplicated_index_fields( + "dataset_fitbitactivitylogs", valid_staging_dataset + ).empty + == True + ) + + +def test_that_get_duplicated_index_fields_returns_dup_df_if_dup_exist( + staging_dataset_with_dup_indexes, +): + assert_frame_equal( + compare_parquet.get_duplicated_index_fields( + "dataset_fitbitactivitylogs", staging_dataset_with_dup_indexes + ).reset_index(drop=True), + pd.DataFrame( + { + "LogId": ["44984262767"], + "StartDate": ["2021-12-24T14:27:39+00:00"], + "EndDate": ["2021-12-24T14:40:27+00:00"], + } + ).reset_index(drop=True), + ) + + +def test_that_get_duplicated_columns_returns_empty_if_no_dup_exist( + valid_staging_dataset, +): + assert compare_parquet.get_duplicated_columns(valid_staging_dataset) == [] + + +def test_that_get_duplicated_columns_returns_list_if_dup_exist( + staging_dataset_with_dup_cols, +): + assert compare_parquet.get_duplicated_columns(staging_dataset_with_dup_cols) == [ + "EndDate" + ] + + +@mock_s3 +def test_that_get_parquet_dataset_returns_empty_if_no_datasets_exist( + s3, mock_s3_filesystem, valid_staging_dataset, parquet_bucket_name +): + data = valid_staging_dataset.to_parquet() + s3.create_bucket(Bucket=parquet_bucket_name) + s3.put_object( + Bucket=parquet_bucket_name, + Key="staging/parquet/dataset_fitbitactivitylogs/test.parquet", + Body=data, + ) + + file_key = "staging/parquet/dataset_fitbitactivitylogs/test.parquet" + parquet_dataset = compare_parquet.get_parquet_dataset( + dataset_key=f"{parquet_bucket_name}/{file_key}", + s3_filesystem=mock_s3_filesystem, + ) + assert parquet_dataset == None + + +@mock_s3 +def test_that_get_parquet_dataset_returns_dataset_if_datasets_exist( + s3, mock_s3_filesystem, valid_staging_dataset, parquet_bucket_name +): + pass + + +@mock_s3 +def test_that_get_folders_in_s3_bucket_returns_empty_list_if_no_folders( + s3, parquet_bucket_name +): + s3.create_bucket(Bucket=parquet_bucket_name) + result = compare_parquet.get_folders_in_s3_bucket( + s3, bucket_name=parquet_bucket_name, 
namespace="staging" + ) + assert result == [] + + +@mock_s3 +def test_that_get_folders_in_s3_bucket_returns_list_if_folder_exists( + s3, parquet_bucket_name +): + s3.create_bucket(Bucket=parquet_bucket_name) + s3.put_object( + Bucket=parquet_bucket_name, Key="staging/parquet/dataset_fitbitactivitylogs" + ) + result = compare_parquet.get_folders_in_s3_bucket( + s3, bucket_name=parquet_bucket_name, namespace="staging" + ) + assert result == ["dataset_fitbitactivitylogs"] + + +def test_that_keep_common_rows_cols_returns_same_df_when_both_df_are_the_same( + valid_staging_dataset, valid_main_dataset +): + datasets = compare_parquet.keep_common_rows_cols( + "dataset_fitbitactivitylogs", valid_staging_dataset, valid_main_dataset + ) + assert_frame_equal(datasets["staging"], valid_staging_dataset) + assert_frame_equal(datasets["main"], valid_main_dataset) + + +def test_that_keep_common_rows_cols_returns_correct_df_when_staging_df_has_less_rows( + staging_dataset_with_diff_num_of_rows, valid_main_dataset +): + datasets = compare_parquet.keep_common_rows_cols( + "dataset_fitbitactivitylogs", + staging_dataset_with_diff_num_of_rows, + valid_main_dataset, + ) + assert_frame_equal(datasets["staging"], staging_dataset_with_diff_num_of_rows) + assert_frame_equal(datasets["main"], staging_dataset_with_diff_num_of_rows) + + +def test_that_keep_common_rows_cols_returns_correct_df_when_staging_df_has_more_col( + staging_dataset_with_add_cols, valid_main_dataset +): + datasets = compare_parquet.keep_common_rows_cols( + "dataset_fitbitactivitylogs", staging_dataset_with_add_cols, valid_main_dataset + ) + assert_frame_equal(datasets["staging"], valid_main_dataset) + assert_frame_equal(datasets["main"], valid_main_dataset) + + +def test_that_get_common_cols_returns_empty_list_if_no_common_cols( + staging_dataset_with_no_common_cols, valid_main_dataset +): + test_common_cols = compare_parquet.get_common_cols( + staging_dataset_with_no_common_cols, valid_main_dataset + ) + assert test_common_cols == [] + + +def test_that_get_common_cols_returns_list_of_cols_if_common_cols( + valid_staging_dataset, valid_main_dataset +): + test_common_cols = compare_parquet.get_common_cols( + valid_staging_dataset, valid_main_dataset + ) + assert test_common_cols == [ + "LogId", + "StartDate", + "EndDate", + "ActiveDuration", + "Calories", + ] + + +def test_that_get_missing_cols_returns_empty_list_if_no_missing_cols( + valid_staging_dataset, valid_main_dataset +): + test_missing_cols = compare_parquet.get_missing_cols( + valid_staging_dataset, valid_main_dataset + ) + assert test_missing_cols == [] + + +def test_that_get_missing_cols_returns_list_of_cols_if_missing_cols( + staging_dataset_with_missing_cols, valid_main_dataset +): + test_missing_cols = compare_parquet.get_missing_cols( + staging_dataset_with_missing_cols, valid_main_dataset + ) + assert test_missing_cols == ["EndDate", "StartDate"] + + +def test_that_get_additional_cols_returns_empty_list_if_no_add_cols( + valid_staging_dataset, valid_main_dataset +): + test_add_cols = compare_parquet.get_additional_cols( + valid_staging_dataset, valid_main_dataset + ) + assert test_add_cols == [] + + +def test_that_get_additional_cols_returns_list_of_cols_if_add_cols( + staging_dataset_with_add_cols, valid_main_dataset +): + test_add_cols = compare_parquet.get_additional_cols( + staging_dataset_with_add_cols, valid_main_dataset + ) + assert test_add_cols == ["AverageHeartRate"] + + +def test_that_compare_column_data_types_returns_empty_msg_if_no_common_cols( + 
staging_dataset_with_no_common_cols, valid_main_dataset +): + compare_msg = compare_parquet.compare_column_data_types( + "dataset_fitbitactivitylogs", + staging_dataset_with_no_common_cols, + valid_main_dataset, + ) + assert compare_msg == [] + + +def test_that_compare_column_data_types_returns_msg_if_diff_data_types( + staging_dataset_with_diff_data_type_cols, valid_main_dataset +): + compare_msg = compare_parquet.compare_column_data_types( + "dataset_fitbitactivitylogs", + staging_dataset_with_diff_data_type_cols, + valid_main_dataset, + ) + + assert compare_msg == [ + "dataset_fitbitactivitylogs: Staging dataset's ActiveDuration has data type int64.\n" + "Main dataset's ActiveDuration has data type object.", + "dataset_fitbitactivitylogs: Staging dataset's Calories has data type float64.\n" + "Main dataset's Calories has data type object.", + ] + + +def test_that_compare_column_names_returns_empty_msg_if_cols_are_same( + valid_staging_dataset, valid_main_dataset +): + compare_msg = compare_parquet.compare_column_names( + "dataset_fitbitactivitylogs", valid_staging_dataset, valid_main_dataset + ) + assert compare_msg == [] + + +def test_that_compare_column_names_returns_msg_if_cols_are_diff( + staging_dataset_with_no_common_cols, valid_main_dataset +): + compare_msg = compare_parquet.compare_column_names( + "dataset_fitbitactivitylogs", + staging_dataset_with_no_common_cols, + valid_main_dataset, + ) + + assert compare_msg == [ + "dataset_fitbitactivitylogs: Staging dataset has the following missing columns:\n" + "['ActiveDuration', 'Calories', 'EndDate', 'LogId', 'StartDate']", + "dataset_fitbitactivitylogs: Staging dataset has the following additional columns:\n" + "['OriginalDuration', 'ParticipantIdentifier', 'Steps']", + ] + + +def test_that_compare_column_vals_returns_empty_msg_if_no_col_val_diff( + valid_staging_dataset, valid_main_dataset +): + compare_msg = compare_parquet.compare_column_vals( + "dataset_fitbitactivitylogs", valid_staging_dataset, valid_main_dataset + ) + assert compare_msg == [] + + +def test_that_compare_column_vals_returns_msg_if_all_col_val_are_diff( + staging_dataset_with_all_col_val_diff, valid_main_dataset +): + compare_msg = compare_parquet.compare_column_vals( + "dataset_fitbitactivitylogs", + staging_dataset_with_all_col_val_diff, + valid_main_dataset, + ) + assert compare_msg == [ + "dataset_fitbitactivitylogs: Staging dataset has column(s) with value differences with the main dataset:\n" + "[('EndDate', 'self'), ('EndDate', 'other')]" + ] + + +@mock_s3 +def test_that_compare_dataset_data_types_returns_empty_msg_if_datatypes_are_equal( + s3, parquet_bucket_name +): + s3.create_bucket(Bucket=parquet_bucket_name) + for namespace in ["staging", "main"]: + s3.put_object( + Bucket=parquet_bucket_name, + Key=f"{namespace}/parquet/dataset_fitbitactivitylogs", + ) + compare_msg = compare_parquet.compare_dataset_data_types( + s3, parquet_bucket_name, staging_namespace="staging", main_namespace="main" + ) + assert compare_msg == [] + + +@mock_s3 +def test_that_compare_dataset_data_types_returns_msg_if_datatypes_are_not_equal( + s3, parquet_bucket_name +): + s3.create_bucket(Bucket=parquet_bucket_name) + for datatype in ["dataset_fitbitactivitylogs", "dataset_fitbitintradaycombined"]: + s3.put_object(Bucket=parquet_bucket_name, Key=f"staging/parquet/{datatype}") + + for datatype in ["dataset_fitbitactivitylogs", "dataset_fitbitdevices"]: + s3.put_object(Bucket=parquet_bucket_name, Key=f"main/parquet/{datatype}") + + compare_msg = 
compare_parquet.compare_dataset_data_types( + s3, parquet_bucket_name, staging_namespace="staging", main_namespace="main" + ) + assert compare_msg == [ + "Staging dataset has the following missing data types: ['dataset_fitbitdevices']", + "Staging dataset has the following additional data types: ['dataset_fitbitintradaycombined']", + ] + + +def test_that_compare_num_of_rows_returns_empty_msg_if_num_of_rows_are_equal( + valid_staging_dataset, valid_main_dataset +): + compare_msg = compare_parquet.compare_num_of_rows( + "dataset_fitbitactivitylogs", + valid_staging_dataset, + valid_main_dataset, + ) + assert compare_msg == [] + + +def test_that_compare_num_of_rows_returns_msg_if_num_of_rows_are_diff( + staging_dataset_with_diff_num_of_rows, valid_main_dataset +): + compare_msg = compare_parquet.compare_num_of_rows( + "dataset_fitbitactivitylogs", + staging_dataset_with_diff_num_of_rows, + valid_main_dataset, + ) + + assert compare_msg == [ + "dataset_fitbitactivitylogs: Staging dataset has 1 rows of data.\n" + "Main dataset has 3 rows of data." + ] + + +def test_that_compare_dataset_row_vals_returns_empty_msg_if_no_diff( + valid_staging_dataset, valid_main_dataset +): + compare_msg = compare_parquet.compare_dataset_row_vals( + "dataset_fitbitactivitylogs", + valid_staging_dataset, + valid_main_dataset, + ) + assert compare_msg == [] + + +def test_that_compare_dataset_row_vals_returns_msg_if_diff( + staging_dataset_with_all_col_val_diff, valid_main_dataset +): + compare_msg = compare_parquet.compare_dataset_row_vals( + "dataset_fitbitactivitylogs", + staging_dataset_with_all_col_val_diff, + valid_main_dataset, + ) + assert compare_msg != [] From 7e9ed35b0f49a43d0658e45269f53f0792a69203 Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Fri, 14 Apr 2023 10:35:53 -0700 Subject: [PATCH 02/18] add docstring and comments --- src/glue/jobs/compare_parquet_datasets.py | 72 +++++++++++++++++++++-- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index e4f42ced..438e2d32 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -1,4 +1,5 @@ import os +import json import logging import argparse @@ -8,6 +9,7 @@ from pyarrow import fs import pyarrow.parquet as pq +# from json_to_parquet import INDEX_FIELD_MAP logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -61,20 +63,29 @@ def read_args(): def get_duplicated_index_fields(data_type: str, dataset: pd.DataFrame) -> pd.DataFrame: + """Gets the rows of data that are duplicated based on the index columns by data type + and returns them + """ index_cols = INDEX_FIELD_MAP[data_type] return dataset[dataset.duplicated(subset=index_cols)] def get_duplicated_columns(dataset: pd.DataFrame) -> list: + """ Gets a list of duplicated columns in a dataframe + """ return dataset.columns[dataset.columns.duplicated()].tolist() def get_common_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: + """ Gets the list of common columns between two dataframes + """ common_cols = staging_dataset.columns.intersection(main_dataset.columns).tolist() return common_cols def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: + """ Gets the list of missing columns present in main but not in staging + """ missing_cols = main_dataset.columns.difference(staging_dataset.columns).tolist() return missing_cols @@ -82,6 +93,8 @@ def get_missing_cols(staging_dataset: pd.DataFrame, 
main_dataset: pd.DataFrame) def get_additional_cols( staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: + """ Gets the list of additional columns present in staging but not in main + """ add_cols = staging_dataset.columns.difference(main_dataset.columns).tolist() return add_cols @@ -89,6 +102,15 @@ def get_additional_cols( def get_S3FileSystem_from_session( aws_session: boto3.session.Session, ) -> fs.S3FileSystem: + """Gets a pyarrow S3 filesystem object given an + authenticated aws session with credentials + + Args: + aws_session (boto3.session.Session): authenticated aws session + + Returns: + fs.S3FileSystem: S3 filesystem object initiated from AWS credentials + """ session_credentials = aws_session.get_credentials() s3_fs = fs.S3FileSystem( access_key=session_credentials.access_key, @@ -119,6 +141,16 @@ def get_parquet_dataset( def get_folders_in_s3_bucket( s3: boto3.client, bucket_name: str, namespace: str ) -> list: + """Gets the folders in the S3 bucket under the specific namespace + + Args: + s3 (boto3.client): authenticated s3 client + bucket_name (str): name of the S3 bucket to look into + namespace (str): namespace of the path to look for folders in + + Returns: + list: folder names inside S3 bucket + """ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=f"{namespace}/parquet/") if "Contents" in response.keys(): contents = response["Contents"] @@ -135,6 +167,19 @@ def get_folders_in_s3_bucket( def keep_common_rows_cols( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> dict: + """This function keeps the common columns between the two + given datasets. This function also merges on the index fields in + common between the two datasets so that the dataset can be + reduced to the same dimensions and be comparable + + Args: + data_type (str): current data type + staging_dataset (pd.DataFrame): "new" data that is to go through processing + main_dataset (pd.DataFrame): "established" dataset + + Returns: + dict of staging dataset and main datasets + """ index_cols = INDEX_FIELD_MAP[data_type] common_cols = get_common_cols(staging_dataset, main_dataset) # convert to having same columns @@ -161,6 +206,9 @@ def keep_common_rows_cols( def compare_column_data_types( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: + """ This compares the column data types of the common columns between + two datasets and creates a message if there are differences + """ compare_msg = [] common_cols = get_common_cols(staging_dataset, main_dataset) for common_col in common_cols: @@ -177,6 +225,8 @@ def compare_column_data_types( def compare_column_names( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: + """This compares the column names between two datasets and outputs a + message if there are any differences""" compare_msg = [] missing_cols = get_missing_cols(staging_dataset, main_dataset) add_cols = get_additional_cols(staging_dataset, main_dataset) @@ -194,6 +244,9 @@ def compare_column_names( def compare_column_vals( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: + """This compares the column values between the common columns of two + datasets after the datasets have been reduced to the same dimensions + and outputs a message if any columns have all of their values as different""" compare_msg = [] dataset_dict = keep_common_rows_cols(data_type, staging_dataset, main_dataset) dataset_diff = dataset_dict["staging"].compare( @@ -212,6 +265,9 @@ def 
compare_column_vals( def compare_dataset_data_types( s3: boto3.client, bucket_name: str, staging_namespace: str, main_namespace: str ) -> list: + """This looks at the current datatype folders in the S3 bucket between the + two namespaced paths and outputs a message if there are any differences + in the datatype folders""" compare_msg = [] staging_datatype_folders = get_folders_in_s3_bucket( s3, bucket_name, namespace=staging_namespace @@ -237,6 +293,8 @@ def compare_dataset_data_types( def compare_num_of_rows( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: + """This compares the number of rows between two datasets and outputs a message + if there are any row count differences""" compare_msg = [] if staging_dataset.shape[0] != main_dataset.shape[0]: compare_msg.append( @@ -249,6 +307,9 @@ def compare_num_of_rows( def compare_dataset_row_vals( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: + """This compares the row values between the two + datasets after the datasets have been reduced to the same dimensions + and outputs a message if any rows have differences""" compare_msg = [] dataset_dict = keep_common_rows_cols(data_type, staging_dataset, main_dataset) dataset_diff = dataset_dict["staging"].compare( @@ -265,6 +326,8 @@ def compare_dataset_row_vals( def get_data_types_to_compare( s3: boto3.client, bucket_name: str, staging_namespace: str, main_namespace: str ) -> list: + """This gets the common data types to run the comparison of the parquet datasets from + the two namespaced paths on based on the folders in the s3 bucket""" staging_datatype_folders = get_folders_in_s3_bucket( s3, bucket_name, namespace=staging_namespace ) @@ -275,8 +338,8 @@ def get_data_types_to_compare( def print_comparison_result(comparison_result: dict) -> None: - for msg in comparison_result: - logger.info(comparison_result[msg]) + """"This prints the comparison result dictionary into a nice format""" + logger.info(logger.warning(f"Comparison results: {json.dumps(comparison_result)}")) logger.info("Comparison results complete!") @@ -286,7 +349,7 @@ def compare_datasets_by_data_type( data_type: str, comparison_result: dict, ) -> dict: - data_type = "dataset_fitbitactivitylogs" + """This runs the bulk of the comparison functions from beginning to end by data type""" staging_dataset = get_parquet_dataset( dataset_key=f"s3://{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/", s3_filesystem=s3_filesystem, @@ -296,7 +359,7 @@ def compare_datasets_by_data_type( dataset_key=f"s3://{args.parquet_bucket}/{args.main_namespace}/parquet/{data_type}/", s3_filesystem=s3_filesystem, ) - + # check if one or both of the datasets have no data if staging_dataset.empty or main_dataset.empty: comparison_result[ data_type @@ -335,7 +398,6 @@ def main(): aws_session = boto3.session.Session(profile_name="default", region_name="us-east-1") fs = get_S3FileSystem_from_session(aws_session) - # check if one or both of the datasets have no data comparison_result["missing_data_types"] = compare_dataset_data_types( s3, args.parquet_bucket, From 393cb199d8d978fef2c6a6f3a54d619371e5501c Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Fri, 14 Apr 2023 16:23:27 -0700 Subject: [PATCH 03/18] update code to use datacompy as main comparison tool --- Pipfile | 5 +- Pipfile.lock | 656 ++++++++++++++-------- src/glue/jobs/compare_parquet_datasets.py | 191 +++++-- tests/conftest.py | 14 +- tests/test_compare_parquet_datasets.py | 67 ++- 5 files changed, 640 
insertions(+), 293 deletions(-) diff --git a/Pipfile b/Pipfile index 3416a50b..93675b86 100644 --- a/Pipfile +++ b/Pipfile @@ -13,5 +13,6 @@ pre-commit = "*" sceptre = ">=3.2.0" sceptre-sam-handler = "*" synapseclient = "~=2.7" -pandas = "*" -moto = "*" +pandas = "<1.5" +moto = "~=4.1" +datacompy = "~=0.8" diff --git a/Pipfile.lock b/Pipfile.lock index 497c97af..7e1a650a 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "ace3ba1c990fec71547a4f855cbcbc74a7550ad007e064a32d72b483b570e9ef" + "sha256": "37dad20fcae1538630a2efa8b154ad4caf220b2a1ad870cd8a887e92adf9fd53" }, "pipfile-spec": 6, "requires": { - "python_version": "3.10" + "python_version": "3.9" }, "sources": [ { @@ -27,19 +27,19 @@ }, "boto3": { "hashes": [ - "sha256:bbb426a9b3afd3ccbac25e03b215d79e90b4c47905b1b08b3b9d86fc74096974", - "sha256:c92dd0fde7839c0ca9c16a989d67ceb7f80f53de19f2b087fd1182f2af41b2ae" + "sha256:b10324d452fe677d6e1005b650ae11158cf21310e0c0062b00d184b352f4196b", + "sha256:e32977256470b4de3a25a43acdf6c8e6375e762656ace292ad2ceba446203615" ], "markers": "python_version >= '3.7'", - "version": "==1.26.68" + "version": "==1.26.114" }, "botocore": { "hashes": [ - "sha256:08fa8302a22553e69b70b1de2cc8cec61a3a878546658d091473e13d5b9d2ca4", - "sha256:8f5cb96dc0862809d29fe512087c77c15fe6328a2d8238f0a96cccb6eb77ec12" + "sha256:aeacb03303d6babb0490247158647346d27900fcf89bfb9713db41f1ce1cc93f", + "sha256:dd61f445eb53fe906dbde86405915cce61ffa13824a1bb7b826bfc7869a3e628" ], "markers": "python_version >= '3.7'", - "version": "==1.29.68" + "version": "==1.29.114" }, "certifi": { "hashes": [ @@ -49,6 +49,75 @@ "markers": "python_version >= '3.6'", "version": "==2022.12.7" }, + "cffi": { + "hashes": [ + "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5", + "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef", + "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104", + "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426", + "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405", + "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375", + "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a", + "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e", + "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc", + "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf", + "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185", + "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497", + "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3", + "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35", + "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c", + "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83", + "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21", + "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca", + "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984", + "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac", + "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd", + "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee", + "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a", + 
"sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2", + "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192", + "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7", + "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585", + "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f", + "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e", + "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27", + "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b", + "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e", + "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e", + "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d", + "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c", + "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415", + "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82", + "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02", + "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314", + "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325", + "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c", + "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3", + "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914", + "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045", + "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d", + "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9", + "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5", + "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2", + "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c", + "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3", + "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2", + "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8", + "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d", + "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d", + "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9", + "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162", + "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76", + "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4", + "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e", + "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9", + "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6", + "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b", + "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01", + "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0" + ], + "version": "==1.15.1" + }, "cfgv": { "hashes": [ "sha256:c6a0883f3917a037485059700b9e75da2464e6c27051014ad85ba6aaa5884426", @@ -66,96 +135,84 @@ }, "charset-normalizer": { "hashes": [ - "sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b", - "sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42", - 
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d", - "sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b", - "sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a", - "sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59", - "sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154", - "sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1", - "sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c", - "sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a", - "sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d", - "sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6", - "sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b", - "sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b", - "sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783", - "sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5", - "sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918", - "sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555", - "sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639", - "sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786", - "sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e", - "sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed", - "sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820", - "sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8", - "sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3", - "sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541", - "sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14", - "sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be", - "sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e", - "sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76", - "sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b", - "sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c", - "sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b", - "sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3", - "sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc", - "sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6", - "sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59", - "sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4", - "sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d", - "sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d", - "sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3", - "sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a", - "sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea", - "sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6", - "sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e", - "sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603", - "sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24", - "sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a", - 
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58", - "sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678", - "sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a", - "sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c", - "sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6", - "sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18", - "sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174", - "sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317", - "sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f", - "sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc", - "sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837", - "sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41", - "sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c", - "sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579", - "sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753", - "sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8", - "sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291", - "sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087", - "sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866", - "sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3", - "sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d", - "sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1", - "sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca", - "sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e", - "sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db", - "sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72", - "sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d", - "sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc", - "sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539", - "sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d", - "sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af", - "sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b", - "sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602", - "sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f", - "sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478", - "sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c", - "sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e", - "sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479", - "sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7", - "sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8" - ], - "version": "==3.0.1" + "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6", + "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1", + "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e", + "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373", + "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62", + 
"sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230", + "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be", + "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c", + "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0", + "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448", + "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f", + "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649", + "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d", + "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0", + "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706", + "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a", + "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59", + "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23", + "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5", + "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb", + "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e", + "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e", + "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c", + "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28", + "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d", + "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41", + "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974", + "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce", + "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f", + "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1", + "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d", + "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8", + "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017", + "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31", + "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7", + "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8", + "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e", + "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14", + "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd", + "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d", + "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795", + "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b", + "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b", + "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b", + "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203", + "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f", + "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19", + "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1", + "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a", + "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac", + "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9", + 
"sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0", + "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137", + "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f", + "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6", + "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5", + "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909", + "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f", + "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0", + "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324", + "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755", + "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb", + "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854", + "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c", + "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60", + "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84", + "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0", + "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b", + "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1", + "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531", + "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1", + "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11", + "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326", + "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df", + "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==3.1.0" }, "click": { "hashes": [ @@ -173,6 +230,39 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'", "version": "==0.4.6" }, + "cryptography": { + "hashes": [ + "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440", + "sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288", + "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b", + "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958", + "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b", + "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d", + "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a", + "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404", + "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b", + "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e", + "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2", + "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c", + "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b", + "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9", + "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b", + "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636", + "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99", + "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e", + 
"sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9" + ], + "markers": "python_version >= '3.6'", + "version": "==40.0.2" + }, + "datacompy": { + "hashes": [ + "sha256:0cd192d24e98dd0bccdaecf1a4450ff3a0bda1e17830235e3b1a8bfc9f0cbd98", + "sha256:4eccb4580e8173f3f56663092744b1e654dede97891341cf4869d4561c6804f5" + ], + "index": "pypi", + "version": "==0.8.4" + }, "deepdiff": { "hashes": [ "sha256:8d4eb2c4e6cbc80b811266419cb71dd95a157094a3947ccf937a94d44943c7b8", @@ -189,6 +279,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.2.13" }, + "deprecation": { + "hashes": [ + "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff", + "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a" + ], + "version": "==2.1.0" + }, "distlib": { "hashes": [ "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46", @@ -198,27 +295,27 @@ }, "exceptiongroup": { "hashes": [ - "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e", - "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23" + "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e", + "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785" ], "markers": "python_version < '3.11'", - "version": "==1.1.0" + "version": "==1.1.1" }, "filelock": { "hashes": [ - "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de", - "sha256:f58d535af89bb9ad5cd4df046f741f8553a418c01a7856bf0d173bbc9f6bd16d" + "sha256:3618c0da67adcc0506b015fd11ef7faf1b493f0b40d87728e19986b536890c37", + "sha256:f08a52314748335c6460fc8fe40cd5638b85001225db78c2aa01c8c0db83b318" ], "markers": "python_version >= '3.7'", - "version": "==3.9.0" + "version": "==3.11.0" }, "identify": { "hashes": [ - "sha256:7d526dd1283555aafcc91539acc061d8f6f59adb0a7bba462735b0a318bff7ed", - "sha256:93cc61a861052de9d4c541a7acb7e3dcc9c11b398a2144f6e52ae5285f5f4f06" + "sha256:f0faad595a4687053669c112004178149f6c326db71ee999ae4636685753ad2f", + "sha256:f7a93d6cf98e29bd07663c60728e7a4057615068d7a639d132dc883b2d54d31e" ], "markers": "python_version >= '3.7'", - "version": "==2.5.17" + "version": "==2.5.22" }, "idna": { "hashes": [ @@ -230,11 +327,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:7efb448ec9a5e313a57655d35aa54cd3e01b7e1fbcf72dce1bf06119420f5bad", - "sha256:e354bedeb60efa6affdcc8ae121b73544a7aa74156d047311948f6d711cd378d" + "sha256:23c2bcae4762dfb0bbe072d358faec24957901d75b6c4ab11172c0c982532402", + "sha256:8f8bd2af397cf33bd344d35cfe7f489219b7d14fc79a3f854b75b8417e9226b0" ], "markers": "python_version >= '3.7'", - "version": "==6.0.0" + "version": "==6.3.0" }, "iniconfig": { "hashes": [ @@ -331,6 +428,14 @@ "markers": "python_version >= '3.7'", "version": "==2.1.2" }, + "moto": { + "hashes": [ + "sha256:56de986179f79920f59243bc532e03a7039d24a5ee5aec2eb3b666dcd23d6262", + "sha256:fb9a7615f744da4ea7f154ff8e79782b19781344a6356ca4c0d6217c1237d379" + ], + "index": "pypi", + "version": "==4.1.7" + }, "networkx": { "hashes": [ "sha256:80b6b89c77d1dfb64a4c7854981b60aeea6360ac02c6d4e4913319e0a313abef", @@ -349,37 +454,37 @@ }, "numpy": { "hashes": [ - "sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22", - "sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f", - "sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9", - "sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96", - 
"sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0", - "sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a", - "sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281", - "sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04", - "sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468", - "sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253", - "sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756", - "sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a", - "sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb", - "sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d", - "sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0", - "sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910", - "sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978", - "sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5", - "sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f", - "sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a", - "sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5", - "sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2", - "sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d", - "sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95", - "sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5", - "sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d", - "sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780", - "sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa" + "sha256:0044f7d944ee882400890f9ae955220d29b33d809a038923d88e4e01d652acd9", + "sha256:0e3463e6ac25313462e04aea3fb8a0a30fb906d5d300f58b3bc2c23da6a15398", + "sha256:179a7ef0889ab769cc03573b6217f54c8bd8e16cef80aad369e1e8185f994cd7", + "sha256:2386da9a471cc00a1f47845e27d916d5ec5346ae9696e01a8a34760858fe9dd2", + "sha256:26089487086f2648944f17adaa1a97ca6aee57f513ba5f1c0b7ebdabbe2b9954", + "sha256:28bc9750ae1f75264ee0f10561709b1462d450a4808cd97c013046073ae64ab6", + "sha256:28e418681372520c992805bb723e29d69d6b7aa411065f48216d8329d02ba032", + "sha256:442feb5e5bada8408e8fcd43f3360b78683ff12a4444670a7d9e9824c1817d36", + "sha256:6ec0c021cd9fe732e5bab6401adea5a409214ca5592cd92a114f7067febcba0c", + "sha256:7094891dcf79ccc6bc2a1f30428fa5edb1e6fb955411ffff3401fb4ea93780a8", + "sha256:84e789a085aabef2f36c0515f45e459f02f570c4b4c4c108ac1179c34d475ed7", + "sha256:87a118968fba001b248aac90e502c0b13606721b1343cdaddbc6e552e8dfb56f", + "sha256:8e669fbdcdd1e945691079c2cae335f3e3a56554e06bbd45d7609a6cf568c700", + "sha256:ad2925567f43643f51255220424c23d204024ed428afc5aad0f86f3ffc080086", + "sha256:b0677a52f5d896e84414761531947c7a330d1adc07c3a4372262f25d84af7bf7", + "sha256:b07b40f5fb4fa034120a5796288f24c1fe0e0580bbfff99897ba6267af42def2", + "sha256:b09804ff570b907da323b3d762e74432fb07955701b17b08ff1b5ebaa8cfe6a9", + "sha256:b162ac10ca38850510caf8ea33f89edcb7b0bb0dfa5592d59909419986b72407", + "sha256:b31da69ed0c18be8b77bfce48d234e55d040793cebb25398e2a7d84199fbc7e2", + "sha256:caf65a396c0d1f9809596be2e444e3bd4190d86d5c1ce21f5fc4be60a3bc5b36", + "sha256:cfa1161c6ac8f92dea03d625c2d0c05e084668f4a06568b77a25a89111621566", + "sha256:dae46bed2cb79a58d6496ff6d8da1e3b95ba09afeca2e277628171ca99b99db1", + 
"sha256:ddc7ab52b322eb1e40521eb422c4e0a20716c271a306860979d450decbb51b8e", + "sha256:de92efa737875329b052982e37bd4371d52cabf469f83e7b8be9bb7752d67e51", + "sha256:e274f0f6c7efd0d577744f52032fdd24344f11c5ae668fe8d01aac0422611df1", + "sha256:ed5fb71d79e771ec930566fae9c02626b939e37271ec285e9efaf1b5d4370e7d", + "sha256:ef85cf1f693c88c1fd229ccd1055570cb41cdf4875873b7728b6301f12cd05bf", + "sha256:f1b739841821968798947d3afcefd386fa56da0caf97722a5de53e07c4ccedc7" ], "markers": "python_version >= '3.8'", - "version": "==1.24.2" + "version": "==1.24.1" }, "ordered-set": { "hashes": [ @@ -397,13 +502,40 @@ "markers": "python_version >= '3.6'", "version": "==21.3" }, + "pandas": { + "hashes": [ + "sha256:050aada67a5ec6699a7879e769825b510018a95fb9ac462bb1867483d0974a97", + "sha256:0959c41004e3d2d16f39c828d6da66ebee329836a7ecee49fb777ac9ad8a7501", + "sha256:4591cadd06fbbbd16fafc2de6e840c1aaefeae3d5864b688004777ef1bbdede3", + "sha256:51c424ca134fdaeac9a4acd719d1ab48046afc60943a489028f0413fdbe9ef1c", + "sha256:785e878a6e6d8ddcdb8c181e600855402750052497d7fc6d6b508894f6b8830b", + "sha256:799e6a25932df7e6b1f8dabf63de064e2205dc309abb75956126a0453fd88e97", + "sha256:7cd1d69a387f7d5e1a5a06a87574d9ef2433847c0e78113ab51c84d3a8bcaeaa", + "sha256:87b4194f344dcd14c0f885cecb22005329b38bda10f1aaf7b9596a00ec8a4768", + "sha256:8d4d2fe2863ecddb0ba1979bdda26c8bc2ea138f5a979abe3ba80c0fa4015c91", + "sha256:94f2ed1fd51e545ebf71da1e942fe1822ee01e10d3dd2a7276d01351333b7c6b", + "sha256:9d2a7a3c1fea668d56bd91edbd5f2732e0af8feb9d2bf8d9bfacb2dea5fa9536", + "sha256:9d805bce209714b1c1fa29bfb1e42ad87e4c0a825e4b390c56a3e71593b7e8d8", + "sha256:a08ceb59db499864c58a9bf85ab6219d527d91f14c0240cc25fa2c261032b2a7", + "sha256:a981cfabf51c318a562deb4ae7deec594c07aee7cf18b4594a92c23718ec8275", + "sha256:ab6c0d738617b675183e5f28db32b5148b694ad9bba0a40c3ea26d96b431db67", + "sha256:afbddad78a98ec4d2ce08b384b81730de1ccc975b99eb663e6dac43703f36d98", + "sha256:c4bb8b0ab9f94207d07e401d24baebfc63057246b1a5e0cd9ee50df85a656871", + "sha256:ce35f947202b0b99c660221d82beb91d2e6d553d55a40b30128204e3e2c63848", + "sha256:d0022fe6a313df1c4869b5edc012d734c6519a6fffa3cf70930f32e6a1078e49", + "sha256:e7cc960959be28d064faefc0cb2aef854d46b827c004ebea7e79b5497ed83e7d", + "sha256:ee6f1848148ed3204235967613b0a32be2d77f214e9623f554511047705c1e04" + ], + "index": "pypi", + "version": "==1.4.4" + }, "platformdirs": { "hashes": [ - "sha256:8a1228abb1ef82d788f74139988b137e78692984ec7b08eaa6c65f1723af28f9", - "sha256:b1d5eb14f221506f50d6604a561f4c5786d9e80355219694a1b244bcd96f4567" + "sha256:d5b638ca397f25f979350ff789db335903d7ea010ab28903f57b27e1b16c2b08", + "sha256:ebe11c0d7a805086e99506aa331612429a72ca7cd52a1f0d277dc4adc20cb10e" ], "markers": "python_version >= '3.7'", - "version": "==3.0.0" + "version": "==3.2.0" }, "pluggy": { "hashes": [ @@ -415,11 +547,11 @@ }, "pre-commit": { "hashes": [ - "sha256:9e3255edb0c9e7fe9b4f328cb3dc86069f8fdc38026f1bf521018a05eaf4d67b", - "sha256:bc4687478d55578c4ac37272fe96df66f73d9b5cf81be6f28627d4e712e752d5" + "sha256:0b4210aea813fe81144e87c5a291f09ea66f199f367fa1df41b55e1d26e1e2b4", + "sha256:5b808fcbda4afbccf6d6633a56663fed35b6c2bc08096fd3d47ce197ac351d9d" ], "index": "pypi", - "version": "==3.0.4" + "version": "==3.2.2" }, "pyarrow": { "hashes": [ @@ -452,6 +584,13 @@ "index": "pypi", "version": "==11.0.0" }, + "pycparser": { + "hashes": [ + "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9", + "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206" + ], + "version": "==2.21" + }, "pyparsing": 
{ "hashes": [ "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", @@ -495,11 +634,11 @@ }, "pytest": { "hashes": [ - "sha256:c7c6ca206e93355074ae32f7403e8ea12163b1163c976fee7d4d84027c162be5", - "sha256:d45e0952f3727241918b8fd0f376f5ff6b301cc0777c6f9a556935c92d8a7d42" + "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362", + "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3" ], "index": "pypi", - "version": "==7.2.1" + "version": "==7.3.1" }, "python-dateutil": { "hashes": [ @@ -509,6 +648,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, + "pytz": { + "hashes": [ + "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588", + "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb" + ], + "version": "==2023.3" + }, "pyyaml": { "hashes": [ "sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf", @@ -552,6 +698,14 @@ "markers": "python_version >= '3.7' and python_version < '4'", "version": "==2.28.2" }, + "responses": { + "hashes": [ + "sha256:8a3a5915713483bf353b6f4079ba8b2a29029d1d1090a503c70b0dc5d9d0c7bd", + "sha256:c4d9aa9fc888188f0c673eff79a8dadbe2e75b7fe879dc80a221a06e0a68138f" + ], + "markers": "python_version >= '3.7'", + "version": "==0.23.1" + }, "s3transfer": { "hashes": [ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd", @@ -562,18 +716,18 @@ }, "sceptre": { "hashes": [ - "sha256:03f4e40b757728de157e0bf1dd4e8c7f49582207bd728f46c6563a054bdee116", - "sha256:19ebea9cbb7abe0dd28d86f42c68e63f3cd0833a46afd87e9d618b6ff80fe35a" + "sha256:b8cd3445e58d0f423321d517de28df98a1896413c9f6d539da7b55701a103be1", + "sha256:df22e4bb2845c3a5494a90ce70387d0125ffd624fe203976c22d592158bf645c" ], "index": "pypi", - "version": "==3.3.0" + "version": "==4.0.2" }, "sceptre-cmd-resolver": { "hashes": [ - "sha256:4cc7409ee43923bc97dc60b6d3bec2e129876e1ab9ca0ba656d4f8c40b2ac87b", - "sha256:ff83298ae86a51df150de28cd17c3754c64aad2bc7813c3095cc5cfc7fad89f1" + "sha256:155c47e2f4f55c7b6eb64bfe8760174701442ecaddba1a6f5cb7715a1c95be99", + "sha256:eea8ce4cfcd9199f726b4280e7e35923c9d4ea5d75cbe4a8ee78c0d6d2996d09" ], - "version": "==1.2.1" + "version": "==2.0.0" }, "sceptre-file-resolver": { "hashes": [ @@ -584,19 +738,19 @@ }, "sceptre-sam-handler": { "hashes": [ - "sha256:556be971ecf66c35f00ac14289671927432b418f1710fb92601f35c2d12d954c", - "sha256:a4805fca3e8663d8a1d6d0c794c6de3de19b54b232274bf30f0c56fddcb97208" + "sha256:ba50af0b09807c7dbff6471bb07c4dceda46027d693a7f2db7adf915fba9205b", + "sha256:f7ed9f80fe7ed4b6b12ba215152d98661e634429958a40419250e085ae51bcad" ], "index": "pypi", - "version": "==0.3.1" + "version": "==1.0.0" }, "setuptools": { "hashes": [ - "sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c", - "sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48" + "sha256:257de92a9d50a60b8e22abfcbb771571fde0dbf3ec234463212027a4eeecbe9a", + "sha256:e728ca814a823bf7bf60162daf9db95b93d532948c4c0bea762ce62f60189078" ], "markers": "python_version >= '3.7'", - "version": "==67.2.0" + "version": "==67.6.1" }, "six": { "hashes": [ @@ -622,99 +776,133 @@ "markers": "python_version < '3.11'", "version": "==2.0.1" }, + "types-pyyaml": { + "hashes": [ + "sha256:5aed5aa66bd2d2e158f75dda22b059570ede988559f030cf294871d3b647e3e8", + "sha256:c51b1bd6d99ddf0aa2884a7a328810ebf70a4262c292195d3f4f9a0005f9eeb6" + ], + "version": "==6.0.12.9" + }, "urllib3": { "hashes": [ - 
"sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72", - "sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1" + "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305", + "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==1.26.14" + "version": "==1.26.15" }, "virtualenv": { "hashes": [ - "sha256:37a640ba82ed40b226599c522d411e4be5edb339a0c0de030c0dc7b646d61590", - "sha256:54eb59e7352b573aa04d53f80fc9736ed0ad5143af445a1e539aada6eb947dd1" + "sha256:31712f8f2a17bd06234fa97fdf19609e789dd4e3e4bf108c3da71d710651adbc", + "sha256:f50e3e60f990a0757c9b68333c9fdaa72d7188caa417f96af9e52407831a3b68" + ], + "markers": "python_version >= '3.7'", + "version": "==20.21.0" + }, + "werkzeug": { + "hashes": [ + "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe", + "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612" ], "markers": "python_version >= '3.7'", - "version": "==20.19.0" + "version": "==2.2.3" }, "wrapt": { "hashes": [ - "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3", - "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b", - "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4", - "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2", - "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656", - "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3", - "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff", - "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310", - "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a", - "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57", - "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069", - "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383", - "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe", - "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87", - "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d", - "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b", - "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907", - "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f", - "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0", - "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28", - "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1", - "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853", - "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc", - "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3", - "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3", - "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164", - "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1", - "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c", - "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1", - "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7", - "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1", - 
"sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320", - "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed", - "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1", - "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248", - "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c", - "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456", - "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77", - "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef", - "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1", - "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7", - "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86", - "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4", - "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d", - "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d", - "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8", - "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5", - "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471", - "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00", - "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68", - "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3", - "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d", - "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735", - "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d", - "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569", - "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7", - "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59", - "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5", - "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb", - "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b", - "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f", - "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462", - "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015", - "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af" + "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0", + "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420", + "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a", + "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c", + "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079", + "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923", + "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f", + "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1", + "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8", + "sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86", + "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0", + "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364", + "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e", + 
"sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c", + "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e", + "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c", + "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727", + "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff", + "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e", + "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29", + "sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7", + "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72", + "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475", + "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a", + "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317", + "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2", + "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd", + "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640", + "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98", + "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248", + "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e", + "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d", + "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec", + "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1", + "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e", + "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9", + "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92", + "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb", + "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094", + "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46", + "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29", + "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd", + "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705", + "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8", + "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975", + "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb", + "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e", + "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b", + "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418", + "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019", + "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1", + "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba", + "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6", + "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2", + "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3", + "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7", + "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752", + "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416", + "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f", + 
"sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1", + "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc", + "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145", + "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee", + "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a", + "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7", + "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b", + "sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653", + "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0", + "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90", + "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29", + "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6", + "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034", + "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09", + "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559", + "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==1.14.1" + "version": "==1.15.0" + }, + "xmltodict": { + "hashes": [ + "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56", + "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852" + ], + "markers": "python_version >= '3.4'", + "version": "==0.13.0" }, "zipp": { "hashes": [ - "sha256:23f70e964bc11a34cef175bc90ba2914e1e4545ea1e3e2f67c079671883f9cb6", - "sha256:e8b2a36ea17df80ffe9e2c4fda3f693c3dad6df1697d3cd3af232db680950b0b" + "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b", + "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556" ], "markers": "python_version >= '3.7'", - "version": "==3.13.0" + "version": "==3.15.0" } } } diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 438e2d32..d9ea03eb 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -4,6 +4,7 @@ import argparse import boto3 +import datacompy import pandas as pd import synapseclient from pyarrow import fs @@ -62,6 +63,10 @@ def read_args(): return args +def dataset_is_empty(dataset) -> bool: + return len(dataset.columns) == 0 + + def get_duplicated_index_fields(data_type: str, dataset: pd.DataFrame) -> pd.DataFrame: """Gets the rows of data that are duplicated based on the index columns by data type and returns them @@ -71,21 +76,18 @@ def get_duplicated_index_fields(data_type: str, dataset: pd.DataFrame) -> pd.Dat def get_duplicated_columns(dataset: pd.DataFrame) -> list: - """ Gets a list of duplicated columns in a dataframe - """ + """Gets a list of duplicated columns in a dataframe""" return dataset.columns[dataset.columns.duplicated()].tolist() def get_common_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: - """ Gets the list of common columns between two dataframes - """ + """Gets the list of common columns between two dataframes""" common_cols = staging_dataset.columns.intersection(main_dataset.columns).tolist() return common_cols def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: - """ Gets the list of missing columns present in main but not in staging - """ + 
"""Gets the list of missing columns present in main but not in staging""" missing_cols = main_dataset.columns.difference(staging_dataset.columns).tolist() return missing_cols @@ -93,8 +95,7 @@ def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) def get_additional_cols( staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: - """ Gets the list of additional columns present in staging but not in main - """ + """Gets the list of additional columns present in staging but not in main""" add_cols = staging_dataset.columns.difference(main_dataset.columns).tolist() return add_cols @@ -206,8 +207,8 @@ def keep_common_rows_cols( def compare_column_data_types( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: - """ This compares the column data types of the common columns between - two datasets and creates a message if there are differences + """This compares the column data types of the common columns between + two datasets and creates a message if there are differences """ compare_msg = [] common_cols = get_common_cols(staging_dataset, main_dataset) @@ -337,58 +338,142 @@ def get_data_types_to_compare( return list(set(staging_datatype_folders + main_datatype_folders)) -def print_comparison_result(comparison_result: dict) -> None: - """"This prints the comparison result dictionary into a nice format""" - logger.info(logger.warning(f"Comparison results: {json.dumps(comparison_result)}")) - logger.info("Comparison results complete!") +def compare_datasets_and_export_report( + data_type: str, + staging_dataset: pd.DataFrame, + main_dataset: pd.DataFrame, + staging_namespace: str, + main_namespace: str, +) -> str: + """This method prints out a human-readable report summarizing and + sampling differences between datasets for the given data type. + A full list of comparisons can be found in the datacompy package site. + + Returns: + str: large string block of the report + """ + compare = datacompy.Compare( + df1=staging_dataset, + df2=main_dataset, + join_columns=INDEX_FIELD_MAP[data_type], + abs_tol=0, # Optional, defaults to 0 + rel_tol=0, # Optional, defaults to 0 + df1_name=staging_namespace, # Optional, defaults to 'df1' + df2_name=main_namespace, # Optional, defaults to 'df2' + ) + compare.matches(ignore_extra_columns=False) + return compare.report() + + +def add_additional_msg_to_comparison_report( + comparison_report: str, add_msgs: list +) -> str: + """This adds additional messages to the comparison report. Currently, this adds + messages that specify the names of columns that are different between the + two datasets + + Args: + comparison_report (str): report generated using datacompy + add_msgs (list): list of additional messages to include at the bottom of the report + + Returns: + str: updated comparison report with more specific messages + """ + # does some formatting. 
+    joined_add_msgs = "\n".join(add_msgs)
+    updated_comparison_report = (
+        f"{comparison_report}"
+        f"Column Name Differences\n"
+        f"-----------------------\n\n{joined_add_msgs}"
+    )
+    return updated_comparison_report
+
+
+def is_valid_dataset(dataset: pd.DataFrame, namespace: str) -> dict:
+    """Checks whether the individual dataset is valid under the following criteria:
+        - no duplicated columns
+        - dataset is not empty (aka has columns)
+    before it can go through the comparison
+
+    Args:
+        dataset (pd.DataFrame): dataset to be validated
+        namespace (str): namespace for the dataset
+
+    Returns:
+        dict: containing boolean of the validation result and string message
+    """
+    # Check that the dataset is not empty and has no duplicated columns
+    if dataset_is_empty(dataset):
+        msg = f"{namespace} dataset has no data. Comparison cannot continue."
+        return {"result": False, "msg": msg}
+    elif get_duplicated_columns(dataset) != []:
+        msg = (
+            f"{namespace} dataset has duplicated columns. Comparison cannot continue.\n"
+            f"Duplicated columns:{str(get_duplicated_columns(dataset))}"
+        )
+        return {"result": False, "msg": msg}
+    else:
+        msg = f"{namespace} dataset has been validated."
+        return {"result": True, "msg": msg}


 def compare_datasets_by_data_type(
-    args,
-    s3_filesystem: fs.S3FileSystem,
-    data_type: str,
-    comparison_result: dict,
-) -> dict:
-    """This runs the bulk of the comparison functions from beginning to end by data type"""
+    args, s3_filesystem: fs.S3FileSystem, data_type: str
+) -> str:
+    """This runs the bulk of the comparison functions from beginning to end by data type
+
+    Args:
+        args: arguments from command line
+        s3_filesystem (fs.S3FileSystem): filesystem instantiated by aws credentials
+        data_type (str): data type to be compared for the given datasets
+
+    Returns:
+        str: final report on the datasets for the given data type
+    """
+    data_type = "dataset_fitbitactivitylogs"
+    header_msg = (
+        f"\n\nParquet Dataset Comparison running for Data Type: {data_type}"
+        f"\n-------------------------------------------------------------------------------\n\n"
+    )
     staging_dataset = get_parquet_dataset(
         dataset_key=f"s3://{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/",
         s3_filesystem=s3_filesystem,
     )
-
     main_dataset = get_parquet_dataset(
         dataset_key=f"s3://{args.parquet_bucket}/{args.main_namespace}/parquet/{data_type}/",
         s3_filesystem=s3_filesystem,
     )
-    # check if one or both of the datasets have no data
-    if staging_dataset.empty or main_dataset.empty:
-        comparison_result[
-            data_type
-        ] = f"One of {args.staging_namespace} or {args.main_namespace} has no data. Comparison cannot continue."
+    # go through specific validation for each dataset prior to comparison
+    staging_is_valid_result = is_valid_dataset(staging_dataset, args.staging_namespace)
+    main_is_valid_result = is_valid_dataset(main_dataset, args.main_namespace)
+    if (
+        staging_is_valid_result["result"] == False
+        or main_is_valid_result["result"] == False
+    ):
+        comparison_report = f"{header_msg}{staging_is_valid_result['msg']}\n{main_is_valid_result['msg']}"
+        return comparison_report
+
+    # check that they have columns in common to compare
+    elif get_common_cols(staging_dataset, main_dataset) == []:
+        comparison_report = (
+            f"{header_msg}{args.staging_namespace} dataset and {args.main_namespace} has no columns in common."
+            f"Comparison cannot continue."
+ ) + return comparison_report else: - # check that the dataset has no dup cols or dup rows and that they have cols in common - comparison_result[data_type] = [] - # check if one or both of the datasets have no data - if staging_dataset.empty or main_dataset.empty: - comparison_result["empty"][ - data_type - ] = f"One of {args.staging_namespace} or {args.main_namespace} has no data. Comparison cannot continue." - else: - comparison_result[data_type].append( - compare_column_data_types(data_type, staging_dataset, main_dataset) - ) - comparison_result[data_type].append( - compare_column_names(data_type, staging_dataset, main_dataset) - ) - comparison_result[data_type].append( - compare_column_vals(data_type, staging_dataset, main_dataset) - ) - comparison_result[data_type].append( - compare_num_of_rows(data_type, staging_dataset, main_dataset) - ) - comparison_result[data_type].append( - compare_dataset_row_vals(data_type, staging_dataset, main_dataset) - ) - return comparison_result + add_msgs = compare_column_names(data_type, staging_dataset, main_dataset) + comparison_report = compare_datasets_and_export_report( + data_type, + staging_dataset, + main_dataset, + args.staging_namespace, + args.main_namespace, + ) + comparison_report = f"{header_msg}{comparison_report}" + comparison_report = add_additional_msg_to_comparison_report( + comparison_report, add_msgs + ) + return comparison_report def main(): @@ -411,10 +496,8 @@ def main(): staging_namespace=args.staging_namespace, ) for data_type in data_types_to_compare: - comparison_result = compare_datasets_by_data_type( - args, fs, data_type, comparison_result - ) - print_comparison_result(comparison_result) + comparison_report = compare_datasets_by_data_type(args, fs, data_type) + print(comparison_report) return diff --git a/tests/conftest.py b/tests/conftest.py index faa4ac4b..f405a585 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -223,5 +223,15 @@ def staging_dataset_with_all_col_val_diff(): ) @pytest.fixture() -def empty_dataset(): - pass +def staging_dataset_with_empty_columns(): + return pd.DataFrame( + { + "LogId": [], + "StartDate": [], + "EndDate": [], + } + ) + +@pytest.fixture() +def staging_dataset_empty(): + return pd.DataFrame() diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index 408cd9cf..53c11ee6 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -5,7 +5,7 @@ from moto import mock_s3 from pandas.testing import assert_frame_equal -from glue.jobs import compare_parquet_datasets as compare_parquet +from src.glue.jobs import compare_parquet_datasets as compare_parquet def test_that_get_duplicated_index_fields_returns_empty_df_if_no_dup_exist( @@ -353,3 +353,68 @@ def test_that_compare_dataset_row_vals_returns_msg_if_diff( valid_main_dataset, ) assert compare_msg != [] + + +def test_that_is_valid_dataset_returns_true_if_dataset_is_valid(valid_staging_dataset): + is_valid_result = compare_parquet.is_valid_dataset(valid_staging_dataset, "staging") + assert ( + is_valid_result["result"] is True + and is_valid_result["msg"] == "staging dataset has been validated." + ) + + +def test_that_is_valid_dataset_returns_false_if_dataset_is_empty(staging_dataset_empty): + is_valid_result = compare_parquet.is_valid_dataset(staging_dataset_empty, "staging") + assert ( + is_valid_result["result"] is False + and is_valid_result["msg"] + == "staging dataset has no data. Comparison cannot continue." 
+ ) + + +def test_that_is_valid_dataset_returns_false_if_dataset_has_dup_cols( + staging_dataset_with_dup_cols, +): + is_valid_result = compare_parquet.is_valid_dataset( + staging_dataset_with_dup_cols, "staging" + ) + assert is_valid_result["result"] is False and is_valid_result["msg"] == ( + "staging dataset has duplicated columns. Comparison cannot continue.\n" + "Duplicated columns:['EndDate']" + ) + + +def test_that_is_valid_dataset_returns_true_if_dataset_has_empty_cols( + staging_dataset_with_empty_columns, +): + is_valid_result = compare_parquet.is_valid_dataset( + staging_dataset_with_empty_columns, "staging" + ) + assert ( + is_valid_result["result"] is True + and is_valid_result["msg"] == "staging dataset has been validated." + ) + + +def test_that_compare_datasets_and_export_report_outputs_something_if_input_is_valid( + valid_staging_dataset, valid_main_dataset +): + comparison_report = compare_parquet.compare_datasets_and_export_report( + "dataset_fitbitactivitylogs", + valid_staging_dataset, + valid_main_dataset, + "staging", + "main", + ) + assert comparison_report is not False + + +def test_that_add_additional_msg_to_comparison_report_outputs_correct_updated_msg(): + comparison_report = "some string\n\n" + add_msgs = ["one message", "two message"] + result = compare_parquet.add_additional_msg_to_comparison_report(comparison_report, add_msgs) + assert result == ( + "some string\n\nColumn Name Differences\n" + "-----------------------\n\n" + "one message\ntwo message" + ) From 4174e0c3d7c8c54515aeeb290ac4c495051fd123 Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Fri, 14 Apr 2023 17:18:32 -0700 Subject: [PATCH 04/18] remove all unused functions and tests covered by datacompy, fix method for getting folders from s3, add add. test coverage --- src/glue/jobs/compare_parquet_datasets.py | 196 +++++--------------- tests/test_compare_parquet_datasets.py | 208 +++------------------- 2 files changed, 70 insertions(+), 334 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index d9ea03eb..0e3811bb 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -10,8 +10,6 @@ from pyarrow import fs import pyarrow.parquet as pq -# from json_to_parquet import INDEX_FIELD_MAP - logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -63,27 +61,15 @@ def read_args(): return args -def dataset_is_empty(dataset) -> bool: - return len(dataset.columns) == 0 - - -def get_duplicated_index_fields(data_type: str, dataset: pd.DataFrame) -> pd.DataFrame: - """Gets the rows of data that are duplicated based on the index columns by data type - and returns them - """ - index_cols = INDEX_FIELD_MAP[data_type] - return dataset[dataset.duplicated(subset=index_cols)] - - def get_duplicated_columns(dataset: pd.DataFrame) -> list: """Gets a list of duplicated columns in a dataframe""" return dataset.columns[dataset.columns.duplicated()].tolist() -def get_common_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: +def has_common_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: """Gets the list of common columns between two dataframes""" common_cols = staging_dataset.columns.intersection(main_dataset.columns).tolist() - return common_cols + return common_cols != [] def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list: @@ -152,77 +138,20 @@ def get_folders_in_s3_bucket( Returns: list: folder names inside S3 bucket """ - 
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=f"{namespace}/parquet/") - if "Contents" in response.keys(): - contents = response["Contents"] + response = s3.list_objects_v2( + Bucket=bucket_name, Prefix=f"{namespace}/parquet/", Delimiter="/" + ) + if "CommonPrefixes" in response.keys(): + contents = response["CommonPrefixes"] folders = [ - content["Key"].split("/")[-1] + os.path.normpath(content["Prefix"]).split(os.sep)[-1] for content in contents - if content["Key"].split("/")[-1] != "owner.txt" ] else: folders = [] return folders -def keep_common_rows_cols( - data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame -) -> dict: - """This function keeps the common columns between the two - given datasets. This function also merges on the index fields in - common between the two datasets so that the dataset can be - reduced to the same dimensions and be comparable - - Args: - data_type (str): current data type - staging_dataset (pd.DataFrame): "new" data that is to go through processing - main_dataset (pd.DataFrame): "established" dataset - - Returns: - dict of staging dataset and main datasets - """ - index_cols = INDEX_FIELD_MAP[data_type] - common_cols = get_common_cols(staging_dataset, main_dataset) - # convert to having same columns - staging_dataset_subset = staging_dataset[common_cols].add_suffix("_staging") - main_dataset_subset = main_dataset[common_cols].add_suffix("_main") - - # merging on index to get rid of extra rows - merged_dataset = staging_dataset_subset.merge( - main_dataset_subset, - left_on=[f"{col}_staging" for col in index_cols], - right_on=[f"{col}_main" for col in index_cols], - how="inner", - ) - staging_dataset_common = merged_dataset[staging_dataset_subset.columns] - main_dataset_common = merged_dataset[main_dataset_subset.columns] - - staging_dataset_common.columns = staging_dataset_common.columns.str.removesuffix( - "_staging" - ) - main_dataset_common.columns = main_dataset_common.columns.str.removesuffix("_main") - return {"staging": staging_dataset_common, "main": main_dataset_common} - - -def compare_column_data_types( - data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame -) -> list: - """This compares the column data types of the common columns between - two datasets and creates a message if there are differences - """ - compare_msg = [] - common_cols = get_common_cols(staging_dataset, main_dataset) - for common_col in common_cols: - if staging_dataset[common_col].dtype != main_dataset[common_col].dtype: - compare_msg.append( - ( - f"{data_type}: Staging dataset's {common_col} has data type {staging_dataset[common_col].dtype}.\n" - f"Main dataset's {common_col} has data type {staging_dataset[common_col].dtype}." 
- ) - ) - return compare_msg - - def compare_column_names( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: @@ -242,25 +171,18 @@ def compare_column_names( return compare_msg -def compare_column_vals( - data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame +def get_data_types_to_compare( + s3: boto3.client, bucket_name: str, staging_namespace: str, main_namespace: str ) -> list: - """This compares the column values between the common columns of two - datasets after the datasets have been reduced to the same dimensions - and outputs a message if any columns have all of their values as different""" - compare_msg = [] - dataset_dict = keep_common_rows_cols(data_type, staging_dataset, main_dataset) - dataset_diff = dataset_dict["staging"].compare( - other=dataset_dict["main"], align_axis="columns", keep_shape=True + """This gets the common data types to run the comparison of the parquet datasets from + the two namespaced paths on based on the folders in the s3 bucket""" + staging_datatype_folders = get_folders_in_s3_bucket( + s3, bucket_name, namespace=staging_namespace ) - dataset_diff_cnt = dataset_diff.isna().sum() - dataset_diff_cnt = dataset_diff_cnt[dataset_diff_cnt == 0].to_dict() - if dataset_diff_cnt: - compare_msg.append( - f"{data_type}: Staging dataset has column(s) with value differences with the main dataset:\n" - f"{str(list(dataset_diff_cnt.keys()))}" - ) - return compare_msg + main_datatype_folders = get_folders_in_s3_bucket( + s3, bucket_name, namespace=main_namespace + ) + return list(set(staging_datatype_folders + main_datatype_folders)) def compare_dataset_data_types( @@ -291,54 +213,7 @@ def compare_dataset_data_types( return compare_msg -def compare_num_of_rows( - data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame -) -> list: - """This compares the number of rows between two datasets and outputs a message - if there are any row count differences""" - compare_msg = [] - if staging_dataset.shape[0] != main_dataset.shape[0]: - compare_msg.append( - f"{data_type}: Staging dataset has {staging_dataset.shape[0]} rows of data.\n" - f"Main dataset has {main_dataset.shape[0]} rows of data." - ) - return compare_msg - - -def compare_dataset_row_vals( - data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame -) -> list: - """This compares the row values between the two - datasets after the datasets have been reduced to the same dimensions - and outputs a message if any rows have differences""" - compare_msg = [] - dataset_dict = keep_common_rows_cols(data_type, staging_dataset, main_dataset) - dataset_diff = dataset_dict["staging"].compare( - other=dataset_dict["main"], align_axis="columns", keep_equal=False - ) - if not dataset_diff.empty: - compare_msg.append( - f"{data_type}: Staging dataset has value difference(s) with the main dataset." 
- f"Here is an example:\n{dataset_diff.head(1)}" - ) - return compare_msg - - -def get_data_types_to_compare( - s3: boto3.client, bucket_name: str, staging_namespace: str, main_namespace: str -) -> list: - """This gets the common data types to run the comparison of the parquet datasets from - the two namespaced paths on based on the folders in the s3 bucket""" - staging_datatype_folders = get_folders_in_s3_bucket( - s3, bucket_name, namespace=staging_namespace - ) - main_datatype_folders = get_folders_in_s3_bucket( - s3, bucket_name, namespace=main_namespace - ) - return list(set(staging_datatype_folders + main_datatype_folders)) - - -def compare_datasets_and_export_report( +def compare_datasets_and_output_report( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame, @@ -403,7 +278,7 @@ def is_valid_dataset(dataset: pd.DataFrame, namespace: str) -> dict: dict: containing boolean of the validation result and string message """ # Check that datasets have no emptiness, duplicated columns, or have columns in common - if dataset_is_empty(dataset): + if len(dataset.columns) == 0: msg = f"{namespace} dataset has no data. Comparison cannot continue." return {"result": False, "msg": msg} elif get_duplicated_columns(dataset) != []: @@ -418,22 +293,23 @@ def is_valid_dataset(dataset: pd.DataFrame, namespace: str) -> dict: def compare_datasets_by_data_type( - args, s3_filesystem: fs.S3FileSystem, data_type: str + args, s3_filesystem: fs.S3FileSystem, data_type_s3_folder_path: str, data_type: str ) -> str: """This runs the bulk of the comparison functions from beginning to end by data type Args: args: arguments from command line s3_filesystem (fs.S3FileSystem): filesystem instantiated by aws credentials + data_type_s3_folder_path (str): path to the dataset's data type folder to read in + datasets for comparison data_type (str): data type to be compared for the given datasets Returns: str: final report on the datasets for the given data type """ - data_type = "dataset_fitbitactivitylogs" header_msg = ( f"\n\nParquet Dataset Comparison running for Data Type: {data_type}" - f"\n-------------------------------------------------------------------------------\n\n" + f"\n-----------------------------------------------------------------\n\n" ) staging_dataset = get_parquet_dataset( dataset_key=f"s3://{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/", @@ -454,15 +330,14 @@ def compare_datasets_by_data_type( return comparison_report # check that they have columns in common to compare - elif get_common_cols(staging_dataset, main_dataset) == []: + elif not has_common_cols(staging_dataset, main_dataset): comparison_report = ( - f"{header_msg}{args.staging_namespace} dataset and {args.main_namespace} has no columns in common." - f"Comparison cannot continue." + f"{header_msg}{args.staging_namespace} dataset and {args.main_namespace} have no columns in common." + f" Comparison cannot continue." 
) return comparison_report else: - add_msgs = compare_column_names(data_type, staging_dataset, main_dataset) - comparison_report = compare_datasets_and_export_report( + comparison_report = compare_datasets_and_output_report( data_type, staging_dataset, main_dataset, @@ -471,7 +346,8 @@ def compare_datasets_by_data_type( ) comparison_report = f"{header_msg}{comparison_report}" comparison_report = add_additional_msg_to_comparison_report( - comparison_report, add_msgs + comparison_report, + add_msgs=compare_column_names(data_type, staging_dataset, main_dataset), ) return comparison_report @@ -496,8 +372,18 @@ def main(): staging_namespace=args.staging_namespace, ) for data_type in data_types_to_compare: - comparison_report = compare_datasets_by_data_type(args, fs, data_type) + s3_folder_path = ( + f"{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/" + ) + comparison_report = compare_datasets_by_data_type( + args, fs, s3_folder_path, data_type + ) print(comparison_report) + s3.put_object( + Bucket=args.parquet_bucket, + Key=f"{s3_folder_path}/logs/parquet_comparison_report.txt", + Body=comparison_report, + ) return diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index 53c11ee6..feedfae5 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -8,34 +8,6 @@ from src.glue.jobs import compare_parquet_datasets as compare_parquet -def test_that_get_duplicated_index_fields_returns_empty_df_if_no_dup_exist( - valid_staging_dataset, -): - assert ( - compare_parquet.get_duplicated_index_fields( - "dataset_fitbitactivitylogs", valid_staging_dataset - ).empty - == True - ) - - -def test_that_get_duplicated_index_fields_returns_dup_df_if_dup_exist( - staging_dataset_with_dup_indexes, -): - assert_frame_equal( - compare_parquet.get_duplicated_index_fields( - "dataset_fitbitactivitylogs", staging_dataset_with_dup_indexes - ).reset_index(drop=True), - pd.DataFrame( - { - "LogId": ["44984262767"], - "StartDate": ["2021-12-24T14:27:39+00:00"], - "EndDate": ["2021-12-24T14:40:27+00:00"], - } - ).reset_index(drop=True), - ) - - def test_that_get_duplicated_columns_returns_empty_if_no_dup_exist( valid_staging_dataset, ): @@ -93,69 +65,38 @@ def test_that_get_folders_in_s3_bucket_returns_list_if_folder_exists( s3, parquet_bucket_name ): s3.create_bucket(Bucket=parquet_bucket_name) - s3.put_object( - Bucket=parquet_bucket_name, Key="staging/parquet/dataset_fitbitactivitylogs" - ) - result = compare_parquet.get_folders_in_s3_bucket( - s3, bucket_name=parquet_bucket_name, namespace="staging" - ) - assert result == ["dataset_fitbitactivitylogs"] - - -def test_that_keep_common_rows_cols_returns_same_df_when_both_df_are_the_same( - valid_staging_dataset, valid_main_dataset -): - datasets = compare_parquet.keep_common_rows_cols( - "dataset_fitbitactivitylogs", valid_staging_dataset, valid_main_dataset - ) - assert_frame_equal(datasets["staging"], valid_staging_dataset) - assert_frame_equal(datasets["main"], valid_main_dataset) - -def test_that_keep_common_rows_cols_returns_correct_df_when_staging_df_has_less_rows( - staging_dataset_with_diff_num_of_rows, valid_main_dataset -): - datasets = compare_parquet.keep_common_rows_cols( + for obj in [ "dataset_fitbitactivitylogs", - staging_dataset_with_diff_num_of_rows, - valid_main_dataset, - ) - assert_frame_equal(datasets["staging"], staging_dataset_with_diff_num_of_rows) - assert_frame_equal(datasets["main"], staging_dataset_with_diff_num_of_rows) + 
"dataset_fitbitactivitylogs/test.txt", + "dataset_fitbitprofiles", + "dataset_fitbitactivitylogs/test2.txt", + "dataset_fitbitprofiles/test.txt", + ]: + s3.put_object(Bucket=parquet_bucket_name, Key=f"staging/parquet/{obj}") - -def test_that_keep_common_rows_cols_returns_correct_df_when_staging_df_has_more_col( - staging_dataset_with_add_cols, valid_main_dataset -): - datasets = compare_parquet.keep_common_rows_cols( - "dataset_fitbitactivitylogs", staging_dataset_with_add_cols, valid_main_dataset + result = compare_parquet.get_folders_in_s3_bucket( + s3, bucket_name=parquet_bucket_name, namespace="staging" ) - assert_frame_equal(datasets["staging"], valid_main_dataset) - assert_frame_equal(datasets["main"], valid_main_dataset) + assert result == ["dataset_fitbitactivitylogs", "dataset_fitbitprofiles"] -def test_that_get_common_cols_returns_empty_list_if_no_common_cols( +def test_that_has_common_cols_returns_false_if_no_common_cols( staging_dataset_with_no_common_cols, valid_main_dataset ): - test_common_cols = compare_parquet.get_common_cols( + test_common_cols = compare_parquet.has_common_cols( staging_dataset_with_no_common_cols, valid_main_dataset ) - assert test_common_cols == [] + assert test_common_cols is False -def test_that_get_common_cols_returns_list_of_cols_if_common_cols( +def test_that_has_common_cols_returns_true_if_common_cols( valid_staging_dataset, valid_main_dataset ): - test_common_cols = compare_parquet.get_common_cols( + test_common_cols = compare_parquet.has_common_cols( valid_staging_dataset, valid_main_dataset ) - assert test_common_cols == [ - "LogId", - "StartDate", - "EndDate", - "ActiveDuration", - "Calories", - ] + assert test_common_cols is True def test_that_get_missing_cols_returns_empty_list_if_no_missing_cols( @@ -194,34 +135,6 @@ def test_that_get_additional_cols_returns_list_of_cols_if_add_cols( assert test_add_cols == ["AverageHeartRate"] -def test_that_compare_column_data_types_returns_empty_msg_if_no_common_cols( - staging_dataset_with_no_common_cols, valid_main_dataset -): - compare_msg = compare_parquet.compare_column_data_types( - "dataset_fitbitactivitylogs", - staging_dataset_with_no_common_cols, - valid_main_dataset, - ) - assert compare_msg == [] - - -def test_that_compare_column_data_types_returns_msg_if_diff_data_types( - staging_dataset_with_diff_data_type_cols, valid_main_dataset -): - compare_msg = compare_parquet.compare_column_data_types( - "dataset_fitbitactivitylogs", - staging_dataset_with_diff_data_type_cols, - valid_main_dataset, - ) - - assert compare_msg == [ - "dataset_fitbitactivitylogs: Staging dataset's ActiveDuration has data type int64.\n" - "Main dataset's ActiveDuration has data type object.", - "dataset_fitbitactivitylogs: Staging dataset's Calories has data type float64.\n" - "Main dataset's Calories has data type object.", - ] - - def test_that_compare_column_names_returns_empty_msg_if_cols_are_same( valid_staging_dataset, valid_main_dataset ): @@ -248,29 +161,6 @@ def test_that_compare_column_names_returns_msg_if_cols_are_diff( ] -def test_that_compare_column_vals_returns_empty_msg_if_no_col_val_diff( - valid_staging_dataset, valid_main_dataset -): - compare_msg = compare_parquet.compare_column_vals( - "dataset_fitbitactivitylogs", valid_staging_dataset, valid_main_dataset - ) - assert compare_msg == [] - - -def test_that_compare_column_vals_returns_msg_if_all_col_val_are_diff( - staging_dataset_with_all_col_val_diff, valid_main_dataset -): - compare_msg = compare_parquet.compare_column_vals( - 
"dataset_fitbitactivitylogs", - staging_dataset_with_all_col_val_diff, - valid_main_dataset, - ) - assert compare_msg == [ - "dataset_fitbitactivitylogs: Staging dataset has column(s) with value differences with the main dataset:\n" - "[('EndDate', 'self'), ('EndDate', 'other')]" - ] - - @mock_s3 def test_that_compare_dataset_data_types_returns_empty_msg_if_datatypes_are_equal( s3, parquet_bucket_name @@ -292,10 +182,16 @@ def test_that_compare_dataset_data_types_returns_msg_if_datatypes_are_not_equal( s3, parquet_bucket_name ): s3.create_bucket(Bucket=parquet_bucket_name) - for datatype in ["dataset_fitbitactivitylogs", "dataset_fitbitintradaycombined"]: + for datatype in [ + "dataset_fitbitactivitylogs/test.txt", + "dataset_fitbitintradaycombined/test.txt", + ]: s3.put_object(Bucket=parquet_bucket_name, Key=f"staging/parquet/{datatype}") - for datatype in ["dataset_fitbitactivitylogs", "dataset_fitbitdevices"]: + for datatype in [ + "dataset_fitbitactivitylogs/test.txt", + "dataset_fitbitdevices/test.txt", + ]: s3.put_object(Bucket=parquet_bucket_name, Key=f"main/parquet/{datatype}") compare_msg = compare_parquet.compare_dataset_data_types( @@ -307,54 +203,6 @@ def test_that_compare_dataset_data_types_returns_msg_if_datatypes_are_not_equal( ] -def test_that_compare_num_of_rows_returns_empty_msg_if_num_of_rows_are_equal( - valid_staging_dataset, valid_main_dataset -): - compare_msg = compare_parquet.compare_num_of_rows( - "dataset_fitbitactivitylogs", - valid_staging_dataset, - valid_main_dataset, - ) - assert compare_msg == [] - - -def test_that_compare_num_of_rows_returns_msg_if_num_of_rows_are_diff( - staging_dataset_with_diff_num_of_rows, valid_main_dataset -): - compare_msg = compare_parquet.compare_num_of_rows( - "dataset_fitbitactivitylogs", - staging_dataset_with_diff_num_of_rows, - valid_main_dataset, - ) - - assert compare_msg == [ - "dataset_fitbitactivitylogs: Staging dataset has 1 rows of data.\n" - "Main dataset has 3 rows of data." 
- ] - - -def test_that_compare_dataset_row_vals_returns_empty_msg_if_no_diff( - valid_staging_dataset, valid_main_dataset -): - compare_msg = compare_parquet.compare_dataset_row_vals( - "dataset_fitbitactivitylogs", - valid_staging_dataset, - valid_main_dataset, - ) - assert compare_msg == [] - - -def test_that_compare_dataset_row_vals_returns_msg_if_diff( - staging_dataset_with_all_col_val_diff, valid_main_dataset -): - compare_msg = compare_parquet.compare_dataset_row_vals( - "dataset_fitbitactivitylogs", - staging_dataset_with_all_col_val_diff, - valid_main_dataset, - ) - assert compare_msg != [] - - def test_that_is_valid_dataset_returns_true_if_dataset_is_valid(valid_staging_dataset): is_valid_result = compare_parquet.is_valid_dataset(valid_staging_dataset, "staging") assert ( @@ -396,10 +244,10 @@ def test_that_is_valid_dataset_returns_true_if_dataset_has_empty_cols( ) -def test_that_compare_datasets_and_export_report_outputs_something_if_input_is_valid( +def test_that_compare_datasets_and_output_report_outputs_something_if_input_is_valid( valid_staging_dataset, valid_main_dataset ): - comparison_report = compare_parquet.compare_datasets_and_export_report( + comparison_report = compare_parquet.compare_datasets_and_output_report( "dataset_fitbitactivitylogs", valid_staging_dataset, valid_main_dataset, @@ -412,7 +260,9 @@ def test_that_compare_datasets_and_export_report_outputs_something_if_input_is_v def test_that_add_additional_msg_to_comparison_report_outputs_correct_updated_msg(): comparison_report = "some string\n\n" add_msgs = ["one message", "two message"] - result = compare_parquet.add_additional_msg_to_comparison_report(comparison_report, add_msgs) + result = compare_parquet.add_additional_msg_to_comparison_report( + comparison_report, add_msgs + ) assert result == ( "some string\n\nColumn Name Differences\n" "-----------------------\n\n" From da6dafcb8872ad003aa9ee1fda759ac5449a1a6a Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Mon, 17 Apr 2023 11:42:04 -0700 Subject: [PATCH 05/18] add syntax updates, clean up parquet dataset test, add comparison report for edge scenarios like no data types in common --- src/glue/jobs/compare_parquet_datasets.py | 81 +++++++++----- tests/conftest.py | 36 +++++-- tests/test_compare_parquet_datasets.py | 126 ++++++++++++++++------ 3 files changed, 178 insertions(+), 65 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 0e3811bb..090f8d02 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -1,14 +1,14 @@ -import os +import argparse import json +import os import logging -import argparse import boto3 import datacompy import pandas as pd -import synapseclient from pyarrow import fs import pyarrow.parquet as pq +import synapseclient logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -138,6 +138,7 @@ def get_folders_in_s3_bucket( Returns: list: folder names inside S3 bucket """ + response = s3.list_objects_v2( Bucket=bucket_name, Prefix=f"{namespace}/parquet/", Delimiter="/" ) @@ -182,7 +183,7 @@ def get_data_types_to_compare( main_datatype_folders = get_folders_in_s3_bucket( s3, bucket_name, namespace=main_namespace ) - return list(set(staging_datatype_folders + main_datatype_folders)) + return list(set(staging_datatype_folders) & set(main_datatype_folders)) def compare_dataset_data_types( @@ -281,7 +282,7 @@ def is_valid_dataset(dataset: pd.DataFrame, namespace: str) -> dict: if len(dataset.columns) == 0: msg = 
f"{namespace} dataset has no data. Comparison cannot continue." return {"result": False, "msg": msg} - elif get_duplicated_columns(dataset) != []: + elif get_duplicated_columns(dataset): msg = ( f"{namespace} dataset has duplicated columns. Comparison cannot continue.\n" f"Duplicated columns:{str(get_duplicated_columns(dataset))}" @@ -293,12 +294,19 @@ def is_valid_dataset(dataset: pd.DataFrame, namespace: str) -> dict: def compare_datasets_by_data_type( - args, s3_filesystem: fs.S3FileSystem, data_type_s3_folder_path: str, data_type: str + parquet_bucket: str, + staging_namespace: str, + main_namespace: str, + s3_filesystem: fs.S3FileSystem, + data_type_s3_folder_path: str, + data_type: str, ) -> str: """This runs the bulk of the comparison functions from beginning to end by data type Args: - args: arguments from command line + parquet_bucket (str): name of the bucket containing the parquet datasets + staging_namespace (str): name of namespace containing the "new" data + main_namespace (str): name of namespace containing the "established" data s3_filesystem (fs.S3FileSystem): filesystem instantiated by aws credentials data_type_s3_folder_path (str): path to the dataset's data type folder to read in datasets for comparison @@ -312,16 +320,16 @@ def compare_datasets_by_data_type( f"\n-----------------------------------------------------------------\n\n" ) staging_dataset = get_parquet_dataset( - dataset_key=f"s3://{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/", + dataset_key=f"s3://{parquet_bucket}/{staging_namespace}/parquet/{data_type}/", s3_filesystem=s3_filesystem, ) main_dataset = get_parquet_dataset( - dataset_key=f"s3://{args.parquet_bucket}/{args.main_namespace}/parquet/{data_type}/", + dataset_key=f"s3://{parquet_bucket}/{main_namespace}/parquet/{data_type}/", s3_filesystem=s3_filesystem, ) # go through specific validation for each dataset prior to comparison - staging_is_valid_result = is_valid_dataset(staging_dataset, args.staging_namespace) - main_is_valid_result = is_valid_dataset(main_dataset, args.main_namespace) + staging_is_valid_result = is_valid_dataset(staging_dataset, staging_namespace) + main_is_valid_result = is_valid_dataset(main_dataset, main_namespace) if ( staging_is_valid_result["result"] == False or main_is_valid_result["result"] == False @@ -332,7 +340,7 @@ def compare_datasets_by_data_type( # check that they have columns in common to compare elif not has_common_cols(staging_dataset, main_dataset): comparison_report = ( - f"{header_msg}{args.staging_namespace} dataset and {args.main_namespace} have no columns in common." + f"{header_msg}{staging_namespace} dataset and {main_namespace} have no columns in common." f" Comparison cannot continue." 
) return comparison_report @@ -341,8 +349,8 @@ def compare_datasets_by_data_type( data_type, staging_dataset, main_dataset, - args.staging_namespace, - args.main_namespace, + staging_namespace, + main_namespace, ) comparison_report = f"{header_msg}{comparison_report}" comparison_report = add_additional_msg_to_comparison_report( @@ -359,29 +367,54 @@ def main(): aws_session = boto3.session.Session(profile_name="default", region_name="us-east-1") fs = get_S3FileSystem_from_session(aws_session) - comparison_result["missing_data_types"] = compare_dataset_data_types( + data_types_to_compare = get_data_types_to_compare( s3, args.parquet_bucket, main_namespace=args.main_namespace, staging_namespace=args.staging_namespace, ) - data_types_to_compare = get_data_types_to_compare( + data_types_diff = compare_dataset_data_types( s3, args.parquet_bucket, main_namespace=args.main_namespace, staging_namespace=args.staging_namespace, ) - for data_type in data_types_to_compare: - s3_folder_path = ( - f"{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/" - ) - comparison_report = compare_datasets_by_data_type( - args, fs, s3_folder_path, data_type + data_types_diff_msg = "\n".join(data_types_diff) + if data_types_to_compare: + for data_type in data_types_to_compare: + s3_folder_path = ( + f"{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/" + ) + comparison_report = compare_datasets_by_data_type( + args.parquet_bucket, + args.staging_namespace, + args.main_namespace, + fs, + s3_folder_path, + data_type, + ) + # update comparison report with the data_type differences message + comparison_report = ( + f"{comparison_report}\n\n" + f"Data Type Differences between {args.staging_namespace} and {args.main_namespace}:\n" + f"{data_types_diff_msg}" + ) + print(comparison_report) + # save comparison report to report folder in staging namespace + s3.put_object( + Bucket=args.parquet_bucket, + Key=f"{args.parquet_bucket}/{args.staging_namespace}/comparison_result/{data_type}_parquet_compare.txt", + Body=comparison_report, + ) + else: + # update comparison report with the data_type differences message + comparison_report = ( + f"There are no data types in common between {args.staging_namespace} and {args.main_namespace} to compare.\n" + f"{data_types_diff_msg}" ) - print(comparison_report) s3.put_object( Bucket=args.parquet_bucket, - Key=f"{s3_folder_path}/logs/parquet_comparison_report.txt", + Key=f"{args.parquet_bucket}/{args.staging_namespace}/comparison_result/data_types_compare.txt", Body=comparison_report, ) return diff --git a/tests/conftest.py b/tests/conftest.py index f405a585..4dfbb0c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ import boto3 import pytest import pandas as pd -from pyarrow import fs +from pyarrow import fs, parquet from moto import mock_s3 @@ -24,22 +24,40 @@ def s3(mock_aws_credentials): yield boto3.client("s3", region_name="us-east-1") +@pytest.fixture(scope="function") +def mock_aws_session(mock_aws_credentials): + with mock_s3(): + yield boto3.session.Session(region_name="us-east-1") + + @pytest.fixture() def parquet_bucket_name(): yield f"test-parquet-bucket" -@pytest.fixture -def s3_test_bucket(s3, parquet_bucket_name): +@pytest.fixture(scope="function") +def mock_s3_filesystem(mock_aws_session): with mock_s3(): - s3.create_bucket(Bucket=parquet_bucket_name) - yield + session_credentials = mock_aws_session.get_credentials() + yield fs.S3FileSystem( + region="us-east-1", + access_key=session_credentials.access_key, + 
secret_key=session_credentials.secret_key, + session_token=session_credentials.token) @pytest.fixture(scope="function") -def mock_s3_filesystem(mock_aws_credentials): - with mock_s3(): - yield fs.S3FileSystem(region="us-east-1") +def valid_staging_parquet_object(tmpdir_factory, valid_staging_dataset): + filename = str(tmpdir_factory.mktemp('data_folder').join('df.parquet')) + valid_staging_dataset.to_parquet(path=filename, engine="pyarrow") + data = parquet.read_table(filename) + yield data + + +@pytest.fixture() +def dataset_fixture(request): + """This allows us to use different fixtures for the same test""" + return request.getfixturevalue(request.param) @pytest.fixture() @@ -222,6 +240,7 @@ def staging_dataset_with_all_col_val_diff(): } ) + @pytest.fixture() def staging_dataset_with_empty_columns(): return pd.DataFrame( @@ -232,6 +251,7 @@ def staging_dataset_with_empty_columns(): } ) + @pytest.fixture() def staging_dataset_empty(): return pd.DataFrame() diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index feedfae5..f3d1f54e 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -1,9 +1,10 @@ from unittest import mock -import pandas as pd -from pyarrow import fs from moto import mock_s3 +import pandas as pd from pandas.testing import assert_frame_equal +import pytest +from pyarrow import fs, parquet from src.glue.jobs import compare_parquet_datasets as compare_parquet @@ -23,30 +24,43 @@ def test_that_get_duplicated_columns_returns_list_if_dup_exist( @mock_s3 -def test_that_get_parquet_dataset_returns_empty_if_no_datasets_exist( - s3, mock_s3_filesystem, valid_staging_dataset, parquet_bucket_name +def test_that_get_S3FileSystem_from_session_returns_filesystem_when_credentials_exist( + mock_aws_session, ): - data = valid_staging_dataset.to_parquet() - s3.create_bucket(Bucket=parquet_bucket_name) - s3.put_object( - Bucket=parquet_bucket_name, - Key="staging/parquet/dataset_fitbitactivitylogs/test.parquet", - Body=data, + filesystem = compare_parquet.get_S3FileSystem_from_session( + aws_session=mock_aws_session ) + assert isinstance(filesystem, fs.S3FileSystem) + +@mock_s3 +def test_that_get_parquet_dataset_raises_attr_error_if_no_datasets_exist( + s3, mock_s3_filesystem, parquet_bucket_name +): file_key = "staging/parquet/dataset_fitbitactivitylogs/test.parquet" - parquet_dataset = compare_parquet.get_parquet_dataset( - dataset_key=f"{parquet_bucket_name}/{file_key}", - s3_filesystem=mock_s3_filesystem, - ) - assert parquet_dataset == None + with mock.patch.object(parquet, "read_table", return_value=None) as mock_method: + with pytest.raises(AttributeError): + parquet_dataset = compare_parquet.get_parquet_dataset( + dataset_key=f"{parquet_bucket_name}/{file_key}", + s3_filesystem=mock_s3_filesystem, + ) @mock_s3 def test_that_get_parquet_dataset_returns_dataset_if_datasets_exist( - s3, mock_s3_filesystem, valid_staging_dataset, parquet_bucket_name + s3, mock_s3_filesystem, valid_staging_parquet_object, valid_staging_dataset, parquet_bucket_name ): - pass + file_key = "staging/parquet/dataset_fitbitactivitylogs/test.parquet" + with mock.patch.object(parquet, "read_table", return_value=valid_staging_parquet_object) as mock_method: + parquet_dataset = compare_parquet.get_parquet_dataset( + dataset_key=f"{parquet_bucket_name}/{file_key}", + s3_filesystem=mock_s3_filesystem, + ) + + assert_frame_equal( + parquet_dataset.reset_index(drop=True), + valid_staging_dataset.reset_index(drop=True), + ) @mock_s3 
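# [Editor's sketch - not part of the patch] How the dataset_fixture / indirect=True
# pattern added in conftest.py lets one test body run against several fixtures.
# The fixtures here are hypothetical stand-ins.
import pandas as pd
import pytest


@pytest.fixture()
def small_frame():
    return pd.DataFrame({"LogId": ["1"]})


@pytest.fixture()
def empty_frame():
    return pd.DataFrame()


@pytest.fixture()
def frame_fixture(request):
    return request.getfixturevalue(request.param)


@pytest.mark.parametrize(
    "frame_fixture", ["small_frame", "empty_frame"], indirect=True
)
def test_each_fixture_is_a_dataframe(frame_fixture):
    assert isinstance(frame_fixture, pd.DataFrame)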
@@ -161,6 +175,49 @@ def test_that_compare_column_names_returns_msg_if_cols_are_diff( ] +@mock_s3 +def test_that_get_data_types_to_compare_returns_correct_datatypes_in_common( + s3, parquet_bucket_name +): + s3.create_bucket(Bucket=parquet_bucket_name) + for namespace in ["staging", "main"]: + s3.put_object( + Bucket=parquet_bucket_name, + Key=f"{namespace}/parquet/dataset_fitbitactivitylogs/test.txt", + ) + s3.put_object( + Bucket=parquet_bucket_name, + Key=f"{namespace}/parquet/dataset_fitbitdevices/test.txt", + ) + + data_types = compare_parquet.get_data_types_to_compare( + s3, parquet_bucket_name, staging_namespace="staging", main_namespace="main" + ) + assert set(data_types) == set( + ["dataset_fitbitdevices", "dataset_fitbitactivitylogs"] + ) + + +@mock_s3 +def test_that_get_data_types_to_compare_returns_empty_list_if_no_data_types_in_common( + s3, parquet_bucket_name +): + s3.create_bucket(Bucket=parquet_bucket_name) + s3.put_object( + Bucket=parquet_bucket_name, + Key=f"staging/parquet/dataset_fitbitactivitylogs/test.txt", + ) + s3.put_object( + Bucket=parquet_bucket_name, + Key=f"main/parquet/dataset_fitbitdevices/test.txt", + ) + + data_types = compare_parquet.get_data_types_to_compare( + s3, parquet_bucket_name, staging_namespace="staging", main_namespace="main" + ) + assert data_types == [] + + @mock_s3 def test_that_compare_dataset_data_types_returns_empty_msg_if_datatypes_are_equal( s3, parquet_bucket_name @@ -169,7 +226,7 @@ def test_that_compare_dataset_data_types_returns_empty_msg_if_datatypes_are_equa for namespace in ["staging", "main"]: s3.put_object( Bucket=parquet_bucket_name, - Key=f"{namespace}/parquet/dataset_fitbitactivitylogs", + Key=f"{namespace}/parquet/dataset_fitbitactivitylogs/test.txt", ) compare_msg = compare_parquet.compare_dataset_data_types( s3, parquet_bucket_name, staging_namespace="staging", main_namespace="main" @@ -205,17 +262,15 @@ def test_that_compare_dataset_data_types_returns_msg_if_datatypes_are_not_equal( def test_that_is_valid_dataset_returns_true_if_dataset_is_valid(valid_staging_dataset): is_valid_result = compare_parquet.is_valid_dataset(valid_staging_dataset, "staging") - assert ( - is_valid_result["result"] is True - and is_valid_result["msg"] == "staging dataset has been validated." - ) + assert is_valid_result["result"] + assert is_valid_result["msg"] == "staging dataset has been validated." def test_that_is_valid_dataset_returns_false_if_dataset_is_empty(staging_dataset_empty): is_valid_result = compare_parquet.is_valid_dataset(staging_dataset_empty, "staging") + assert is_valid_result["result"] is False assert ( - is_valid_result["result"] is False - and is_valid_result["msg"] + is_valid_result["msg"] == "staging dataset has no data. Comparison cannot continue." ) @@ -226,7 +281,8 @@ def test_that_is_valid_dataset_returns_false_if_dataset_has_dup_cols( is_valid_result = compare_parquet.is_valid_dataset( staging_dataset_with_dup_cols, "staging" ) - assert is_valid_result["result"] is False and is_valid_result["msg"] == ( + assert is_valid_result["result"] is False + assert is_valid_result["msg"] == ( "staging dataset has duplicated columns. Comparison cannot continue.\n" "Duplicated columns:['EndDate']" ) @@ -238,23 +294,27 @@ def test_that_is_valid_dataset_returns_true_if_dataset_has_empty_cols( is_valid_result = compare_parquet.is_valid_dataset( staging_dataset_with_empty_columns, "staging" ) - assert ( - is_valid_result["result"] is True - and is_valid_result["msg"] == "staging dataset has been validated." 
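# [Editor's sketch - not part of the patch] The moto-backed tests above rely on
# list_objects_v2 grouping keys by Delimiter; a minimal standalone version with
# placeholder bucket and key names:
import boto3
from moto import mock_s3


@mock_s3
def test_common_prefixes_act_as_folders():
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="example-parquet-bucket")
    s3.put_object(
        Bucket="example-parquet-bucket",
        Key="staging/parquet/dataset_fitbitactivitylogs/part0.parquet",
    )
    response = s3.list_objects_v2(
        Bucket="example-parquet-bucket", Prefix="staging/parquet/", Delimiter="/"
    )
    prefixes = [p["Prefix"] for p in response.get("CommonPrefixes", [])]
    assert prefixes == ["staging/parquet/dataset_fitbitactivitylogs/"]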
- ) + assert is_valid_result["result"] + assert is_valid_result["msg"] == "staging dataset has been validated." -def test_that_compare_datasets_and_output_report_outputs_something_if_input_is_valid( - valid_staging_dataset, valid_main_dataset +@pytest.mark.parametrize( + "dataset_fixture", + ["staging_dataset_with_empty_columns", "valid_staging_dataset"], + indirect=True, +) +def test_that_compare_datasets_and_output_report_outputs_nonempty_str_if_input_is_valid( + dataset_fixture, valid_main_dataset ): comparison_report = compare_parquet.compare_datasets_and_output_report( "dataset_fitbitactivitylogs", - valid_staging_dataset, + dataset_fixture, valid_main_dataset, "staging", "main", ) - assert comparison_report is not False + assert isinstance(comparison_report, str) + assert comparison_report def test_that_add_additional_msg_to_comparison_report_outputs_correct_updated_msg(): From c00090b7fc370a9b3e2d2f140f97913dfe96ff6a Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Mon, 17 Apr 2023 14:46:40 -0700 Subject: [PATCH 06/18] add tests for compare_datasets_by_data_type, add a small validation func for input args, update syntax and make func params more robust, clean up string formatting, move s3 file path def to function --- src/glue/jobs/compare_parquet_datasets.py | 132 +++++++++++++++------- tests/conftest.py | 2 +- tests/test_compare_parquet_datasets.py | 115 ++++++++++++++++++- 3 files changed, 203 insertions(+), 46 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 090f8d02..5be43ade 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -45,22 +45,56 @@ def read_args(): parser.add_argument( "--staging-namespace", required=True, + type = validate_args, help="The name of the staging namespace to use", + default="staging" ) parser.add_argument( "--main-namespace", required=True, + type = validate_args, help=("The name of the main namespace to use"), + default="main" ) parser.add_argument( "--parquet-bucket", required=True, + type = validate_args, help=("The name of the S3 bucket containing the S3 files to compare"), + default="recover-dev-processed_data" ) args = parser.parse_args() return args +def validate_args(value : str) -> str: + """Checks to make sure none of the input command line arguments are empty strings + + Args: + value (str): the value of the command line argument parsed by argparse + + Raises: + argparse.ArgumentTypeError: when value is an empty string + + Returns: + str: the value as is + """ + if value == "": + raise argparse.ArgumentTypeError('Argument value cannot be an empty string') + return value + + +def get_s3_file_key_for_comparison_results( + parquet_bucket: str, staging_namespace: str, data_type: bool = None +) -> str: + """Gets the s3 file key for saving the comparison results to""" + s3_folder_prefix = f"{parquet_bucket}/{staging_namespace}/comparison_result" + if data_type: + return f"{s3_folder_prefix}/{data_type}_parquet_compare.txt" + else: + return f"{s3_folder_prefix}/data_types_compare.txt" + + def get_duplicated_columns(dataset: pd.DataFrame) -> list: """Gets a list of duplicated columns in a dataframe""" return dataset.columns[dataset.columns.duplicated()].tolist() @@ -242,7 +276,7 @@ def compare_datasets_and_output_report( def add_additional_msg_to_comparison_report( - comparison_report: str, add_msgs: list + comparison_report: str, add_msgs: list, msg_type: str ) -> str: """This adds additional messages to the comparison report. 
Currently, this adds messages that specify the names of columns that are different between the @@ -251,17 +285,30 @@ def add_additional_msg_to_comparison_report( Args: comparison_report (str): report generated using datacompy add_msgs (list): list of additional messages to include at the bottom of the report + msg_type (str): category of message, current available ones are + ["column_name_diff", "data_type_diff"] Returns: str: updated comparison report with more specific messages """ # does some formatting. joined_add_msgs = "\n".join(add_msgs) - updated_comparison_report = ( - f"{comparison_report}" - f"Column Name Differences\n" - f"-----------------------\n\n{joined_add_msgs}" - ) + if msg_type == "column_name_diff": + updated_comparison_report = ( + f"{comparison_report}" + f"Column Name Differences\n" + f"-----------------------\n\n{joined_add_msgs}" + ) + elif msg_type == "data_type_diff": + updated_comparison_report = ( + f"{comparison_report}" + f"Data Type Differences between the namespaces\n" + f"--------------------------------------------\n\n{joined_add_msgs}" + ) + else: + raise ValueError( + "msg_type param must be one of 'column_name_diff', 'data_type_diff'" + ) return updated_comparison_report @@ -298,7 +345,6 @@ def compare_datasets_by_data_type( staging_namespace: str, main_namespace: str, s3_filesystem: fs.S3FileSystem, - data_type_s3_folder_path: str, data_type: str, ) -> str: """This runs the bulk of the comparison functions from beginning to end by data type @@ -308,8 +354,6 @@ def compare_datasets_by_data_type( staging_namespace (str): name of namespace containing the "new" data main_namespace (str): name of namespace containing the "established" data s3_filesystem (fs.S3FileSystem): filesystem instantiated by aws credentials - data_type_s3_folder_path (str): path to the dataset's data type folder to read in - datasets for comparison data_type (str): data type to be compared for the given datasets Returns: @@ -334,35 +378,34 @@ def compare_datasets_by_data_type( staging_is_valid_result["result"] == False or main_is_valid_result["result"] == False ): - comparison_report = f"{header_msg}{staging_is_valid_result['msg']}\n{main_is_valid_result['msg']}" - return comparison_report - + comparison_report = ( + f"{staging_is_valid_result['msg']}\n{main_is_valid_result['msg']}" + ) # check that they have columns in common to compare elif not has_common_cols(staging_dataset, main_dataset): comparison_report = ( - f"{header_msg}{staging_namespace} dataset and {main_namespace} have no columns in common." + f"{staging_namespace} dataset and {main_namespace} have no columns in common." f" Comparison cannot continue." 
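# [Editor's sketch - not part of the patch] What the "column_name_diff" branch
# above appends to a report; the import path matches the one used by the tests.
from src.glue.jobs import compare_parquet_datasets as compare_parquet

report = compare_parquet.add_additional_msg_to_comparison_report(
    comparison_report="datacompy summary...\n\n",
    add_msgs=["dataset_fitbitactivitylogs: staging dataset has additional columns"],
    msg_type="column_name_diff",
)
# report now ends with:
# Column Name Differences
# -----------------------
#
# dataset_fitbitactivitylogs: staging dataset has additional columns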
) - return comparison_report else: comparison_report = compare_datasets_and_output_report( - data_type, - staging_dataset, - main_dataset, - staging_namespace, - main_namespace, + data_type=data_type, + staging_dataset=staging_dataset, + main_dataset=main_dataset, + staging_namespace=staging_namespace, + main_namespace=main_namespace, ) - comparison_report = f"{header_msg}{comparison_report}" + comparison_report = add_additional_msg_to_comparison_report( comparison_report, add_msgs=compare_column_names(data_type, staging_dataset, main_dataset), + msg_type="column_name_diff", ) - return comparison_report + return f"{header_msg}{comparison_report}" def main(): args = read_args() - comparison_result = {} s3 = boto3.client("s3") aws_session = boto3.session.Session(profile_name="default", region_name="us-east-1") fs = get_S3FileSystem_from_session(aws_session) @@ -379,42 +422,49 @@ def main(): main_namespace=args.main_namespace, staging_namespace=args.staging_namespace, ) - data_types_diff_msg = "\n".join(data_types_diff) if data_types_to_compare: for data_type in data_types_to_compare: - s3_folder_path = ( - f"{args.parquet_bucket}/{args.staging_namespace}/parquet/{data_type}/" - ) comparison_report = compare_datasets_by_data_type( - args.parquet_bucket, - args.staging_namespace, - args.main_namespace, - fs, - s3_folder_path, - data_type, + parquet_bucket=args.parquet_bucket, + staging_namespace=args.staging_namespace, + main_namespace=args.main_namespace, + s3_filesystem=fs, + data_type=data_type, ) + # update comparison report with the data_type differences message - comparison_report = ( - f"{comparison_report}\n\n" - f"Data Type Differences between {args.staging_namespace} and {args.main_namespace}:\n" - f"{data_types_diff_msg}" + comparison_report = add_additional_msg_to_comparison_report( + comparison_report, + add_msgs=data_types_diff, + msg_type="data_type_diff", ) print(comparison_report) + import pdb; pdb.set_trace() # save comparison report to report folder in staging namespace s3.put_object( Bucket=args.parquet_bucket, - Key=f"{args.parquet_bucket}/{args.staging_namespace}/comparison_result/{data_type}_parquet_compare.txt", + Key=get_s3_file_key_for_comparison_results( + parquet_bucket=args.parquet_bucket, + staging_namespace=args.staging_namespace, + data_type=data_type, + ), Body=comparison_report, ) else: # update comparison report with the data_type differences message - comparison_report = ( - f"There are no data types in common between {args.staging_namespace} and {args.main_namespace} to compare.\n" - f"{data_types_diff_msg}" + comparison_report = add_additional_msg_to_comparison_report( + comparison_report, + add_msgs=data_types_diff, + msg_type="data_type_diff", ) + print(comparison_report) s3.put_object( Bucket=args.parquet_bucket, - Key=f"{args.parquet_bucket}/{args.staging_namespace}/comparison_result/data_types_compare.txt", + Key=get_s3_file_key_for_comparison_results( + parquet_bucket=args.parquet_bucket, + staging_namespace=args.staging_namespace, + data_type=None, + ), Body=comparison_report, ) return diff --git a/tests/conftest.py b/tests/conftest.py index 4dfbb0c7..7ea91b59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,7 +32,7 @@ def mock_aws_session(mock_aws_credentials): @pytest.fixture() def parquet_bucket_name(): - yield f"test-parquet-bucket" + yield "test-parquet-bucket" @pytest.fixture(scope="function") diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index f3d1f54e..aa38fb03 100644 --- 
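# [Editor's sketch - not part of the patch] The report persistence added to main()
# boils down to a put_object call; the bucket and key below are placeholders, and
# the Key is always interpreted relative to the Bucket.
import boto3

s3 = boto3.client("s3")
s3.put_object(
    Bucket="example-parquet-bucket",
    Key="staging/comparison_result/dataset_fitbitactivitylogs_parquet_compare.txt",
    Body="comparison report contents",
)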
a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -1,3 +1,4 @@ +import argparse from unittest import mock from moto import mock_s3 @@ -9,6 +10,39 @@ from src.glue.jobs import compare_parquet_datasets as compare_parquet +def test_that_validate_args_raises_exception_when_input_value_is_empty_string(): + with pytest.raises(argparse.ArgumentTypeError): + compare_parquet.validate_args(value="") + + +def test_that_validate_args_returns_value_when_value_is_not_an_empty_string(): + assert compare_parquet.validate_args(value="TEST") == "TEST" + + +def test_that_get_s3_file_key_for_comparison_results_returns_correct_filepath_for_data_types_compare( + parquet_bucket_name, +): + file_key = compare_parquet.get_s3_file_key_for_comparison_results( + parquet_bucket_name, "staging", data_type=None + ) + assert ( + file_key + == "test-parquet-bucket/staging/comparison_result/data_types_compare.txt" + ) + + +def test_that_get_s3_file_key_for_comparison_results_has_expected_filepath_for_specific_data_type( + parquet_bucket_name, +): + file_key = compare_parquet.get_s3_file_key_for_comparison_results( + parquet_bucket_name, "staging", data_type="dataset_fitbitactivitylogs" + ) + assert ( + file_key + == "test-parquet-bucket/staging/comparison_result/dataset_fitbitactivitylogs_parquet_compare.txt" + ) + + def test_that_get_duplicated_columns_returns_empty_if_no_dup_exist( valid_staging_dataset, ): @@ -48,10 +82,16 @@ def test_that_get_parquet_dataset_raises_attr_error_if_no_datasets_exist( @mock_s3 def test_that_get_parquet_dataset_returns_dataset_if_datasets_exist( - s3, mock_s3_filesystem, valid_staging_parquet_object, valid_staging_dataset, parquet_bucket_name + s3, + mock_s3_filesystem, + valid_staging_parquet_object, + valid_staging_dataset, + parquet_bucket_name, ): file_key = "staging/parquet/dataset_fitbitactivitylogs/test.parquet" - with mock.patch.object(parquet, "read_table", return_value=valid_staging_parquet_object) as mock_method: + with mock.patch.object( + parquet, "read_table", return_value=valid_staging_parquet_object + ) as mock_method: parquet_dataset = compare_parquet.get_parquet_dataset( dataset_key=f"{parquet_bucket_name}/{file_key}", s3_filesystem=mock_s3_filesystem, @@ -317,14 +357,81 @@ def test_that_compare_datasets_and_output_report_outputs_nonempty_str_if_input_i assert comparison_report -def test_that_add_additional_msg_to_comparison_report_outputs_correct_updated_msg(): +def test_that_add_additional_msg_to_comparison_report_returns_correct_updated_msg(): comparison_report = "some string\n\n" add_msgs = ["one message", "two message"] result = compare_parquet.add_additional_msg_to_comparison_report( - comparison_report, add_msgs + comparison_report, add_msgs, msg_type="column_name_diff" ) assert result == ( "some string\n\nColumn Name Differences\n" "-----------------------\n\n" "one message\ntwo message" ) + + +def test_that_add_additional_msg_to_comparison_report_throws_error_if_msg_type_not_valid(): + comparison_report = "some string\n\n" + add_msgs = ["one message", "two message"] + with pytest.raises(ValueError): + result = compare_parquet.add_additional_msg_to_comparison_report( + comparison_report, add_msgs, msg_type="invalid_msg_type" + ) + + +def test_that_compare_datasets_by_data_type_returns_no_data_msg_if_input_is_empty( + parquet_bucket_name, staging_dataset_empty +): + with mock.patch( + "src.glue.jobs.compare_parquet_datasets.get_parquet_dataset", + return_value=staging_dataset_empty, + ) as mock_parquet: + compare_msg = 
compare_parquet.compare_datasets_by_data_type( + parquet_bucket=parquet_bucket_name, + staging_namespace="staging", + main_namespace="main", + s3_filesystem=None, + data_type="dataset_fitbitactivitylogs", + ) + assert compare_msg == ( + "\n\nParquet Dataset Comparison running for Data Type: dataset_fitbitactivitylogs\n" + "-----------------------------------------------------------------\n\n" + "staging dataset has no data. Comparison cannot continue.\n" + "main dataset has no data. Comparison cannot continue." + ) + + +@mock.patch("src.glue.jobs.compare_parquet_datasets.compare_datasets_and_output_report") +def test_that_compare_datasets_by_data_type_calls_compare_datasets_by_data_type_if_input_is_valid( + mocked_compare_datasets, parquet_bucket_name, valid_staging_dataset +): + with mock.patch( + "src.glue.jobs.compare_parquet_datasets.get_parquet_dataset", + return_value=valid_staging_dataset, + ) as mock_parquet: + compare_parquet.compare_datasets_by_data_type( + parquet_bucket=parquet_bucket_name, + staging_namespace="staging", + main_namespace="main", + s3_filesystem=None, + data_type="dataset_fitbitactivitylogs", + ) + mocked_compare_datasets.assert_called_once() + + +@mock.patch("src.glue.jobs.compare_parquet_datasets.compare_datasets_and_output_report") +def test_that_compare_datasets_by_data_type_calls_compare_datasets_by_data_type_if_input_is_valid( + mocked_compare_datasets, parquet_bucket_name, valid_staging_dataset +): + with mock.patch( + "src.glue.jobs.compare_parquet_datasets.get_parquet_dataset", + return_value=valid_staging_dataset, + ) as mock_parquet: + compare_parquet.compare_datasets_by_data_type( + parquet_bucket=parquet_bucket_name, + staging_namespace="staging", + main_namespace="main", + s3_filesystem=None, + data_type="dataset_fitbitactivitylogs", + ) + mocked_compare_datasets.assert_called_once() From 6f9415eec922817792ba1c3d84d61be759fb177c Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Mon, 17 Apr 2023 15:04:50 -0700 Subject: [PATCH 07/18] remove debugging, add test for checking has_common_cols logic in compare_datasets_by_data_type --- src/glue/jobs/compare_parquet_datasets.py | 1 - tests/test_compare_parquet_datasets.py | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 5be43ade..94314a1e 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -439,7 +439,6 @@ def main(): msg_type="data_type_diff", ) print(comparison_report) - import pdb; pdb.set_trace() # save comparison report to report folder in staging namespace s3.put_object( Bucket=args.parquet_bucket, diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index aa38fb03..af501276 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -379,7 +379,7 @@ def test_that_add_additional_msg_to_comparison_report_throws_error_if_msg_type_n ) -def test_that_compare_datasets_by_data_type_returns_no_data_msg_if_input_is_empty( +def test_that_compare_datasets_by_data_type_returns_correct_msg_if_input_is_empty( parquet_bucket_name, staging_dataset_empty ): with mock.patch( @@ -420,8 +420,9 @@ def test_that_compare_datasets_by_data_type_calls_compare_datasets_by_data_type_ @mock.patch("src.glue.jobs.compare_parquet_datasets.compare_datasets_and_output_report") -def test_that_compare_datasets_by_data_type_calls_compare_datasets_by_data_type_if_input_is_valid( - 
mocked_compare_datasets, parquet_bucket_name, valid_staging_dataset +@mock.patch("src.glue.jobs.compare_parquet_datasets.has_common_cols", return_value = False) +def test_that_compare_datasets_by_data_type_does_not_call_compare_datasets_by_data_type_if_input_has_no_common_cols( + mocked_has_common_cols, mocked_compare_datasets, parquet_bucket_name, valid_staging_dataset ): with mock.patch( "src.glue.jobs.compare_parquet_datasets.get_parquet_dataset", @@ -434,4 +435,4 @@ def test_that_compare_datasets_by_data_type_calls_compare_datasets_by_data_type_ s3_filesystem=None, data_type="dataset_fitbitactivitylogs", ) - mocked_compare_datasets.assert_called_once() + mocked_compare_datasets.assert_not_called() From 1daffef8e2f86412806607c6a35ff6cec79ff7ca Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Mon, 17 Apr 2023 20:09:07 -0700 Subject: [PATCH 08/18] add os.path.join for more robustness --- src/glue/jobs/compare_parquet_datasets.py | 37 +++++++++++++++-------- tests/test_compare_parquet_datasets.py | 9 ++++++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 94314a1e..6f3383f5 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -45,29 +45,29 @@ def read_args(): parser.add_argument( "--staging-namespace", required=True, - type = validate_args, + type=validate_args, help="The name of the staging namespace to use", - default="staging" + default="staging", ) parser.add_argument( "--main-namespace", required=True, - type = validate_args, + type=validate_args, help=("The name of the main namespace to use"), - default="main" + default="main", ) parser.add_argument( "--parquet-bucket", required=True, - type = validate_args, + type=validate_args, help=("The name of the S3 bucket containing the S3 files to compare"), - default="recover-dev-processed_data" + default="recover-dev-processed_data", ) args = parser.parse_args() return args -def validate_args(value : str) -> str: +def validate_args(value: str) -> str: """Checks to make sure none of the input command line arguments are empty strings Args: @@ -80,7 +80,7 @@ def validate_args(value : str) -> str: str: the value as is """ if value == "": - raise argparse.ArgumentTypeError('Argument value cannot be an empty string') + raise argparse.ArgumentTypeError("Argument value cannot be an empty string") return value @@ -88,11 +88,18 @@ def get_s3_file_key_for_comparison_results( parquet_bucket: str, staging_namespace: str, data_type: bool = None ) -> str: """Gets the s3 file key for saving the comparison results to""" - s3_folder_prefix = f"{parquet_bucket}/{staging_namespace}/comparison_result" + s3_folder_prefix = os.path.join( + parquet_bucket, staging_namespace, "comparison_result" + ) if data_type: - return f"{s3_folder_prefix}/{data_type}_parquet_compare.txt" + return os.path.join(s3_folder_prefix, f"{data_type}_parquet_compare.txt") else: - return f"{s3_folder_prefix}/data_types_compare.txt" + return os.path.join(s3_folder_prefix, "data_types_compare.txt") + + +def get_parquet_dataset_s3_path(parquet_bucket: str, namespace: str, data_type: str): + """Gets the s3 filepath to the parquet datasets""" + return os.path.join("s3://", parquet_bucket, namespace, "parquet", data_type) def get_duplicated_columns(dataset: pd.DataFrame) -> list: @@ -364,11 +371,15 @@ def compare_datasets_by_data_type( f"\n-----------------------------------------------------------------\n\n" ) staging_dataset = 
get_parquet_dataset( - dataset_key=f"s3://{parquet_bucket}/{staging_namespace}/parquet/{data_type}/", + dataset_key=get_parquet_dataset_s3_path( + parquet_bucket, staging_namespace, data_type + ), s3_filesystem=s3_filesystem, ) main_dataset = get_parquet_dataset( - dataset_key=f"s3://{parquet_bucket}/{main_namespace}/parquet/{data_type}/", + dataset_key=get_parquet_dataset_s3_path( + parquet_bucket, main_namespace, data_type + ), s3_filesystem=s3_filesystem, ) # go through specific validation for each dataset prior to comparison diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index af501276..eaf370cb 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -19,6 +19,15 @@ def test_that_validate_args_returns_value_when_value_is_not_an_empty_string(): assert compare_parquet.validate_args(value="TEST") == "TEST" +def test_that_get_parquet_dataset_s3_path_returns_correct_filepath( + parquet_bucket_name +): + filepath = compare_parquet.get_parquet_dataset_s3_path( + parquet_bucket_name, "test_namespace", "dataset_fitbitactivitylogs" + ) + assert filepath == "s3://test-parquet-bucket/test_namespace/parquet/dataset_fitbitactivitylogs" + + def test_that_get_s3_file_key_for_comparison_results_returns_correct_filepath_for_data_types_compare( parquet_bucket_name, ): From 91e646eb21c9434949ea6b4e509120f934d638cb Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Thu, 20 Apr 2023 10:44:55 -0700 Subject: [PATCH 09/18] add additional printouts of entire missing rows, add rows, mismatch cols, adjust tests --- src/glue/jobs/compare_parquet_datasets.py | 89 +++++++++++++++++++---- tests/test_compare_parquet_datasets.py | 42 ++++++----- 2 files changed, 97 insertions(+), 34 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 6f3383f5..f973a320 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -85,16 +85,20 @@ def validate_args(value: str) -> str: def get_s3_file_key_for_comparison_results( - parquet_bucket: str, staging_namespace: str, data_type: bool = None + staging_namespace: str, data_type: str = None, file_name: str = "" ) -> str: - """Gets the s3 file key for saving the comparison results to""" - s3_folder_prefix = os.path.join( - parquet_bucket, staging_namespace, "comparison_result" - ) - if data_type: - return os.path.join(s3_folder_prefix, f"{data_type}_parquet_compare.txt") + """Gets the s3 file key for saving the comparison results to + Note that file_name should contain the suffix""" + s3_folder_prefix = os.path.join(staging_namespace, "comparison_result") + if file_name.endswith(".csv") or file_name.endswith(".txt"): + if data_type: + return os.path.join(s3_folder_prefix, f"{data_type}_{file_name}") + else: + return os.path.join(s3_folder_prefix, f"{file_name}") else: - return os.path.join(s3_folder_prefix, "data_types_compare.txt") + raise TypeError( + f"file_name {file_name} should contain one of the following file extensions: [.txt, .csv]" + ) def get_parquet_dataset_s3_path(parquet_bucket: str, namespace: str, data_type: str): @@ -194,6 +198,26 @@ def get_folders_in_s3_bucket( return folders +def compare_row_diffs(compare_obj: datacompy.Compare, namespace: str): + """Uses the datacompy Compare object to get all rows that are different + in each dataset + Args: + compare_obj (datacompy.Compare): _description_ + dataset (int): _description_ + """ + if namespace == "staging": + columns = 
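# [Editor's sketch - not part of the patch] What the os.path.join-based path
# helpers introduced above evaluate to on a POSIX runner (as on Glue workers);
# names are placeholders.
import os

dataset_uri = os.path.join(
    "s3://", "example-parquet-bucket", "staging", "parquet", "dataset_fitbitactivitylogs"
)
# 's3://example-parquet-bucket/staging/parquet/dataset_fitbitactivitylogs'

report_key = os.path.join("staging", "comparison_result", "data_types_compare.txt")
# 'staging/comparison_result/data_types_compare.txt'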
compare_obj.df1_unq_rows.columns + rows = compare_obj.df1_unq_rows.sample(compare_obj.df1_unq_rows.shape[0])[ + columns + ] + elif namespace == "main": + columns = compare_obj.df2_unq_rows.columns + rows += compare_obj.df2_unq_rows.sample(compare_obj.df2_unq_rows.shape[0])[ + columns + ] + return rows + + def compare_column_names( data_type: str, staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame ) -> list: @@ -261,13 +285,13 @@ def compare_datasets_and_output_report( main_dataset: pd.DataFrame, staging_namespace: str, main_namespace: str, -) -> str: +) -> datacompy.Compare: """This method prints out a human-readable report summarizing and sampling differences between datasets for the given data type. A full list of comparisons can be found in the datacompy package site. Returns: - str: large string block of the report + datacompy.Compare: object containing """ compare = datacompy.Compare( df1=staging_dataset, @@ -279,7 +303,7 @@ def compare_datasets_and_output_report( df2_name=main_namespace, # Optional, defaults to 'df2' ) compare.matches(ignore_extra_columns=False) - return compare.report() + return compare def add_additional_msg_to_comparison_report( @@ -435,14 +459,14 @@ def main(): ) if data_types_to_compare: for data_type in data_types_to_compare: - comparison_report = compare_datasets_by_data_type( + compare = compare_datasets_by_data_type( parquet_bucket=args.parquet_bucket, staging_namespace=args.staging_namespace, main_namespace=args.main_namespace, s3_filesystem=fs, data_type=data_type, ) - + comparison_report = compare.report() # update comparison report with the data_type differences message comparison_report = add_additional_msg_to_comparison_report( comparison_report, @@ -454,12 +478,47 @@ def main(): s3.put_object( Bucket=args.parquet_bucket, Key=get_s3_file_key_for_comparison_results( - parquet_bucket=args.parquet_bucket, staging_namespace=args.staging_namespace, data_type=data_type, + file_name="_parquet_compare.txt", ), Body=comparison_report, ) + # additional print outs + mismatch_cols_report = compare.all_mismatch() + if mismatch_cols_report: + s3.put_object( + Bucket=args.parquet_bucket, + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args.staging_namespace, + data_type=data_type, + file_name="all_mismatch_cols.csv", + ), + Body=mismatch_cols_report, + ) + staging_rows_report = compare_row_diffs(compare, namespace="staging") + if staging_rows_report: + s3.put_object( + Bucket=args.parquet_bucket, + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args.staging_namespace, + data_type=data_type, + file_name="all_diff_staging_rows.csv", + ), + Body=staging_rows_report, + ) + main_rows_report = compare_row_diffs(compare, namespace="main") + if main_rows_report: + s3.put_object( + Bucket=args.parquet_bucket, + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args.staging_namespace, + data_type=data_type, + file_name="all_diff_main_rows.csv", + ), + Body=main_rows_report, + ) + else: # update comparison report with the data_type differences message comparison_report = add_additional_msg_to_comparison_report( @@ -471,9 +530,9 @@ def main(): s3.put_object( Bucket=args.parquet_bucket, Key=get_s3_file_key_for_comparison_results( - parquet_bucket=args.parquet_bucket, staging_namespace=args.staging_namespace, data_type=None, + file_name="data_types_compare.txt", ), Body=comparison_report, ) diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index eaf370cb..82b63d46 100644 --- 
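# [Editor's sketch - not part of the patch] The datacompy surface used by
# compare_row_diffs and the report writing above, on a tiny stand-in example.
import datacompy
import pandas as pd

staging = pd.DataFrame({"LogId": ["1", "2"], "Calories": [100, 200]})
main = pd.DataFrame({"LogId": ["1", "3"], "Calories": [150, 300]})

compare = datacompy.Compare(
    df1=staging,
    df2=main,
    join_columns="LogId",
    df1_name="staging",
    df2_name="main",
)
compare.matches(ignore_extra_columns=False)
summary = compare.report()           # human-readable text summary
mismatches = compare.all_mismatch()  # joined rows whose values differ
staging_only = compare.df1_unq_rows  # rows present only in staging
main_only = compare.df2_unq_rows     # rows present only in main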
a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -1,6 +1,7 @@ import argparse from unittest import mock +import datacompy from moto import mock_s3 import pandas as pd from pandas.testing import assert_frame_equal @@ -28,29 +29,31 @@ def test_that_get_parquet_dataset_s3_path_returns_correct_filepath( assert filepath == "s3://test-parquet-bucket/test_namespace/parquet/dataset_fitbitactivitylogs" -def test_that_get_s3_file_key_for_comparison_results_returns_correct_filepath_for_data_types_compare( - parquet_bucket_name, -): +def test_that_get_s3_file_key_for_comparison_results_returns_correct_filepath_for_data_types_compare(): file_key = compare_parquet.get_s3_file_key_for_comparison_results( - parquet_bucket_name, "staging", data_type=None + "staging", data_type=None, file_name = "data_types_compare.txt" ) assert ( file_key - == "test-parquet-bucket/staging/comparison_result/data_types_compare.txt" + == "staging/comparison_result/data_types_compare.txt" ) -def test_that_get_s3_file_key_for_comparison_results_has_expected_filepath_for_specific_data_type( - parquet_bucket_name, -): +def test_that_get_s3_file_key_for_comparison_results_has_expected_filepath_for_specific_data_type(): file_key = compare_parquet.get_s3_file_key_for_comparison_results( - parquet_bucket_name, "staging", data_type="dataset_fitbitactivitylogs" + "staging", data_type="dataset_fitbitactivitylogs", file_name = "parquet_compare.txt" ) assert ( file_key - == "test-parquet-bucket/staging/comparison_result/dataset_fitbitactivitylogs_parquet_compare.txt" + == "staging/comparison_result/dataset_fitbitactivitylogs_parquet_compare.txt" ) +def test_that_get_s3_file_key_for_comparison_results_raises_type_error_if_filename_has_wrong_file_ext(): + with pytest.raises(TypeError): + file_key = compare_parquet.get_s3_file_key_for_comparison_results( + "staging", data_type="dataset_fitbitactivitylogs", file_name = "parquet_compare.pdf" + ) + def test_that_get_duplicated_columns_returns_empty_if_no_dup_exist( valid_staging_dataset, @@ -352,18 +355,19 @@ def test_that_is_valid_dataset_returns_true_if_dataset_has_empty_cols( ["staging_dataset_with_empty_columns", "valid_staging_dataset"], indirect=True, ) -def test_that_compare_datasets_and_output_report_outputs_nonempty_str_if_input_is_valid( +def test_that_compare_datasets_and_output_report_outputs_datacompy_compare_obj_if_input_is_valid( dataset_fixture, valid_main_dataset ): - comparison_report = compare_parquet.compare_datasets_and_output_report( - "dataset_fitbitactivitylogs", - dataset_fixture, - valid_main_dataset, - "staging", - "main", + compare = compare_parquet.compare_datasets_and_output_report( + data_type="dataset_fitbitactivitylogs", + staging_dataset=dataset_fixture, + main_dataset=valid_main_dataset, + staging_namespace="staging", + main_namespace="main", ) - assert isinstance(comparison_report, str) - assert comparison_report + assert isinstance(compare, datacompy.Compare) + assert_frame_equal(compare.df1, dataset_fixture) + assert_frame_equal(compare.df2, valid_main_dataset) def test_that_add_additional_msg_to_comparison_report_returns_correct_updated_msg(): From a51baee9172b70b1f0bec7061c958b285b9737ba Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Thu, 20 Apr 2023 12:28:11 -0700 Subject: [PATCH 10/18] add function for converting csv to text for s3.put_object, add tests for getting all row diff, add linting, adjust function to output datacompy.Compare object to allow for more logs --- src/glue/jobs/compare_parquet_datasets.py | 48 
+++++++--- tests/conftest.py | 9 +- tests/test_compare_parquet_datasets.py | 109 +++++++++++++++++++--- 3 files changed, 132 insertions(+), 34 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index f973a320..8cd32e9b 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -1,8 +1,10 @@ import argparse +from io import StringIO import json import os import logging + import boto3 import datacompy import pandas as pd @@ -131,6 +133,14 @@ def get_additional_cols( return add_cols +def convert_dataframe_to_text(dataset: pd.DataFrame) -> str: + """Converts a pandas DataFrame into a string to save as csv in S3""" + csv_buffer = StringIO() + dataset.to_csv(csv_buffer) + csv_content = csv_buffer.getvalue() + return csv_content + + def get_S3FileSystem_from_session( aws_session: boto3.session.Session, ) -> fs.S3FileSystem: @@ -212,7 +222,7 @@ def compare_row_diffs(compare_obj: datacompy.Compare, namespace: str): ] elif namespace == "main": columns = compare_obj.df2_unq_rows.columns - rows += compare_obj.df2_unq_rows.sample(compare_obj.df2_unq_rows.shape[0])[ + rows = compare_obj.df2_unq_rows.sample(compare_obj.df2_unq_rows.shape[0])[ columns ] return rows @@ -301,6 +311,7 @@ def compare_datasets_and_output_report( rel_tol=0, # Optional, defaults to 0 df1_name=staging_namespace, # Optional, defaults to 'df1' df2_name=main_namespace, # Optional, defaults to 'df2' + cast_column_names_lower=False, ) compare.matches(ignore_extra_columns=False) return compare @@ -377,7 +388,7 @@ def compare_datasets_by_data_type( main_namespace: str, s3_filesystem: fs.S3FileSystem, data_type: str, -) -> str: +) -> dict: """This runs the bulk of the comparison functions from beginning to end by data type Args: @@ -388,7 +399,9 @@ def compare_datasets_by_data_type( data_type (str): data type to be compared for the given datasets Returns: - str: final report on the datasets for the given data type + dict: + compare_obj: the datacompy.Compare obj on the two datasets + comparison_report:final report on the datasets for the given data type """ header_msg = ( f"\n\nParquet Dataset Comparison running for Data Type: {data_type}" @@ -416,14 +429,16 @@ def compare_datasets_by_data_type( comparison_report = ( f"{staging_is_valid_result['msg']}\n{main_is_valid_result['msg']}" ) + compare = None # check that they have columns in common to compare elif not has_common_cols(staging_dataset, main_dataset): comparison_report = ( f"{staging_namespace} dataset and {main_namespace} have no columns in common." f" Comparison cannot continue." 
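# [Editor's sketch - not part of the patch] The serialization step behind
# convert_dataframe_to_text above: put_object accepts only bytes or str, so the
# frame is rendered to CSV text first.
from io import StringIO

import pandas as pd

frame = pd.DataFrame({"LogId": ["1"], "Calories": [100]})
buffer = StringIO()
frame.to_csv(buffer)
csv_text = buffer.getvalue()  # plain str, safe to pass as Body= in put_object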
) + compare = None else: - comparison_report = compare_datasets_and_output_report( + compare = compare_datasets_and_output_report( data_type=data_type, staging_dataset=staging_dataset, main_dataset=main_dataset, @@ -432,11 +447,14 @@ def compare_datasets_by_data_type( ) comparison_report = add_additional_msg_to_comparison_report( - comparison_report, + compare.report(), add_msgs=compare_column_names(data_type, staging_dataset, main_dataset), msg_type="column_name_diff", ) - return f"{header_msg}{comparison_report}" + return { + "compare_obj": compare, + "comparison_report": f"{header_msg}{comparison_report}", + } def main(): @@ -459,17 +477,16 @@ def main(): ) if data_types_to_compare: for data_type in data_types_to_compare: - compare = compare_datasets_by_data_type( + compare_dict = compare_datasets_by_data_type( parquet_bucket=args.parquet_bucket, staging_namespace=args.staging_namespace, main_namespace=args.main_namespace, s3_filesystem=fs, data_type=data_type, ) - comparison_report = compare.report() # update comparison report with the data_type differences message comparison_report = add_additional_msg_to_comparison_report( - comparison_report, + compare_dict["comparison_report"], add_msgs=data_types_diff, msg_type="data_type_diff", ) @@ -485,8 +502,9 @@ def main(): Body=comparison_report, ) # additional print outs + compare = compare_dict["compare_obj"] mismatch_cols_report = compare.all_mismatch() - if mismatch_cols_report: + if not mismatch_cols_report.empty: s3.put_object( Bucket=args.parquet_bucket, Key=get_s3_file_key_for_comparison_results( @@ -494,10 +512,10 @@ def main(): data_type=data_type, file_name="all_mismatch_cols.csv", ), - Body=mismatch_cols_report, + Body=convert_dataframe_to_text(mismatch_cols_report), ) staging_rows_report = compare_row_diffs(compare, namespace="staging") - if staging_rows_report: + if not staging_rows_report.empty: s3.put_object( Bucket=args.parquet_bucket, Key=get_s3_file_key_for_comparison_results( @@ -505,10 +523,10 @@ def main(): data_type=data_type, file_name="all_diff_staging_rows.csv", ), - Body=staging_rows_report, + Body=convert_dataframe_to_text(staging_rows_report), ) main_rows_report = compare_row_diffs(compare, namespace="main") - if main_rows_report: + if not main_rows_report.empty: s3.put_object( Bucket=args.parquet_bucket, Key=get_s3_file_key_for_comparison_results( @@ -516,7 +534,7 @@ def main(): data_type=data_type, file_name="all_diff_main_rows.csv", ), - Body=main_rows_report, + Body=convert_dataframe_to_text(main_rows_report), ) else: diff --git a/tests/conftest.py b/tests/conftest.py index 7ea91b59..52373458 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,17 +38,18 @@ def parquet_bucket_name(): @pytest.fixture(scope="function") def mock_s3_filesystem(mock_aws_session): with mock_s3(): - session_credentials = mock_aws_session.get_credentials() - yield fs.S3FileSystem( + session_credentials = mock_aws_session.get_credentials() + yield fs.S3FileSystem( region="us-east-1", access_key=session_credentials.access_key, secret_key=session_credentials.secret_key, - session_token=session_credentials.token) + session_token=session_credentials.token, + ) @pytest.fixture(scope="function") def valid_staging_parquet_object(tmpdir_factory, valid_staging_dataset): - filename = str(tmpdir_factory.mktemp('data_folder').join('df.parquet')) + filename = str(tmpdir_factory.mktemp("data_folder").join("df.parquet")) valid_staging_dataset.to_parquet(path=filename, engine="pyarrow") data = parquet.read_table(filename) yield data diff 
--git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index 82b63d46..5d96a3d5 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -20,38 +20,41 @@ def test_that_validate_args_returns_value_when_value_is_not_an_empty_string(): assert compare_parquet.validate_args(value="TEST") == "TEST" -def test_that_get_parquet_dataset_s3_path_returns_correct_filepath( - parquet_bucket_name -): +def test_that_get_parquet_dataset_s3_path_returns_correct_filepath(parquet_bucket_name): filepath = compare_parquet.get_parquet_dataset_s3_path( parquet_bucket_name, "test_namespace", "dataset_fitbitactivitylogs" ) - assert filepath == "s3://test-parquet-bucket/test_namespace/parquet/dataset_fitbitactivitylogs" + assert ( + filepath + == "s3://test-parquet-bucket/test_namespace/parquet/dataset_fitbitactivitylogs" + ) def test_that_get_s3_file_key_for_comparison_results_returns_correct_filepath_for_data_types_compare(): file_key = compare_parquet.get_s3_file_key_for_comparison_results( - "staging", data_type=None, file_name = "data_types_compare.txt" - ) - assert ( - file_key - == "staging/comparison_result/data_types_compare.txt" + "staging", data_type=None, file_name="data_types_compare.txt" ) + assert file_key == "staging/comparison_result/data_types_compare.txt" def test_that_get_s3_file_key_for_comparison_results_has_expected_filepath_for_specific_data_type(): file_key = compare_parquet.get_s3_file_key_for_comparison_results( - "staging", data_type="dataset_fitbitactivitylogs", file_name = "parquet_compare.txt" + "staging", + data_type="dataset_fitbitactivitylogs", + file_name="parquet_compare.txt", ) assert ( file_key == "staging/comparison_result/dataset_fitbitactivitylogs_parquet_compare.txt" ) + def test_that_get_s3_file_key_for_comparison_results_raises_type_error_if_filename_has_wrong_file_ext(): with pytest.raises(TypeError): file_key = compare_parquet.get_s3_file_key_for_comparison_results( - "staging", data_type="dataset_fitbitactivitylogs", file_name = "parquet_compare.pdf" + "staging", + data_type="dataset_fitbitactivitylogs", + file_name="parquet_compare.pdf", ) @@ -201,6 +204,77 @@ def test_that_get_additional_cols_returns_list_of_cols_if_add_cols( assert test_add_cols == ["AverageHeartRate"] +def test_that_dataframe_to_text_returns_str(valid_staging_dataset): + staging_content = compare_parquet.convert_dataframe_to_text(valid_staging_dataset) + assert isinstance(staging_content, str) + + +def test_that_dataframe_to_text_returns_valid_format_for_s3_put_object( + s3, parquet_bucket_name, valid_staging_dataset +): + # shouldn't throw a botocore.exceptions.ParamValidationError + s3.create_bucket(Bucket=parquet_bucket_name) + staging_content = compare_parquet.convert_dataframe_to_text(valid_staging_dataset) + s3.put_object( + Bucket=parquet_bucket_name, + Key=f"staging/parquet/dataset_fitbitactivitylogs/test.csv", + Body=staging_content, + ) + + +def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff( + valid_staging_dataset, valid_main_dataset +): + compare = datacompy.Compare( + df1=valid_staging_dataset, + df2=valid_main_dataset, + join_columns="LogId", + df1_name="staging", # Optional, defaults to 'df1' + df2_name="main", # Optional, defaults to 'df2' + ) + staging_rows = compare_parquet.compare_row_diffs(compare, namespace="staging") + main_rows = compare_parquet.compare_row_diffs(compare, namespace="main") + assert staging_rows.empty + assert main_rows.empty + + +def 
test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff( + staging_dataset_with_diff_num_of_rows, valid_main_dataset +): + compare = datacompy.Compare( + df1=staging_dataset_with_diff_num_of_rows, + df2=valid_main_dataset, + join_columns="LogId", + df1_name="staging", # Optional, defaults to 'df1' + df2_name="main", # Optional, defaults to 'df2' + cast_column_names_lower=False, + ) + staging_rows = compare_parquet.compare_row_diffs(compare, namespace="staging") + main_rows = compare_parquet.compare_row_diffs(compare, namespace="main") + assert staging_rows.empty + assert_frame_equal( + main_rows.reset_index(drop=True), + pd.DataFrame( + { + "LogId": [ + "46096730542", + "51739302864", + ], + "StartDate": [ + "2022-02-18T08:26:54+00:00", + "2022-10-28T11:58:50+00:00", + ], + "EndDate": [ + "2022-02-18T09:04:30+00:00", + "2022-10-28T12:35:38+00:00", + ], + "ActiveDuration": ["2256000", "2208000"], + "Calories": ["473", "478"], + } + ).reset_index(drop=True), + ) + + def test_that_compare_column_names_returns_empty_msg_if_cols_are_same( valid_staging_dataset, valid_main_dataset ): @@ -399,14 +473,14 @@ def test_that_compare_datasets_by_data_type_returns_correct_msg_if_input_is_empt "src.glue.jobs.compare_parquet_datasets.get_parquet_dataset", return_value=staging_dataset_empty, ) as mock_parquet: - compare_msg = compare_parquet.compare_datasets_by_data_type( + compare_dict = compare_parquet.compare_datasets_by_data_type( parquet_bucket=parquet_bucket_name, staging_namespace="staging", main_namespace="main", s3_filesystem=None, data_type="dataset_fitbitactivitylogs", ) - assert compare_msg == ( + assert compare_dict["comparison_report"] == ( "\n\nParquet Dataset Comparison running for Data Type: dataset_fitbitactivitylogs\n" "-----------------------------------------------------------------\n\n" "staging dataset has no data. 
Comparison cannot continue.\n"
@@ -433,9 +507,14 @@ def test_that_compare_datasets_by_data_type_calls_compare_datasets_by_data_type_
 @mock.patch("src.glue.jobs.compare_parquet_datasets.compare_datasets_and_output_report")
-@mock.patch("src.glue.jobs.compare_parquet_datasets.has_common_cols", return_value = False)
+@mock.patch(
+    "src.glue.jobs.compare_parquet_datasets.has_common_cols", return_value=False
+)
 def test_that_compare_datasets_by_data_type_does_not_call_compare_datasets_by_data_type_if_input_has_no_common_cols(
-    mocked_has_common_cols, mocked_compare_datasets, parquet_bucket_name, valid_staging_dataset
+    mocked_has_common_cols,
+    mocked_compare_datasets,
+    parquet_bucket_name,
+    valid_staging_dataset,
 ):
     with mock.patch(
         "src.glue.jobs.compare_parquet_datasets.get_parquet_dataset",

From ba124764deb1d5e103385d1792745a2c23d56884 Mon Sep 17 00:00:00 2001
From: Rixing Xu
Date: Thu, 20 Apr 2023 13:53:10 -0700
Subject: [PATCH 11/18] add duplicates report, add TODOs, add more robust
 tests, re-org saving of reports

---
 src/glue/jobs/compare_parquet_datasets.py | 99 ++++++++++++++++++++---
 tests/conftest.py                         |  4 +-
 tests/test_compare_parquet_datasets.py    | 53 +++++++++++-
 3 files changed, 144 insertions(+), 12 deletions(-)

diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py
index 8cd32e9b..0a08f9c6 100644
--- a/src/glue/jobs/compare_parquet_datasets.py
+++ b/src/glue/jobs/compare_parquet_datasets.py
@@ -90,11 +90,16 @@ def get_s3_file_key_for_comparison_results(
     staging_namespace: str, data_type: str = None, file_name: str = ""
 ) -> str:
     """Gets the s3 file key for saving the comparison results to
-    Note that file_name should contain the suffix"""
+    Note that file_name should contain the file extension.
+
+    NOTE: When using s3.put_object, if the bucket name is included in the
+    Key parameter that holds the filepath to save to, it is just treated
+    as another folder, so an extra folder ends up being created inside
+    the bucket"""
     s3_folder_prefix = os.path.join(staging_namespace, "comparison_result")
     if file_name.endswith(".csv") or file_name.endswith(".txt"):
         if data_type:
-            return os.path.join(s3_folder_prefix, f"{data_type}_{file_name}")
+            return os.path.join(s3_folder_prefix, data_type, f"{file_name}")
         else:
             return os.path.join(s3_folder_prefix, f"{file_name}")
     else:
@@ -134,7 +139,10 @@ def get_additional_cols(
 
 
 def convert_dataframe_to_text(dataset: pd.DataFrame) -> str:
-    """Converts a pandas DataFrame into a string to save as csv in S3"""
+    """Converts a pandas DataFrame into a string to save as csv in S3
+
+    NOTE: This is because s3.put_object only accepts bytes or string
+    objects when saving them to S3"""
    csv_buffer = StringIO()
     dataset.to_csv(csv_buffer)
     csv_content = csv_buffer.getvalue()
@@ -174,6 +182,10 @@ def get_parquet_dataset(
 
     Returns:
         pandas.DataFrame
+
+    TODO: Currently, the internal pyarrow to_table call made by the
+    read_table function below takes a while as the dataset grows bigger.
+    Could find a way to optimize that.
""" table_source = dataset_key.split("s3://")[-1] parquet_dataset = pq.read_table(source=table_source, filesystem=s3_filesystem) @@ -208,12 +220,37 @@ def get_folders_in_s3_bucket( return folders -def compare_row_diffs(compare_obj: datacompy.Compare, namespace: str): +def get_duplicates(compare_obj: datacompy.Compare, namespace: str) -> pd.DataFrame: + """Uses the datacompy Compare object to get all duplicates for a given dataset + Args: + compare_obj (datacompy.Compare): compare object that was defined earlier + namespace (str): The dataset we want to get the duplicated rows from + + Returns: + pd.DataFrame: All the duplicated rows + """ + if namespace == "staging": + dup_rows = compare_obj.df1[ + compare_obj.df1.duplicated(subset=compare_obj.join_columns) + ] + elif namespace == "main": + dup_rows = compare_obj.df2[ + compare_obj.df2.duplicated(subset=compare_obj.join_columns) + ] + else: + raise KeyError("namespace can only be one of 'staging', 'main'") + return dup_rows + + +def compare_row_diffs(compare_obj: datacompy.Compare, namespace: str) -> pd.DataFrame: """Uses the datacompy Compare object to get all rows that are different in each dataset Args: - compare_obj (datacompy.Compare): _description_ - dataset (int): _description_ + compare_obj (datacompy.Compare): compare object that was defined earlier + namespace (str): The dataset we want to get the rows that are different from + + Returns: + pd.DataFrame: All the rows that's in one dataframe but not the other """ if namespace == "staging": columns = compare_obj.df1_unq_rows.columns @@ -225,6 +262,8 @@ def compare_row_diffs(compare_obj: datacompy.Compare, namespace: str): rows = compare_obj.df2_unq_rows.sample(compare_obj.df2_unq_rows.shape[0])[ columns ] + else: + raise KeyError("namespace can only be one of 'staging', 'main'") return rows @@ -302,6 +341,11 @@ def compare_datasets_and_output_report( Returns: datacompy.Compare: object containing + + TODO: Look into using datacompy.SparkCompare as in the docs, it mentions + it works with data that is partitioned Parquet, CSV, or JSON files, + or Cerebro tables. This will also likely be necessary once we have + more data coming in over weeks and months """ compare = datacompy.Compare( df1=staging_dataset, @@ -490,19 +534,24 @@ def main(): add_msgs=data_types_diff, msg_type="data_type_diff", ) - print(comparison_report) # save comparison report to report folder in staging namespace s3.put_object( Bucket=args.parquet_bucket, Key=get_s3_file_key_for_comparison_results( staging_namespace=args.staging_namespace, data_type=data_type, - file_name="_parquet_compare.txt", + file_name="parquet_compare.txt", ), Body=comparison_report, ) - # additional print outs + logger.info("Comparison report saved!") + # additional report print outs compare = compare_dict["compare_obj"] + # TODO: Find out if pandas.to_csv, or direct write to S3 + # is more efficient. s3.put_object is very slow and memory heavy + # esp. 
if using StringIO conversion + + # print out all mismatch columns mismatch_cols_report = compare.all_mismatch() if not mismatch_cols_report.empty: s3.put_object( @@ -514,6 +563,8 @@ def main(): ), Body=convert_dataframe_to_text(mismatch_cols_report), ) + logger.info("Mismatch columns saved!") + # print out all staging rows that are different to main staging_rows_report = compare_row_diffs(compare, namespace="staging") if not staging_rows_report.empty: s3.put_object( @@ -525,6 +576,8 @@ def main(): ), Body=convert_dataframe_to_text(staging_rows_report), ) + logger.info("Different staging dataset rows saved!") + # print out all main rows that are different to staging main_rows_report = compare_row_diffs(compare, namespace="main") if not main_rows_report.empty: s3.put_object( @@ -536,7 +589,34 @@ def main(): ), Body=convert_dataframe_to_text(main_rows_report), ) + logger.info("Different main dataset rows saved!") + # print out all staging duplicated rows + staging_dups_report = get_duplicates(compare, namespace ="staging") + if not staging_dups_report.empty: + s3.put_object( + Bucket=args.parquet_bucket, + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args.staging_namespace, + data_type=data_type, + file_name="all_dup_staging_rows.csv", + ), + Body=convert_dataframe_to_text(staging_dups_report), + ) + logger.info("Staging dataset duplicates saved!") + # print out all main duplicated rows + main_dups_report = get_duplicates(compare, namespace ="main") + if not main_dups_report.empty: + s3.put_object( + Bucket=args.parquet_bucket, + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args.staging_namespace, + data_type=data_type, + file_name="all_dup_main_rows.csv", + ), + Body=convert_dataframe_to_text(main_dups_report), + ) + logger.info("Main dataset duplicates saved!") else: # update comparison report with the data_type differences message comparison_report = add_additional_msg_to_comparison_report( @@ -554,6 +634,7 @@ def main(): ), Body=comparison_report, ) + logger.info("Comparison report saved!") return diff --git a/tests/conftest.py b/tests/conftest.py index 52373458..faf4bd3f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -225,8 +225,8 @@ def staging_dataset_with_dup_indexes(): yield pd.DataFrame( { "LogId": ["44984262767", "44984262767"], - "StartDate": ["2021-12-24T14:27:39+00:00", "2021-12-24T14:27:39+00:00"], - "EndDate": ["2021-12-24T14:40:27+00:00", "2021-12-24T14:40:27+00:00"], + "StartDate": ["2021-12-24T14:27:39", "2021-12-24T14:27:40"], + "EndDate": ["2021-12-24T14:40:27", "2021-12-24T14:40:28"], } ) diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index 5d96a3d5..d199c723 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -45,7 +45,7 @@ def test_that_get_s3_file_key_for_comparison_results_has_expected_filepath_for_s ) assert ( file_key - == "staging/comparison_result/dataset_fitbitactivitylogs_parquet_compare.txt" + == "staging/comparison_result/dataset_fitbitactivitylogs/parquet_compare.txt" ) @@ -222,6 +222,50 @@ def test_that_dataframe_to_text_returns_valid_format_for_s3_put_object( ) +def test_that_get_duplicates_returns_empty_df_if_no_dups( + valid_staging_dataset, valid_main_dataset +): + compare = datacompy.Compare( + df1=valid_staging_dataset, + df2=valid_main_dataset, + join_columns="LogId", + df1_name="staging", # Optional, defaults to 'df1' + df2_name="main", # Optional, defaults to 'df2' + cast_column_names_lower=False, + ) + 
staging_dups = compare_parquet.get_duplicates(compare, namespace="staging")
+    assert staging_dups.empty
+
+
+def test_that_get_duplicates_returns_dups_df_if_dups_exist(
+    staging_dataset_with_dup_indexes, valid_main_dataset
+):
+    compare = datacompy.Compare(
+        df1=staging_dataset_with_dup_indexes,
+        df2=valid_main_dataset,
+        join_columns="LogId",
+        df1_name="staging",  # Optional, defaults to 'df1'
+        df2_name="main",  # Optional, defaults to 'df2'
+        cast_column_names_lower=False,
+    )
+    staging_dups = compare_parquet.get_duplicates(compare, namespace="staging")
+    assert_frame_equal(
+        staging_dups.reset_index(drop=True),
+        pd.DataFrame(
+            {
+                "LogId": ["44984262767"],
+                "StartDate": [ "2021-12-24T14:27:40"],
+                "EndDate": ["2021-12-24T14:40:28"],
+            }
+        ).reset_index(drop=True),
+    )
+
+
+def test_that_get_duplicates_raises_key_error_if_namespace_invalid():
+    with pytest.raises(KeyError):
+        compare_parquet.get_duplicates(None, namespace="invalid")
+
+
 def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff(
     valid_staging_dataset, valid_main_dataset
 ):
@@ -231,6 +275,7 @@ def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff(
         join_columns="LogId",
         df1_name="staging",  # Optional, defaults to 'df1'
         df2_name="main",  # Optional, defaults to 'df2'
+        cast_column_names_lower=False,
     )
     staging_rows = compare_parquet.compare_row_diffs(compare, namespace="staging")
     main_rows = compare_parquet.compare_row_diffs(compare, namespace="main")
@@ -275,6 +320,12 @@ def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff(
     )
 
 
+def test_that_compare_row_diffs_raises_key_error_is_namespace_is_invalid(
+):
+    with pytest.raises(KeyError):
+        compare_parquet.compare_row_diffs(None, namespace="invalid_namespace")
+
+
 def test_that_compare_column_names_returns_empty_msg_if_cols_are_same(
     valid_staging_dataset, valid_main_dataset
 ):

From a3ef38b566ddedb247901e376aaaa6ed5b0d487a Mon Sep 17 00:00:00 2001
From: Rixing Xu
Date: Thu, 20 Apr 2023 14:30:58 -0700
Subject: [PATCH 12/18] add more TODOs

---
 src/glue/jobs/compare_parquet_datasets.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py
index 0a08f9c6..3c825ca6 100644
--- a/src/glue/jobs/compare_parquet_datasets.py
+++ b/src/glue/jobs/compare_parquet_datasets.py
@@ -119,13 +119,15 @@ def get_duplicated_columns(dataset: pd.DataFrame) -> list:
 
 
 def has_common_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list:
-    """Gets the list of common columns between two dataframes"""
+    """Gets the list of common columns between two dataframes
+    TODO: Could look into deprecating this and using datacompy.intersect_columns function"""
     common_cols = staging_dataset.columns.intersection(main_dataset.columns).tolist()
     return common_cols != []
 
 
 def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame) -> list:
-    """Gets the list of missing columns present in main but not in staging"""
+    """Gets the list of missing columns present in main but not in staging
+    TODO: Could look into deprecating this and using datacompy.df2_unq_columns function"""
     missing_cols = main_dataset.columns.difference(staging_dataset.columns).tolist()
     return missing_cols
 
@@ -133,7 +135,8 @@ def get_missing_cols(staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame)
 def get_additional_cols(
     staging_dataset: pd.DataFrame, main_dataset: pd.DataFrame
 ) -> list:
-    """Gets the list of additional columns present in staging but not in main"""
+    """Gets the list of additional columns present in staging but not in main
+    TODO: Could look into deprecating this and using datacompy.df1_unq_columns function"""
     add_cols = staging_dataset.columns.difference(main_dataset.columns).tolist()
     return add_cols
 
@@ -521,6 +524,7 @@ def main():
     )
     if data_types_to_compare:
         for data_type in data_types_to_compare:
+            logger.info(f"Running comparison report for {data_type}")
             compare_dict = compare_datasets_by_data_type(
                 parquet_bucket=args.parquet_bucket,
                 staging_namespace=args.staging_namespace,
@@ -592,7 +596,7 @@ def main():
             logger.info("Different main dataset rows saved!")
 
             # print out all staging duplicated rows
-            staging_dups_report = get_duplicates(compare, namespace ="staging")
+            staging_dups_report = get_duplicates(compare, namespace="staging")
             if not staging_dups_report.empty:
                 s3.put_object(
                     Bucket=args.parquet_bucket,
@@ -605,7 +609,7 @@ def main():
                 )
                 logger.info("Staging dataset duplicates saved!")
             # print out all main duplicated rows
-            main_dups_report = get_duplicates(compare, namespace ="main")
+            main_dups_report = get_duplicates(compare, namespace="main")
             if not main_dups_report.empty:
                 s3.put_object(
                     Bucket=args.parquet_bucket,

From 52dcca2a72ae4718f6f79fd492bf000fc1792c94 Mon Sep 17 00:00:00 2001
From: Rixing Xu
Date: Fri, 21 Apr 2023 10:32:58 -0700
Subject: [PATCH 13/18] use resolvedoptions from aws glue, adjust all args and
 args validation func, remove unused lib

---
 src/glue/jobs/compare_parquet_datasets.py | 99 +++++++++--------------
 tests/test_compare_parquet_datasets.py    | 11 ++-
 2 files changed, 43 insertions(+), 67 deletions(-)

diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py
index 3c825ca6..6a3c1ce5 100644
--- a/src/glue/jobs/compare_parquet_datasets.py
+++ b/src/glue/jobs/compare_parquet_datasets.py
@@ -1,16 +1,16 @@
 import argparse
 from io import StringIO
 import json
-import os
 import logging
+import os
+import sys
 
-
+from awsglue.utils import getResolvedOptions
 import boto3
 import datacompy
 import pandas as pd
 from pyarrow import fs
 import pyarrow.parquet as pq
-import synapseclient
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
@@ -38,52 +38,29 @@
 }
 
 
-def read_args():
-    parser = argparse.ArgumentParser(
-        description=(
-            "Compare parquet datasets between two namespaced S3 bucket locations"
-        )
-    )
-    parser.add_argument(
-        "--staging-namespace",
-        required=True,
-        type=validate_args,
-        help="The name of the staging namespace to use",
-        default="staging",
+def read_args() -> dict:
+    """Returns the specific params that our code needs to run"""
+    args = getResolvedOptions(
+        sys.argv, ["staging-namespace", "main-namespace", "parquet-bucket"]
     )
-    parser.add_argument(
-        "--main-namespace",
-        required=True,
-        type=validate_args,
-        help=("The name of the main namespace to use"),
-        default="main",
-    )
-    parser.add_argument(
-        "--parquet-bucket",
-        required=True,
-        type=validate_args,
-        help=("The name of the S3 bucket containing the S3 files to compare"),
-        default="recover-dev-processed_data",
-    )
-    args = parser.parse_args()
+    for arg in args:
+        validate_args(args[arg])
     return args
 
 
-def validate_args(value: str) -> str:
+def validate_args(value: str) -> None:
     """Checks to make sure none of the input command line arguments are
     empty strings
 
     Args:
        value (str): the value of the command line argument parsed by argparse
 
     Raises:
-        argparse.ArgumentTypeError: when value is an empty string
-
-    Returns:
-        str: the value as is
+        ValueError: when value
is an empty string """ if value == "": - raise argparse.ArgumentTypeError("Argument value cannot be an empty string") - return value + raise ValueError("Argument value cannot be an empty string") + else: + return None def get_s3_file_key_for_comparison_results( @@ -507,28 +484,28 @@ def compare_datasets_by_data_type( def main(): args = read_args() s3 = boto3.client("s3") - aws_session = boto3.session.Session(profile_name="default", region_name="us-east-1") + aws_session = boto3.session.Session(region_name="us-east-1") fs = get_S3FileSystem_from_session(aws_session) data_types_to_compare = get_data_types_to_compare( s3, - args.parquet_bucket, - main_namespace=args.main_namespace, - staging_namespace=args.staging_namespace, + args["parquet_bucket"], + main_namespace=args["main_namespace"], + staging_namespace=args["staging_namespace"], ) data_types_diff = compare_dataset_data_types( s3, - args.parquet_bucket, - main_namespace=args.main_namespace, - staging_namespace=args.staging_namespace, + args["parquet_bucket"], + main_namespace=args["main_namespace"], + staging_namespace=args["staging_namespace"], ) if data_types_to_compare: for data_type in data_types_to_compare: logger.info(f"Running comparison report for {data_type}") compare_dict = compare_datasets_by_data_type( - parquet_bucket=args.parquet_bucket, - staging_namespace=args.staging_namespace, - main_namespace=args.main_namespace, + parquet_bucket=args["parquet_bucket"], + staging_namespace=args["staging_namespace"], + main_namespace=args["main_namespace"], s3_filesystem=fs, data_type=data_type, ) @@ -540,9 +517,9 @@ def main(): ) # save comparison report to report folder in staging namespace s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=data_type, file_name="parquet_compare.txt", ), @@ -559,9 +536,9 @@ def main(): mismatch_cols_report = compare.all_mismatch() if not mismatch_cols_report.empty: s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=data_type, file_name="all_mismatch_cols.csv", ), @@ -572,9 +549,9 @@ def main(): staging_rows_report = compare_row_diffs(compare, namespace="staging") if not staging_rows_report.empty: s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=data_type, file_name="all_diff_staging_rows.csv", ), @@ -585,9 +562,9 @@ def main(): main_rows_report = compare_row_diffs(compare, namespace="main") if not main_rows_report.empty: s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=data_type, file_name="all_diff_main_rows.csv", ), @@ -599,9 +576,9 @@ def main(): staging_dups_report = get_duplicates(compare, namespace="staging") if not staging_dups_report.empty: s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=data_type, file_name="all_dup_staging_rows.csv", ), @@ 
-612,9 +589,9 @@ def main(): main_dups_report = get_duplicates(compare, namespace="main") if not main_dups_report.empty: s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=data_type, file_name="all_dup_main_rows.csv", ), @@ -630,9 +607,9 @@ def main(): ) print(comparison_report) s3.put_object( - Bucket=args.parquet_bucket, + Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( - staging_namespace=args.staging_namespace, + staging_namespace=args["staging_namespace"], data_type=None, file_name="data_types_compare.txt", ), diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index d199c723..d794363e 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -12,12 +12,12 @@ def test_that_validate_args_raises_exception_when_input_value_is_empty_string(): - with pytest.raises(argparse.ArgumentTypeError): + with pytest.raises(ValueError): compare_parquet.validate_args(value="") -def test_that_validate_args_returns_value_when_value_is_not_an_empty_string(): - assert compare_parquet.validate_args(value="TEST") == "TEST" +def test_that_validate_args_returns_nothing_when_value_is_not_an_empty_string(): + assert compare_parquet.validate_args(value="TEST") == None def test_that_get_parquet_dataset_s3_path_returns_correct_filepath(parquet_bucket_name): @@ -254,7 +254,7 @@ def test_that_get_duplicates_returns_dups_df_if_dups_exist( pd.DataFrame( { "LogId": ["44984262767"], - "StartDate": [ "2021-12-24T14:27:40"], + "StartDate": ["2021-12-24T14:27:40"], "EndDate": ["2021-12-24T14:40:28"], } ).reset_index(drop=True), @@ -320,8 +320,7 @@ def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff( ) -def test_that_compare_row_diffs_raises_key_error_is_namespace_is_invalid( -): +def test_that_compare_row_diffs_raises_key_error_is_namespace_is_invalid(): with pytest.raises(KeyError): compare_parquet.compare_row_diffs(None, namespace="invalid_namespace") From 4c962162755e72d9f5283e340a7cee549bcb4a1f Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Mon, 24 Apr 2023 10:03:20 -0700 Subject: [PATCH 14/18] add datatype subtypes handling --- src/glue/jobs/compare_parquet_datasets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 6a3c1ce5..c51f1f9f 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -327,10 +327,13 @@ def compare_datasets_and_output_report( or Cerebro tables. 
This will also likely be necessary once we have more data coming in over weeks and months """ + # there exists folders with data subtypes, but we want to just merge on + # the main level datatypes + main_data_type = f'dataset_{data_type.split("_")[1]}' compare = datacompy.Compare( df1=staging_dataset, df2=main_dataset, - join_columns=INDEX_FIELD_MAP[data_type], + join_columns=INDEX_FIELD_MAP[main_data_type], abs_tol=0, # Optional, defaults to 0 rel_tol=0, # Optional, defaults to 0 df1_name=staging_namespace, # Optional, defaults to 'df1' From c2cc21f60932d0e3d58fa9259ebb7da253c03282 Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Mon, 24 Apr 2023 10:55:08 -0700 Subject: [PATCH 15/18] add exception catching by dataset --- src/glue/jobs/compare_parquet_datasets.py | 174 +++++++++++----------- 1 file changed, 89 insertions(+), 85 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index c51f1f9f..f8859a22 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -505,102 +505,106 @@ def main(): if data_types_to_compare: for data_type in data_types_to_compare: logger.info(f"Running comparison report for {data_type}") - compare_dict = compare_datasets_by_data_type( - parquet_bucket=args["parquet_bucket"], - staging_namespace=args["staging_namespace"], - main_namespace=args["main_namespace"], - s3_filesystem=fs, - data_type=data_type, - ) - # update comparison report with the data_type differences message - comparison_report = add_additional_msg_to_comparison_report( - compare_dict["comparison_report"], - add_msgs=data_types_diff, - msg_type="data_type_diff", - ) - # save comparison report to report folder in staging namespace - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( + + try: + compare_dict = compare_datasets_by_data_type( + parquet_bucket=args["parquet_bucket"], staging_namespace=args["staging_namespace"], + main_namespace=args["main_namespace"], + s3_filesystem=fs, data_type=data_type, - file_name="parquet_compare.txt", - ), - Body=comparison_report, - ) - logger.info("Comparison report saved!") - # additional report print outs - compare = compare_dict["compare_obj"] - # TODO: Find out if pandas.to_csv, or direct write to S3 - # is more efficient. s3.put_object is very slow and memory heavy - # esp. 
if using StringIO conversion - - # print out all mismatch columns - mismatch_cols_report = compare.all_mismatch() - if not mismatch_cols_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_mismatch_cols.csv", - ), - Body=convert_dataframe_to_text(mismatch_cols_report), - ) - logger.info("Mismatch columns saved!") - # print out all staging rows that are different to main - staging_rows_report = compare_row_diffs(compare, namespace="staging") - if not staging_rows_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_diff_staging_rows.csv", - ), - Body=convert_dataframe_to_text(staging_rows_report), ) - logger.info("Different staging dataset rows saved!") - # print out all main rows that are different to staging - main_rows_report = compare_row_diffs(compare, namespace="main") - if not main_rows_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_diff_main_rows.csv", - ), - Body=convert_dataframe_to_text(main_rows_report), - ) - logger.info("Different main dataset rows saved!") - - # print out all staging duplicated rows - staging_dups_report = get_duplicates(compare, namespace="staging") - if not staging_dups_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_dup_staging_rows.csv", - ), - Body=convert_dataframe_to_text(staging_dups_report), + # update comparison report with the data_type differences message + comparison_report = add_additional_msg_to_comparison_report( + compare_dict["comparison_report"], + add_msgs=data_types_diff, + msg_type="data_type_diff", ) - logger.info("Staging dataset duplicates saved!") - # print out all main duplicated rows - main_dups_report = get_duplicates(compare, namespace="main") - if not main_dups_report.empty: + # save comparison report to report folder in staging namespace s3.put_object( Bucket=args["parquet_bucket"], Key=get_s3_file_key_for_comparison_results( staging_namespace=args["staging_namespace"], data_type=data_type, - file_name="all_dup_main_rows.csv", + file_name="parquet_compare.txt", ), - Body=convert_dataframe_to_text(main_dups_report), + Body=comparison_report, ) - logger.info("Main dataset duplicates saved!") + logger.info("Comparison report saved!") + # additional report print outs + compare = compare_dict["compare_obj"] + # TODO: Find out if pandas.to_csv, or direct write to S3 + # is more efficient. s3.put_object is very slow and memory heavy + # esp. 
if using StringIO conversion + + # print out all mismatch columns + mismatch_cols_report = compare.all_mismatch() + if not mismatch_cols_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_mismatch_cols.csv", + ), + Body=convert_dataframe_to_text(mismatch_cols_report), + ) + logger.info("Mismatch columns saved!") + # print out all staging rows that are different to main + staging_rows_report = compare_row_diffs(compare, namespace="staging") + if not staging_rows_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_diff_staging_rows.csv", + ), + Body=convert_dataframe_to_text(staging_rows_report), + ) + logger.info("Different staging dataset rows saved!") + # print out all main rows that are different to staging + main_rows_report = compare_row_diffs(compare, namespace="main") + if not main_rows_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_diff_main_rows.csv", + ), + Body=convert_dataframe_to_text(main_rows_report), + ) + logger.info("Different main dataset rows saved!") + + # print out all staging duplicated rows + staging_dups_report = get_duplicates(compare, namespace="staging") + if not staging_dups_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_dup_staging_rows.csv", + ), + Body=convert_dataframe_to_text(staging_dups_report), + ) + logger.info("Staging dataset duplicates saved!") + # print out all main duplicated rows + main_dups_report = get_duplicates(compare, namespace="main") + if not main_dups_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_dup_main_rows.csv", + ), + Body=convert_dataframe_to_text(main_dups_report), + ) + logger.info("Main dataset duplicates saved!") + except Exception as e: + logger.info(f"ERROR: {e} with {data_type}. 
Continuing to next dataset.") else: # update comparison report with the data_type differences message comparison_report = add_additional_msg_to_comparison_report( From 3477c1812b0b57f659d89d5aa58c5eda58b48b72 Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Wed, 26 Apr 2023 15:53:45 -0700 Subject: [PATCH 16/18] add conversion to categorical, make it run per data type --- src/glue/jobs/compare_parquet_datasets.py | 207 +++++++++++----------- tests/test_compare_parquet_datasets.py | 4 + 2 files changed, 110 insertions(+), 101 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index f8859a22..6dbc2ac6 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -41,13 +41,20 @@ def read_args() -> dict: """Returns the specific params that our code needs to run""" args = getResolvedOptions( - sys.argv, ["staging-namespace", "main-namespace", "parquet-bucket"] + sys.argv, ["data-type", "staging-namespace", "main-namespace", "parquet-bucket"] ) for arg in args: validate_args(args[arg]) return args +def convert_to_categorical(dataset: pd.DataFrame) -> pd.DataFrame: + """Converts the dataset to have all categorical cols""" + for col in dataset.columns: + dataset[col] = dataset[col].astype("category") + return dataset + + def validate_args(value: str) -> None: """Checks to make sure none of the input command line arguments are empty strings @@ -465,6 +472,8 @@ def compare_datasets_by_data_type( ) compare = None else: + staging_dataset = convert_to_categorical(staging_dataset) + main_dataset = convert_to_categorical(main_dataset) compare = compare_datasets_and_output_report( data_type=data_type, staging_dataset=staging_dataset, @@ -489,6 +498,7 @@ def main(): s3 = boto3.client("s3") aws_session = boto3.session.Session(region_name="us-east-1") fs = get_S3FileSystem_from_session(aws_session) + data_type = args["data_type"] data_types_to_compare = get_data_types_to_compare( s3, @@ -503,108 +513,103 @@ def main(): staging_namespace=args["staging_namespace"], ) if data_types_to_compare: - for data_type in data_types_to_compare: - logger.info(f"Running comparison report for {data_type}") - - try: - compare_dict = compare_datasets_by_data_type( - parquet_bucket=args["parquet_bucket"], + logger.info(f"Running comparison report for {data_type}") + compare_dict = compare_datasets_by_data_type( + parquet_bucket=args["parquet_bucket"], + staging_namespace=args["staging_namespace"], + main_namespace=args["main_namespace"], + s3_filesystem=fs, + data_type=data_type, + ) + # update comparison report with the data_type differences message + comparison_report = add_additional_msg_to_comparison_report( + compare_dict["comparison_report"], + add_msgs=data_types_diff, + msg_type="data_type_diff", + ) + # save comparison report to report folder in staging namespace + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="parquet_compare.txt", + ), + Body=comparison_report, + ) + logger.info("Comparison report saved!") + # additional report print outs + compare = compare_dict["compare_obj"] + # TODO: Find out if pandas.to_csv, or direct write to S3 + # is more efficient. s3.put_object is very slow and memory heavy + # esp. 
if using StringIO conversion + + # print out all mismatch columns + mismatch_cols_report = compare.all_mismatch() + if not mismatch_cols_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_mismatch_cols.csv", + ), + Body=convert_dataframe_to_text(mismatch_cols_report), + ) + logger.info("Mismatch columns saved!") + # print out all staging rows that are different to main + staging_rows_report = compare_row_diffs(compare, namespace="staging") + if not staging_rows_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_diff_staging_rows.csv", + ), + Body=convert_dataframe_to_text(staging_rows_report), + ) + logger.info("Different staging dataset rows saved!") + # print out all main rows that are different to staging + main_rows_report = compare_row_diffs(compare, namespace="main") + if not main_rows_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_diff_main_rows.csv", + ), + Body=convert_dataframe_to_text(main_rows_report), + ) + logger.info("Different main dataset rows saved!") + + # print out all staging duplicated rows + staging_dups_report = get_duplicates(compare, namespace="staging") + if not staging_dups_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( + staging_namespace=args["staging_namespace"], + data_type=data_type, + file_name="all_dup_staging_rows.csv", + ), + Body=convert_dataframe_to_text(staging_dups_report), + ) + logger.info("Staging dataset duplicates saved!") + # print out all main duplicated rows + main_dups_report = get_duplicates(compare, namespace="main") + if not main_dups_report.empty: + s3.put_object( + Bucket=args["parquet_bucket"], + Key=get_s3_file_key_for_comparison_results( staging_namespace=args["staging_namespace"], - main_namespace=args["main_namespace"], - s3_filesystem=fs, data_type=data_type, - ) - # update comparison report with the data_type differences message - comparison_report = add_additional_msg_to_comparison_report( - compare_dict["comparison_report"], - add_msgs=data_types_diff, - msg_type="data_type_diff", - ) - # save comparison report to report folder in staging namespace - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="parquet_compare.txt", - ), - Body=comparison_report, - ) - logger.info("Comparison report saved!") - # additional report print outs - compare = compare_dict["compare_obj"] - # TODO: Find out if pandas.to_csv, or direct write to S3 - # is more efficient. s3.put_object is very slow and memory heavy - # esp. 
if using StringIO conversion - - # print out all mismatch columns - mismatch_cols_report = compare.all_mismatch() - if not mismatch_cols_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_mismatch_cols.csv", - ), - Body=convert_dataframe_to_text(mismatch_cols_report), - ) - logger.info("Mismatch columns saved!") - # print out all staging rows that are different to main - staging_rows_report = compare_row_diffs(compare, namespace="staging") - if not staging_rows_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_diff_staging_rows.csv", - ), - Body=convert_dataframe_to_text(staging_rows_report), - ) - logger.info("Different staging dataset rows saved!") - # print out all main rows that are different to staging - main_rows_report = compare_row_diffs(compare, namespace="main") - if not main_rows_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_diff_main_rows.csv", - ), - Body=convert_dataframe_to_text(main_rows_report), - ) - logger.info("Different main dataset rows saved!") - - # print out all staging duplicated rows - staging_dups_report = get_duplicates(compare, namespace="staging") - if not staging_dups_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_dup_staging_rows.csv", - ), - Body=convert_dataframe_to_text(staging_dups_report), - ) - logger.info("Staging dataset duplicates saved!") - # print out all main duplicated rows - main_dups_report = get_duplicates(compare, namespace="main") - if not main_dups_report.empty: - s3.put_object( - Bucket=args["parquet_bucket"], - Key=get_s3_file_key_for_comparison_results( - staging_namespace=args["staging_namespace"], - data_type=data_type, - file_name="all_dup_main_rows.csv", - ), - Body=convert_dataframe_to_text(main_dups_report), - ) - logger.info("Main dataset duplicates saved!") - except Exception as e: - logger.info(f"ERROR: {e} with {data_type}. 
Continuing to next dataset.") + file_name="all_dup_main_rows.csv", + ), + Body=convert_dataframe_to_text(main_dups_report), + ) + logger.info("Main dataset duplicates saved!") else: # update comparison report with the data_type differences message comparison_report = add_additional_msg_to_comparison_report( diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index d794363e..c189c267 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -11,6 +11,10 @@ from src.glue.jobs import compare_parquet_datasets as compare_parquet +def test_that_convert_to_categorical_returns_expected_dtype(valid_staging_dataset): + cat_dataset = compare_parquet.convert_to_categorical(valid_staging_dataset) + assert valid_staging_dataset["LogId"].dtype == "category" + def test_that_validate_args_raises_exception_when_input_value_is_empty_string(): with pytest.raises(ValueError): compare_parquet.validate_args(value="") From 2888fe90bdae281cc9ca9dc71964140be19ce61a Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Thu, 27 Apr 2023 09:59:22 -0700 Subject: [PATCH 17/18] add logging for memory for datasets --- src/glue/jobs/compare_parquet_datasets.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 6dbc2ac6..1fa56a2f 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -474,6 +474,14 @@ def compare_datasets_by_data_type( else: staging_dataset = convert_to_categorical(staging_dataset) main_dataset = convert_to_categorical(main_dataset) + logger.info( + f"{staging_namespace} dataset memory usage:" + f"{staging_dataset.memory_usage(deep=True).sum()/1e+6} MB" + ) + logger.info( + f"{main_namespace} dataset memory usage:" + f"{main_dataset.memory_usage(deep=True).sum()/1e+6} MB" + ) compare = compare_datasets_and_output_report( data_type=data_type, staging_dataset=staging_dataset, From c6a4184da833a642bec81fa29bc2c57cc12b6c79 Mon Sep 17 00:00:00 2001 From: Rixing Xu Date: Thu, 27 Apr 2023 11:30:35 -0700 Subject: [PATCH 18/18] remove categorical conversion, fix test name --- src/glue/jobs/compare_parquet_datasets.py | 9 --------- tests/test_compare_parquet_datasets.py | 6 +----- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/glue/jobs/compare_parquet_datasets.py b/src/glue/jobs/compare_parquet_datasets.py index 1fa56a2f..1627ad02 100644 --- a/src/glue/jobs/compare_parquet_datasets.py +++ b/src/glue/jobs/compare_parquet_datasets.py @@ -48,13 +48,6 @@ def read_args() -> dict: return args -def convert_to_categorical(dataset: pd.DataFrame) -> pd.DataFrame: - """Converts the dataset to have all categorical cols""" - for col in dataset.columns: - dataset[col] = dataset[col].astype("category") - return dataset - - def validate_args(value: str) -> None: """Checks to make sure none of the input command line arguments are empty strings @@ -472,8 +465,6 @@ def compare_datasets_by_data_type( ) compare = None else: - staging_dataset = convert_to_categorical(staging_dataset) - main_dataset = convert_to_categorical(main_dataset) logger.info( f"{staging_namespace} dataset memory usage:" f"{staging_dataset.memory_usage(deep=True).sum()/1e+6} MB" diff --git a/tests/test_compare_parquet_datasets.py b/tests/test_compare_parquet_datasets.py index c189c267..2285e95b 100644 --- a/tests/test_compare_parquet_datasets.py +++ b/tests/test_compare_parquet_datasets.py @@ -11,10 +11,6 @@ from 
src.glue.jobs import compare_parquet_datasets as compare_parquet -def test_that_convert_to_categorical_returns_expected_dtype(valid_staging_dataset): - cat_dataset = compare_parquet.convert_to_categorical(valid_staging_dataset) - assert valid_staging_dataset["LogId"].dtype == "category" - def test_that_validate_args_raises_exception_when_input_value_is_empty_string(): with pytest.raises(ValueError): compare_parquet.validate_args(value="") @@ -287,7 +283,7 @@ def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff( assert main_rows.empty -def test_that_compare_row_diffs_returns_empty_df_if_columns_are_not_diff( +def test_that_compare_row_diffs_returns_df_if_columns_are_not_diff( staging_dataset_with_diff_num_of_rows, valid_main_dataset ): compare = datacompy.Compare(