Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨Source S3 (v4): Set decimal_as_float to True for parquet files #29342

Merged
merged 29 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
eaba483
[ISSUE #28893] infer csv schema
maxi297 Aug 4, 2023
60757c1
[ISSUE #28893] align with pyarrow
maxi297 Aug 4, 2023
f49235d
Automated Commit - Formatting Changes
maxi297 Aug 4, 2023
a394666
[ISSUE #28893] legacy inference and infer only when needed
maxi297 Aug 7, 2023
4f9d162
[ISSUE #28893] fix scenario tests
maxi297 Aug 7, 2023
0617c82
[ISSUE #28893] using discovered schema as part of read
maxi297 Aug 7, 2023
d157aa3
[ISSUE #28893] self-review + cleanup
maxi297 Aug 8, 2023
57b011f
[ISSUE #28893] fix test
maxi297 Aug 8, 2023
71cdca9
[ISSUE #28893] code review part #1
maxi297 Aug 9, 2023
ef8f5f5
Merge branch 'master' into issue-28893/infer-schema-csv
maxi297 Aug 9, 2023
f651d03
[ISSUE #28893] code review part #2
maxi297 Aug 9, 2023
a573a89
Fix test
maxi297 Aug 9, 2023
0ce37e5
formatcdk
maxi297 Aug 9, 2023
82db6c3
[ISSUE #28893] code review
maxi297 Aug 9, 2023
3027a4f
FIX test log level
maxi297 Aug 9, 2023
ac91730
Re-adding failing tests
maxi297 Aug 9, 2023
f1a60ba
[ISSUE #28893] improve inference to consider multiple types per value
maxi297 Aug 10, 2023
53c9248
set decimal_as_float to True
girarda Aug 10, 2023
c71bb95
update
girarda Aug 10, 2023
7113603
Merge branch 'master' into issue-28893/infer-schema-csv
maxi297 Aug 10, 2023
c9e2004
Automated Commit - Formatting Changes
maxi297 Aug 10, 2023
e6a0b4d
add file adapters for avro, csv, jsonl, and parquet
brianjlai Aug 11, 2023
2f827c0
fix try catch
brianjlai Aug 11, 2023
cbbfe76
update
girarda Aug 11, 2023
f9fc565
merge
girarda Aug 11, 2023
8c25728
format
girarda Aug 11, 2023
868a597
pr feedback with a few additional default options set
brianjlai Aug 11, 2023
5b0c44f
merge
girarda Aug 14, 2023
408f960
merge
girarda Aug 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,11 @@ def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat,
if "autogenerate_column_names" in advanced_options:
csv_options["autogenerate_column_names"] = advanced_options["autogenerate_column_names"]
return csv_options

elif isinstance(format_options, JsonlFormat):
return {"filetype": "jsonl"}
elif isinstance(format_options, ParquetFormat):
return {"filetype": "parquet"}
return {"filetype": "parquet", "decimal_as_float": True}
else:
# This should never happen because it would fail schema validation
raise ValueError(f"Format filetype {format_options} is not a supported file type")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,13 @@
"aws_secret_access_key": "some_secret",
"endpoint": "https://external-s3.com",
"path_prefix": "a_folder/",
"start_date": "2022-01-01T01:02:03Z"

"start_date": "2022-01-01T01:02:03Z",
},
"format": {
"filetype": "avro",
},
"path_pattern": "**/*.avro",
"schema": '{"col1": "string", "col2": "integer"}'
"schema": '{"col1": "string", "col2": "integer"}',
},
{
"bucket": "test_bucket",
Expand All @@ -42,13 +41,11 @@
"globs": ["a_folder/**/*.avro"],
"validation_policy": "Emit Record",
"input_schema": '{"col1": "string", "col2": "integer"}',
"format": {
"filetype": "avro"
}
"format": {"filetype": "avro"},
}
]
}
, id="test_convert_legacy_config"
],
},
id="test_convert_legacy_config",
),
pytest.param(
{
Expand All @@ -70,15 +67,13 @@
"file_type": "avro",
"globs": ["**/*.avro"],
"validation_policy": "Emit Record",
"format": {
"filetype": "avro"
}
"format": {"filetype": "avro"},
}
]
}
, id="test_convert_no_optional_fields"
],
},
id="test_convert_no_optional_fields",
),
]
],
)
def test_convert_legacy_config(legacy_config, expected_config):
parsed_legacy_config = SourceS3Spec(**legacy_config)
Expand All @@ -101,8 +96,8 @@ def test_convert_legacy_config(legacy_config, expected_config):
"encoding": "ansi",
"double_quote": False,
"newlines_in_values": True,
"additional_reader_options": "{\"strings_can_be_null\": true}",
"advanced_options": "{\"skip_rows\": 3, \"skip_rows_after_names\": 5, \"autogenerate_column_names\": true}",
"additional_reader_options": '{"strings_can_be_null": true}',
"advanced_options": '{"skip_rows": 3, "skip_rows_after_names": 5, "autogenerate_column_names": true}',
"blocksize": 20000,
},
{
Expand All @@ -122,7 +117,8 @@ def test_convert_legacy_config(legacy_config, expected_config):
"autogenerate_column_names": True,
},
None,
id="test_csv_all_legacy_options_set"),
id="test_csv_all_legacy_options_set",
),
pytest.param(
"csv",
{
Expand All @@ -145,14 +141,15 @@ def test_convert_legacy_config(legacy_config, expected_config):
"strings_can_be_null": False,
},
None,
id="test_csv_only_required_options"),
id="test_csv_only_required_options",
),
pytest.param(
"csv",
{},
{
"filetype": "csv",
"delimiter": ",",
"quote_char": "\"",
"quote_char": '"',
"encoding": "utf8",
"double_quote": True,
"null_values": ["", "null", "NULL", "N/A", "NA", "NaN", "None"],
Expand All @@ -162,23 +159,26 @@ def test_convert_legacy_config(legacy_config, expected_config):
"strings_can_be_null": False,
},
None,
id="test_csv_empty_format"),
id="test_csv_empty_format",
),
pytest.param(
"csv",
{
"additional_reader_options": "{\"not_valid\": \"at all}",
"additional_reader_options": '{"not_valid": "at all}',
},
None,
ValueError,
id="test_malformed_additional_reader_options"),
id="test_malformed_additional_reader_options",
),
pytest.param(
"csv",
{
"advanced_options": "{\"not_valid\": \"at all}",
"advanced_options": '{"not_valid": "at all}',
},
None,
ValueError,
id="test_malformed_advanced_options"),
id="test_malformed_advanced_options",
),
pytest.param(
"jsonl",
{
Expand All @@ -187,11 +187,10 @@ def test_convert_legacy_config(legacy_config, expected_config):
"unexpected_field_behavior": "ignore",
"block_size": 0,
},
{
"filetype": "jsonl"
},
{"filetype": "jsonl"},
None,
id="test_jsonl_format"),
id="test_jsonl_format",
),
pytest.param(
"parquet",
{
Expand All @@ -200,22 +199,20 @@ def test_convert_legacy_config(legacy_config, expected_config):
"batch_size": 65536,
"buffer_size": 100,
},
{
"filetype": "parquet"
},
{"filetype": "parquet", "decimal_as_float": True},
None,
id="test_parquet_format"),
id="test_parquet_format",
),
pytest.param(
"avro",
{
"filetype": "avro",
},
{
"filetype": "avro"
},
{"filetype": "avro"},
None,
id="test_avro_format"),
]
id="test_avro_format",
),
],
)
def test_convert_file_format(file_type, legacy_format_config, expected_format_config, expected_error):
legacy_config = {
Expand All @@ -225,7 +222,6 @@ def test_convert_file_format(file_type, legacy_format_config, expected_format_co
"bucket": "test_bucket",
"aws_access_key_id": "some_access_key",
"aws_secret_access_key": "some_secret",

},
"format": legacy_format_config,
"path_pattern": f"**/*.{file_type}",
Expand All @@ -241,9 +237,9 @@ def test_convert_file_format(file_type, legacy_format_config, expected_format_co
"file_type": file_type,
"globs": [f"**/*.{file_type}"],
"validation_policy": "Emit Record",
"format": expected_format_config
"format": expected_format_config,
}
]
],
}

parsed_legacy_config = SourceS3Spec(**legacy_config)
Expand Down
Loading