-
Notifications
You must be signed in to change notification settings - Fork 4.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SAT: check records to comply jsonschema format field (#5661)
* SAT: check records to comply jsonschema format field
- Loading branch information
Showing
4 changed files
with
97 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,7 @@ | |
"pytest-timeout~=1.4", | ||
"pprintpp~=0.4", | ||
"dpath~=2.0.1", | ||
"jsonschema~=3.2.0", | ||
] | ||
|
||
setuptools.setup( | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,7 +48,8 @@ def record_schema_fixture(): | |
|
||
|
||
@pytest.fixture(name="configured_catalog") | ||
def catalog_fixture(record_schema) -> ConfiguredAirbyteCatalog: | ||
def catalog_fixture(request, record_schema) -> ConfiguredAirbyteCatalog: | ||
record_schema = request.param if hasattr(request, "param") else record_schema | ||
stream = ConfiguredAirbyteStream( | ||
stream=AirbyteStream(name="my_stream", json_schema=record_schema), | ||
sync_mode=SyncMode.full_refresh, | ||
|
@@ -96,3 +97,61 @@ def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog): | |
assert len(streams_with_errors) == 1, "only one stream" | ||
assert len(streams_with_errors["my_stream"]) == 3, "only first error for each field" | ||
assert errors == ["123 is not of type 'null', 'string'", "'text' is not of type 'number'", "None is not of type 'string'"] | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"record, configured_catalog, valid", | ||
[ | ||
# Send null data | ||
({"a": None}, {"type": "object", "properties": {"a": {"type": "string", "format": "time"}}}, False), | ||
# time | ||
({"a": "sdf"}, {"type": "object", "properties": {"a": {"type": "string", "format": "time"}}}, False), | ||
({"a": "12:00"}, {"type": "object", "properties": {"a": {"type": "string", "format": "time"}}}, False), | ||
({"a": "12:00:90"}, {"type": "object", "properties": {"a": {"type": "string", "format": "time"}}}, False), | ||
({"a": "12:00:22"}, {"type": "object", "properties": {"a": {"type": "string", "format": "time"}}}, True), | ||
# date | ||
({"a": "12:00:90"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date"}}}, False), | ||
({"a": "2020-12-20"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date"}}}, True), | ||
({"a": "2020-20-20"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date"}}}, False), | ||
# date-time | ||
# full date-time format with timezone only valid, according to https://datatracker.ietf.org/doc/html/rfc3339#section-5.6 | ||
({"a": "12:11:00"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, False), | ||
({"a": "2018-11-13 20:20:39"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, True), | ||
({"a": "2021-08-10T12:43:15"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, True), | ||
({"a": "2021-08-10T12:43:15Z"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, True), | ||
({"a": "2018-11-13T20:20:39+00:00"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, True), | ||
({"a": "2018-21-13T20:20:39+00:00"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, False), | ||
# This is valid for postgres sql but not valid for bigquery | ||
({"a": "2014-09-27 9:35z"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, False), | ||
# Seconds are obligatory for bigquery timestamp | ||
({"a": "2014-09-27 9:35"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, False), | ||
({"a": "2014-09-27 9:35:0z"}, {"type": "object", "properties": {"a": {"type": "string", "format": "date-time"}}}, True), | ||
({"a": "2018-11-13 20:20:39"}, {"type": "object", "properties": {"a": {"type": "string", "format": "email"}}}, False), | ||
({"a": "[email protected]"}, {"type": "object", "properties": {"a": {"type": "string", "format": "email"}}}, True), | ||
({"a": "Пример@example.com"}, {"type": "object", "properties": {"a": {"type": "string", "format": "email"}}}, True), | ||
({"a": "写电子邮件@子邮件"}, {"type": "object", "properties": {"a": {"type": "string", "format": "email"}}}, True), | ||
# hostname | ||
({"a": "2018-11-13 20:20:39"}, {"type": "object", "properties": {"a": {"type": "string", "format": "hostname"}}}, False), | ||
({"a": "[email protected]"}, {"type": "object", "properties": {"a": {"type": "string", "format": "hostname"}}}, False), | ||
({"a": "localhost"}, {"type": "object", "properties": {"a": {"type": "string", "format": "hostname"}}}, True), | ||
({"a": "example.com"}, {"type": "object", "properties": {"a": {"type": "string", "format": "hostname"}}}, True), | ||
# ipv4 | ||
({"a": "example.com"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv4"}}}, False), | ||
({"a": "0.0.0.1000"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv4"}}}, False), | ||
({"a": "0.0.0.0"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv4"}}}, True), | ||
# ipv6 | ||
({"a": "example.com"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv6"}}}, False), | ||
({"a": "1080:0:0:0:8:800:200C:417A"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv6"}}}, True), | ||
({"a": "::1"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv6"}}}, True), | ||
({"a": "::"}, {"type": "object", "properties": {"a": {"type": "string", "format": "ipv6"}}}, True), | ||
], | ||
indirect=["configured_catalog"], | ||
) | ||
def test_validate_records_format(record, configured_catalog, valid): | ||
records = [AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0)] | ||
streams_with_errors = verify_records_schema(records, configured_catalog) | ||
if valid: | ||
assert not streams_with_errors | ||
else: | ||
assert streams_with_errors, f"Record {record} should produce errors against {configured_catalog.streams[0].stream.json_schema}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters