Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Allow Singer schemas to include the required and enum fields #917

Merged
136 changes: 136 additions & 0 deletions singer_sdk/helpers/_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# pylint: disable=redefined-builtin, too-many-arguments, invalid-name
Jack-Burnett marked this conversation as resolved.
Show resolved Hide resolved
"""Provides an object model for JSON Schema."""

import json

from singer import Schema

# JSON Schema keywords that are copied verbatim between the raw dict form and
# the object model. The order of this list is the order in which the keys are
# emitted when a schema is serialised back to a dict.
STANDARD_KEYS = [
    # Annotations.
    "title", "description",
    # Numeric constraints.
    "minimum", "maximum", "exclusiveMinimum", "exclusiveMaximum", "multipleOf",
    # String constraints.
    "maxLength", "minLength", "format",
    # Generic validation keywords.
    "type", "required", "enum",
    # The remaining keywords are NOT simple: their values can themselves
    # contain schemas. They are currently passed through untouched; dedicated
    # handling could be added for them later.
    "additionalProperties", "anyOf", "patternProperties",
]


class SchemaPlus(Schema):  # pylint: disable=too-many-instance-attributes
    """Object model for JSON Schema.

    Tap and Target authors may find this to be more convenient than
    working directly with JSON Schema data structures.

    This is based on, and overwrites
    https://github.com/transferwise/pipelinewise-singer-python/blob/master/singer/schema.py.
    This is because we wanted to expand it with extra STANDARD_KEYS.

    ``properties`` (a mapping) and ``items`` are the only keys handled
    recursively: ``to_dict`` calls ``to_dict()`` on their contents and
    ``from_dict`` builds nested schema objects for them. Every key listed in
    ``STANDARD_KEYS`` is stored and emitted as-is.
    """

    # pylint: disable=too-many-locals
    def __init__(
        self,
        type=None,
        format=None,
        properties=None,
        items=None,
        description=None,
        minimum=None,
        maximum=None,
        exclusiveMinimum=None,
        exclusiveMaximum=None,
        multipleOf=None,
        maxLength=None,
        minLength=None,
        additionalProperties=None,
        anyOf=None,
        patternProperties=None,
        required=None,
        enum=None,
        title=None,
    ):
        """Creates a SchemaPlus with the given json-schema keys.

        Each argument is stored unchanged on the attribute of the same name;
        ``None`` means the key is absent from the schema. The base-class
        initializer is not called: every attribute is assigned here.
        """
        self.type = type
        self.properties = properties
        self.items = items
        self.description = description
        self.minimum = minimum
        self.maximum = maximum
        self.exclusiveMinimum = exclusiveMinimum
        self.exclusiveMaximum = exclusiveMaximum
        self.multipleOf = multipleOf
        self.maxLength = maxLength
        self.minLength = minLength
        self.anyOf = anyOf
        self.format = format
        self.additionalProperties = additionalProperties
        self.patternProperties = patternProperties
        self.required = required
        self.enum = enum
        self.title = title

    def __str__(self):
        """Return the schema serialised as a JSON string."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        """Return a constructor-style representation listing every attribute."""
        args = ", ".join(f"{k}={v!r}" for k, v in self.__dict__.items())
        # Use the runtime class name so subclasses are reported correctly.
        return f"{type(self).__name__}({args})"

    def __eq__(self, other):
        """Compare two schemas by their instance attributes.

        ``NotImplemented`` is returned for anything that is not a
        ``SchemaPlus`` so that Python falls back to the other operand's
        comparison (or identity) instead of this method raising
        ``AttributeError`` on objects without a ``__dict__`` such as ``None``.
        """
        if not isinstance(other, SchemaPlus):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def to_dict(self):
        """Return the raw JSON Schema as a (possibly nested) dict.

        Keys whose attribute is ``None`` are omitted. ``properties`` and
        ``items`` come first, followed by the standard keys in the order
        given by ``STANDARD_KEYS``.
        """
        result = {}

        if self.properties is not None:
            result["properties"] = {
                k: v.to_dict()
                for k, v in self.properties.items()  # pylint: disable=no-member
            }

        if self.items is not None:
            result["items"] = self.items.to_dict()  # pylint: disable=no-member

        for key in STANDARD_KEYS:
            value = self.__dict__.get(key)
            if value is not None:
                result[key] = value

        return result

    @classmethod
    def from_dict(cls, data, **schema_defaults):
        """Initialize a Schema object based on the JSON Schema structure.

        :param data: The JSON Schema as a dict. Keys other than ``properties``,
            ``items`` and those in ``STANDARD_KEYS`` are ignored.
        :param schema_defaults: The default values to the Schema constructor.
            They are applied at every nesting level and are overridden by
            keys present in ``data``.
        :return: An instance of ``cls`` mirroring ``data``.
        """
        kwargs = schema_defaults.copy()
        properties = data.get("properties")
        items = data.get("items")

        # Build nested schemas through ``cls`` so that subclasses round-trip
        # to their own type rather than to the base SchemaPlus.
        if properties is not None:
            kwargs["properties"] = {
                k: cls.from_dict(v, **schema_defaults) for k, v in properties.items()
            }
        if items is not None:
            kwargs["items"] = cls.from_dict(items, **schema_defaults)
        for key in STANDARD_KEYS:
            if key in data:
                kwargs[key] = data[key]
        return cls(**kwargs)
7 changes: 4 additions & 3 deletions singer_sdk/helpers/_singer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

from singer.catalog import Catalog as BaseCatalog
from singer.catalog import CatalogEntry as BaseCatalogEntry
from singer.schema import Schema

from singer_sdk.helpers._schema import SchemaPlus

Breadcrumb = Tuple[str, ...]

Expand Down Expand Up @@ -210,7 +211,7 @@ class CatalogEntry(BaseCatalogEntry):

tap_stream_id: str
metadata: MetadataMapping
schema: Schema
schema: SchemaPlus
stream: Optional[str] = None
key_properties: Optional[List[str]] = None
replication_key: Optional[str] = None
Expand All @@ -231,7 +232,7 @@ def from_dict(cls, stream: Dict[str, Any]):
key_properties=stream.get("key_properties"),
database=stream.get("database_name"),
table=stream.get("table_name"),
schema=Schema.from_dict(stream.get("schema", {})),
schema=SchemaPlus.from_dict(stream.get("schema", {})),
is_view=stream.get("is_view"),
stream_alias=stream.get("stream_alias"),
metadata=MetadataMapping.from_iterable(stream.get("metadata", [])),
Expand Down
6 changes: 3 additions & 3 deletions singer_sdk/streams/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@
import pendulum
import requests
import singer
from singer import RecordMessage, SchemaMessage, StateMessage
from singer.schema import Schema
from singer import RecordMessage, Schema, SchemaMessage, StateMessage

from singer_sdk.exceptions import InvalidStreamSortException, MaxRecordsLimitException
from singer_sdk.helpers._catalog import pop_deselected_record_properties
from singer_sdk.helpers._compat import final
from singer_sdk.helpers._flattening import get_flattening_options
from singer_sdk.helpers._schema import SchemaPlus
from singer_sdk.helpers._singer import (
Catalog,
CatalogEntry,
Expand Down Expand Up @@ -502,7 +502,7 @@ def _singer_catalog_entry(self) -> CatalogEntry:
return CatalogEntry(
tap_stream_id=self.tap_stream_id,
stream=self.name,
schema=Schema.from_dict(self.schema),
schema=SchemaPlus.from_dict(self.schema),
metadata=self.metadata,
key_properties=self.primary_keys or [],
replication_key=self.replication_key,
Expand Down
4 changes: 2 additions & 2 deletions singer_sdk/streams/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union, cast

import singer
import sqlalchemy
from sqlalchemy.engine import Engine
from sqlalchemy.engine.reflection import Inspector

from singer_sdk import typing as th
from singer_sdk.exceptions import ConfigValidationError
from singer_sdk.helpers._schema import SchemaPlus
from singer_sdk.helpers._singer import CatalogEntry, MetadataMapping
from singer_sdk.plugin_base import PluginBase as TapBaseClass
from singer_sdk.streams.core import Stream
Expand Down Expand Up @@ -401,7 +401,7 @@ def discover_catalog_entry(
stream=unique_stream_id,
table=table_name,
key_properties=key_properties,
schema=singer.Schema.from_dict(schema),
schema=SchemaPlus.from_dict(schema),
is_view=is_view,
replication_method=replication_method,
metadata=MetadataMapping.get_standard_metadata(
Expand Down
3 changes: 2 additions & 1 deletion tests/core/test_catalog_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
get_selected_schema,
pop_deselected_record_properties,
)
from singer_sdk.helpers._schema import SchemaPlus
from singer_sdk.typing import ObjectType, PropertiesList, Property, StringType


Expand Down Expand Up @@ -151,7 +152,7 @@ def catalog_entry_obj(schema, stream_name, selection_metadata) -> singer.Catalog
return singer.CatalogEntry(
tap_stream_id=stream_name,
stream=stream_name,
schema=singer.Schema.from_dict(schema),
schema=SchemaPlus.from_dict(schema),
metadata=singer.MetadataMapping.from_iterable(selection_metadata),
)

Expand Down
66 changes: 66 additions & 0 deletions tests/core/test_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
Testing that SchemaPlus can convert schemas losslessly to and from dicts.

Schemas are taken from these examples: https://json-schema.org/learn/miscellaneous-examples.html

NOTE: The following properties are not currently supported:
pattern
unevaluatedProperties
propertyNames
minProperties
maxProperties
prefixItems
contains
minContains
maxContains
minItems
maxItems
uniqueItems
const
contentMediaType
contentEncoding
allOf
oneOf
not

Some of these could be trivially added (if they are simple keys that belong in STANDARD_KEYS).
Some might need more thinking if they can contain schemas (though, note that we also treat 'additionalProperties',
'anyOf' and 'patternProperties' as simple keys even though they can contain schemas).
"""

from singer_sdk.helpers._schema import SchemaPlus


def test_simple_schema():
    """Round-trip a flat object schema and inspect the parsed attributes."""
    latitude_schema = {"type": "number", "minimum": -90, "maximum": 90}
    longitude_schema = {"type": "number", "minimum": -180, "maximum": 180}
    coordinate_schema = {
        "title": "Longitude and Latitude Values",
        "description": "A geographical coordinate.",
        "required": ["latitude", "longitude"],
        "type": "object",
        "properties": {
            "latitude": latitude_schema,
            "longitude": longitude_schema,
        },
    }

    parsed = SchemaPlus.from_dict(coordinate_schema)

    # Serialising back must reproduce the input exactly.
    assert parsed.to_dict() == coordinate_schema
    assert parsed.required == ["latitude", "longitude"]

    # Nested properties are parsed into schema objects as well.
    parsed_latitude = parsed.properties["latitude"]
    assert isinstance(parsed_latitude, SchemaPlus)
    assert parsed_latitude.type == "number"


def test_schema_with_items():
    """Round-trip a schema whose property is an array with an ``items`` schema."""
    fruits_schema = {"type": "array", "items": {"type": "string"}}
    entity_schema = {
        "description": "A representation of a person, company, organization, or place",
        "type": "object",
        "properties": {"fruits": fruits_schema},
    }

    parsed = SchemaPlus.from_dict(entity_schema)

    # Serialising back must reproduce the input exactly.
    assert parsed.to_dict() == entity_schema

    # Both the array property and its element schema become schema objects.
    parsed_fruits = parsed.properties["fruits"]
    assert isinstance(parsed_fruits, SchemaPlus)
    element_schema = parsed_fruits.items
    assert isinstance(element_schema, SchemaPlus)
    assert element_schema.type == "string"