From 1bdaf81f4b76484972165597115ecf14e97389b2 Mon Sep 17 00:00:00 2001 From: Alex Liao Date: Thu, 4 May 2023 03:47:18 -0400 Subject: [PATCH 1/2] Updates to code --- .gitignore | 163 ++++++ bq_schema_generator/__init__.py | 7 +- bq_schema_generator/schema_generator.py | 203 +++++--- bq_schema_generator/utils.py | 27 - setup.py | 8 + tests/test_schema_generator.py | 632 ++++++++++++++++-------- 6 files changed, 736 insertions(+), 304 deletions(-) create mode 100644 .gitignore delete mode 100644 bq_schema_generator/utils.py create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dc2adbc --- /dev/null +++ b/.gitignore @@ -0,0 +1,163 @@ +.DS_Store + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/bq_schema_generator/__init__.py b/bq_schema_generator/__init__.py index c1eb5de..5215291 100644 --- a/bq_schema_generator/__init__.py +++ b/bq_schema_generator/__init__.py @@ -1,10 +1,9 @@ - -from typing import List, Dict +from typing import Any from bq_schema_generator.schema_generator import SchemaGenerator -def batch_to_bq_schema(batch: List[List[Dict]]) -> List[Dict]: - """ Takes a batch of records retrieved from an API call and converts it to a list of dictionaries which can be +def batch_to_bq_schema(batch: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Takes a batch of records retrieved from an API call and converts it to a list of dictionaries which can be consumed by the Big Query API as a schema""" schema_generator = SchemaGenerator() schema_generator.update_schema_columns(batch) diff --git a/bq_schema_generator/schema_generator.py b/bq_schema_generator/schema_generator.py index da6c3a0..945359a 100644 --- a/bq_schema_generator/schema_generator.py +++ b/bq_schema_generator/schema_generator.py @@ -1,27 +1,66 @@ - -from typing import List, Dict, Union -from .utils import get_dict_value +from dataclasses import dataclass +from typing import Any, Dict, Optional, Union, Literal import logging -type_ = { +AllowedTypes = Union[dict, bool, float, int, str, list] +AllowedBQDataType = Literal["BOOLEAN", "INT", "FLOAT", "STRING", "RECORD", "NULL"] +BQMode = Literal["NULLABLE", "REQUIRED", "REPEATED"] + +INT_MODE_TYPE_TO_BQ_DATA_TYPE: dict[type, AllowedBQDataType] = { dict: "RECORD", bool: "BOOLEAN", float: "FLOAT", - int: "FLOAT", + int: "INT", str: "STRING", - list: None, - type(None): "NULL" + type(None): "NULL", +} +TYPE_TO_BQ_DATA_TYPE: dict[type, AllowedBQDataType] = { + **INT_MODE_TYPE_TO_BQ_DATA_TYPE, + int: "FLOAT", } # left side is loose, right side is precise/correct # i.e. right side can overwrite left side -type_hierarchy = { +INT_MODE_TYPE_HIERARCHY: dict[AllowedBQDataType, list[AllowedBQDataType]] = { "BOOLEAN": ["STRING", "RECORD"], + "INT": ["FLOAT", "STRING", "RECORD"], "FLOAT": ["STRING", "RECORD"], "STRING": ["RECORD"], - "RECORD": [] + "RECORD": [], } +TYPE_HIERARCHY = { + bq_data_type: hierarchy + for bq_data_type, hierarchy in INT_MODE_TYPE_HIERARCHY.items() + if bq_data_type != "INT" +} + + +@dataclass +class ElementSchema: + name: str + type: AllowedBQDataType + mode: BQMode + fields: Optional[tuple["ElementSchema"]] = None + + def is_dict(self) -> bool: + return self.type == "RECORD" and self.mode == "NULLABLE" + + def is_list(self) -> bool: + return self.mode == "REPEATED" + + def to_bq_dict(self) -> dict[str, Any]: + d: dict[str, Any] = { + "name": self.name, + "type": self.type, + "mode": self.mode, + } + + if self.fields: + d["fields"] = [f.to_bq_dict() for f in self.fields] + + return d + class SchemaGenerator(object): """ @@ -32,35 +71,50 @@ class SchemaGenerator(object): Legacy issue: all numbers are cast as float. """ - def __init__(self, - default_column_types: Dict = None): - self.schema_columns_dict = {} - self.bq_consumable_schema = [] - self.default_column_types = default_column_types if default_column_types is not None else {} + def __init__( + self, + default_column_types: dict[str, AllowedBQDataType] = {}, + use_int_in_hierarchy: bool = False, + ): + self.schema_columns_dict: dict[tuple[str, ...], ElementSchema] = {} + self.bq_consumable_schema: list[ElementSchema] = [] + self.default_column_types = default_column_types + self.type_hierarchy = ( + INT_MODE_TYPE_HIERARCHY if use_int_in_hierarchy else TYPE_HIERARCHY + ) + self.type_to_bq_data_type = ( + INT_MODE_TYPE_TO_BQ_DATA_TYPE + if use_int_in_hierarchy + else TYPE_TO_BQ_DATA_TYPE + ) - def _get_type(self, x: Union[dict, bool, float, int, str, list, type(None)]) -> Union[str, None]: + def _get_type(self, x: AllowedTypes) -> AllowedBQDataType: if isinstance(x, list): if len(x) > 0: - return type_[type(x[0])] + return self.type_to_bq_data_type[type(x[0])] else: - return None + raise Exception("Empty list") else: - return type_[type(x)] + return self.type_to_bq_data_type[type(x)] - def _get_mode(self, x: Union[dict, bool, float, int, str, list, type(None)]) -> Union[str, None]: + def _get_mode(self, x: AllowedTypes) -> BQMode: if isinstance(x, list): if len(x) > 0: return "REPEATED" else: - return None + raise Exception("Empty list") else: return "NULLABLE" - def _get_mode_type(self, x: Union[dict, bool, float, int, str, list, type(None)]) -> Dict: + def _get_mode_type(self, x: AllowedTypes) -> Dict: return {"mode": self._get_mode(x), "type": self._get_type(x)} - def _get_element_schema(self, elem_key: str, elem_value: Union[dict, bool, float, int, str, list, type(None)], - parent_elem_name: str) -> dict: + def _get_element_schema( + self, + elem_key: str, + elem_value: AllowedTypes, + parent_elem_name: Optional[tuple[str, ...]] = None, + ) -> Optional[ElementSchema]: """recursively updates a dictionary by checking an element's value for its mode and type, as well as updating the element's key name by appending it to its parents' with a separator' """ @@ -68,21 +122,37 @@ def _get_element_schema(self, elem_key: str, elem_value: Union[dict, bool, float if elem_value is None or elem_value == [] or elem_value == {}: return - # generate the element's name by joining it to its parents', separated by '.' character - elem_name = parent_elem_name+'.'+elem_key if parent_elem_name != '' else elem_key + # generate the element's name as a tuple of its parents' names and its own name + elem_name = ( + parent_elem_name + (elem_key,) + if parent_elem_name is not None + else (elem_key,) + ) # generate the element_schema dict by running get_mode_type on the elem_value, # and adding the elem_key as 'NAME' - element_schema = self._get_mode_type(elem_value) - element_schema["name"] = elem_key + element_schema = ElementSchema( + name=elem_key, + type=self._get_type(elem_value), + mode=self._get_mode(elem_value), + ) + + # if is dict or list[dict], recurse + if element_schema.type == "RECORD": + if element_schema.is_dict(): + # Just to make typechecker happy + if type(elem_value) != dict: + raise Exception("Should be dict") - # if is dict/list, do recursion - if element_schema["type"] == "RECORD": - if element_schema["mode"] == "NULLABLE": # is dict for element_key, element_value in elem_value.items(): self._get_element_schema(element_key, element_value, elem_name) - else: # is list + elif element_schema.is_list(): # is list of dict + if type(elem_value) != list: + raise Exception("Should be list") + for element in elem_value: + if type(element) != dict: + raise Exception("Should be dict") # whether its primitive or record, just recurse as the field will be overwritten in the schema for e_k, e_v in element.items(): self._get_element_schema(e_k, e_v, elem_name) @@ -92,33 +162,43 @@ def _get_element_schema(self, elem_key: str, elem_value: Union[dict, bool, float # update the schema_columns_dict with the element_schema dict and return it self._update_schema_columns_dict(elem_name, element_schema) - def _update_schema_columns_dict(self, elem_name: str, element_schema: dict): - """ Precision based datatype hiearchy + def _update_schema_columns_dict( + self, elem_name: tuple[str, ...], element_schema: ElementSchema + ): + """Precision based datatype hiearchy Determines whether to use the incoming record's schema or re-use current one Looser types cannot be replaced by more precise types e.g. null/none > string > float > int """ - incoming_type = element_schema["type"] - curr_type = get_dict_value(self.schema_columns_dict, elem_name+".type").value - if incoming_type in type_hierarchy: - if get_dict_value(self.schema_columns_dict, elem_name+".type").value is None: + incoming_type = element_schema.type + curr_type = ( + self.schema_columns_dict[elem_name].type + if elem_name in self.schema_columns_dict + else None + ) + if incoming_type in self.type_hierarchy: + if curr_type is None: self.schema_columns_dict[elem_name] = element_schema - elif incoming_type in type_hierarchy[curr_type]: + elif incoming_type in self.type_hierarchy[curr_type]: self.schema_columns_dict[elem_name] = element_schema else: pass # do nth else: - raise Exception(f"Unknown datatype: {incoming_type}, for column: {elem_name}, \ - found when parsing schema of records... terminating...") + raise Exception( + f"Unknown datatype: {incoming_type}, for column: {elem_name}, \ + found when parsing schema of records... terminating..." + ) - def _get_record_schema(self, record: List[Dict]) -> None: + def _get_record_schema(self, record: dict[str, Any]) -> None: """Iterates through a list of dictionaries, checking each key-value pair for the value type, and updating self.schema_columns_dict with the relevant information""" for key, value in record.items(): - self._get_element_schema(key, value, '') + self._get_element_schema(key, value) - def _construct_nesting_dict(self, schema_columns_dict: dict) -> dict: + def _construct_nesting_dict( + self, schema_columns_dict: dict[tuple[str, ...], ElementSchema] + ) -> dict[int, list[tuple[str, ...]]]: """Iterates through a schema dictionary checking how many levels each schema column is nested (based on the number of separators in the schema column name, eg 'k_1.kk_2' = 2) Stores this information in a dictionary (nesting_dict) with 'key' = no. of levels, and 'value' = a list of @@ -132,15 +212,13 @@ def _construct_nesting_dict(self, schema_columns_dict: dict) -> dict: return nesting_dict for name in schema_columns_dict: - elems = name.split(".") - if len(elems) in nesting_dict: - nesting_dict[len(elems)].append(name) - else: - nesting_dict[len(elems)] = [name] + nesting_dict.setdefault(len(name), []).append(name) return nesting_dict - def _construct_bq_schema(self, schema_columns_dict: dict) -> None: + def _construct_bq_schema( + self, schema_columns_dict: dict[tuple[str, ...], ElementSchema] + ) -> None: """Takes a schema_columns_dict object and constructs a list of dictionaries which can be consumed by the Big Query API as a schema """ @@ -153,16 +231,17 @@ def _construct_bq_schema(self, schema_columns_dict: dict) -> None: # iterate over the schema keys of that nesting level for schema_key in nesting_dict[nest_level]: # A column of type "RECORD" must have fields, ie any empty dictionaries are removed - if schema_columns_dict[schema_key]["type"] == "RECORD": - if "fields" not in schema_columns_dict[schema_key]: + if schema_columns_dict[schema_key].type == "RECORD": + if schema_columns_dict[schema_key].fields is None: schema_columns_dict.pop(schema_key) continue - parent = ".".join(schema_key.split(".")[:-1]) + parent_key = schema_key[:-1] + parent = schema_columns_dict[parent_key] # Update the parent column's fields with the present column - if "fields" in schema_columns_dict[parent]: - schema_columns_dict[parent]["fields"].append(schema_columns_dict[schema_key]) - else: - schema_columns_dict[parent]["fields"] = [(schema_columns_dict[schema_key])] + if parent.fields is None: + parent.fields = tuple() + + parent.fields = parent.fields + (schema_columns_dict[schema_key],) # Pop the present column from the dictionary schema_columns_dict.pop(schema_key) @@ -171,21 +250,21 @@ def _construct_bq_schema(self, schema_columns_dict: dict) -> None: # Append these to the final_schema list for col_name, schema_column in schema_columns_dict.items(): # check if 'schema_column' is a "RECORD" type without "fields", skip if so - if schema_column["type"] == "RECORD": - if "fields" not in schema_column: + if schema_column.type == "RECORD": + if schema_column.fields is None: continue # check if a default type has been specified for this if col_name in self.default_column_types: - schema_column["type"] = self.default_column_types[col_name] + schema_column.type = self.default_column_types[col_name] self.bq_consumable_schema.append(schema_column) - def update_schema_columns(self, batch: List[List[Dict]]) -> None: + def update_schema_columns(self, batch: list[dict[str, Any]]) -> None: """Iterates through the records of a batch and updates self.schema_columns_dict with relevant information required to construct a Big Query schema""" for record in batch: self._get_record_schema(record) - def get_bq_schema(self): + def get_bq_schema(self) -> list[dict[str, Any]]: self._construct_bq_schema(self.schema_columns_dict) - return self.bq_consumable_schema + return [item.to_bq_dict() for item in self.bq_consumable_schema] diff --git a/bq_schema_generator/utils.py b/bq_schema_generator/utils.py deleted file mode 100644 index 10a8789..0000000 --- a/bq_schema_generator/utils.py +++ /dev/null @@ -1,27 +0,0 @@ - -from types import SimpleNamespace - - -def _get_dict_value(_obj, _fields): - if _obj is None: - return SimpleNamespace(exist=False, value=None, error=None) - if not isinstance(_obj, dict): - return SimpleNamespace(exist=False, value=None, error="not_a_dict") - - if len(_fields) == 1: - exist = _fields[0] in _obj - value = _obj.get(_fields[0]) - return SimpleNamespace(exist=exist, value=value, error=None) - else: - return _get_dict_value(_obj.get(_fields[0]), _fields[1:]) - - -def get_dict_value(obj, field): - """Get a dict value from obj - - field can be in the form "listing.price" - - return SimpleNamespace which contains exist, value and error - """ - fields = field.split(".") - return _get_dict_value(obj, fields) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f392dfe --- /dev/null +++ b/setup.py @@ -0,0 +1,8 @@ +from setuptools import setup, find_packages + +setup( + name="bq_schema_generator", + packages=find_packages(), + version="0.1", + license="MIT", +) diff --git a/tests/test_schema_generator.py b/tests/test_schema_generator.py index 3239d82..e92629c 100644 --- a/tests/test_schema_generator.py +++ b/tests/test_schema_generator.py @@ -1,141 +1,270 @@ import pytest import json -from bq_schema_generator.schema_generator import SchemaGenerator +from bq_schema_generator.schema_generator import ElementSchema, SchemaGenerator from bq_schema_generator import batch_to_bq_schema schema_generator = SchemaGenerator() -@pytest.mark.parametrize("value, expected_result", [("s_1", "STRING"), - ("345", "STRING")]) + +@pytest.mark.parametrize( + "value, expected_result", [("s_1", "STRING"), ("345", "STRING")] +) def test_get_type_string(value, expected_result): assert schema_generator._get_type(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [(True, "BOOLEAN"), - (False, "BOOLEAN")]) + +@pytest.mark.parametrize( + "value, expected_result", [(True, "BOOLEAN"), (False, "BOOLEAN")] +) def test_get_type_boolean(value, expected_result): assert schema_generator._get_type(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [(3.1234, "FLOAT"), - (-0.999, "FLOAT"), - (1.0, "FLOAT")]) + +@pytest.mark.parametrize( + "value, expected_result", [(3.1234, "FLOAT"), (-0.999, "FLOAT"), (1.0, "FLOAT")] +) def test_get_type_float(value, expected_result): assert schema_generator._get_type(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [(3, "FLOAT"), - (-1, "FLOAT"), - (0, "FLOAT"), ]) + +@pytest.mark.parametrize( + "value, expected_result", + [ + (3, "FLOAT"), + (-1, "FLOAT"), + (0, "FLOAT"), + ], +) def test_get_type_integer(value, expected_result): assert schema_generator._get_type(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [({}, "RECORD"), - ({"x_1": 123}, "RECORD"), - ({"x_2": "y_1", "x_3": "y_2"}, "RECORD")]) + +@pytest.mark.parametrize( + "value, expected_result", + [ + ({}, "RECORD"), + ({"x_1": 123}, "RECORD"), + ({"x_2": "y_1", "x_3": "y_2"}, "RECORD"), + ], +) def test_get_type_record(value, expected_result): assert schema_generator._get_type(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [([], None), - ([1, 2, 3], "FLOAT"), - (["a", 1, 2], "STRING")]) + +@pytest.mark.parametrize( + "value, expected_result", + [([1, 2, 3], "FLOAT"), (["a", 1, 2], "STRING")], +) def test_get_type_list(value, expected_result): assert schema_generator._get_type(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [(123, "NULLABLE"), ('abc', "NULLABLE"), (None, "NULLABLE"), - ([], None), - ([[]], "REPEATED"), ([1, 2, 3], "REPEATED"), ([1.43, 0, None, True], - "REPEATED")]) + +@pytest.mark.parametrize( + "value, expected_result", + [ + (123, "NULLABLE"), + ("abc", "NULLABLE"), + (None, "NULLABLE"), + ([[]], "REPEATED"), + ([1, 2, 3], "REPEATED"), + ([1.43, 0, None, True], "REPEATED"), + ], +) def test_get_mode(value, expected_result): assert schema_generator._get_mode(value) == expected_result -@pytest.mark.parametrize("value, expected_result", [("t_1", {"mode": "NULLABLE", "type": "STRING"}), - (-0.000, {"mode": "NULLABLE", "type": "FLOAT"}), - (True, {"mode": "NULLABLE", "type": "BOOLEAN"}), - ([1, 2, 3], {"mode": "REPEATED", "type": "FLOAT"}), - ([{"x_1": "y_1", "x_2": "y_2"}, {"x_3": "y_3", "x_4": "y_4"}], {"mode": "REPEATED", "type": "RECORD"}), - ({"x_2": "y_1", "x_3": "y_2"}, {"mode": "NULLABLE", "type": "RECORD"}), - ({}, {"mode": "NULLABLE", "type": "RECORD"}), - ([], {"mode": None, "type": None}), ]) + +@pytest.mark.parametrize( + "value, expected_result", + [ + ("t_1", {"mode": "NULLABLE", "type": "STRING"}), + (-0.000, {"mode": "NULLABLE", "type": "FLOAT"}), + (True, {"mode": "NULLABLE", "type": "BOOLEAN"}), + ([1, 2, 3], {"mode": "REPEATED", "type": "FLOAT"}), + ( + [{"x_1": "y_1", "x_2": "y_2"}, {"x_3": "y_3", "x_4": "y_4"}], + {"mode": "REPEATED", "type": "RECORD"}, + ), + ({"x_2": "y_1", "x_3": "y_2"}, {"mode": "NULLABLE", "type": "RECORD"}), + ({}, {"mode": "NULLABLE", "type": "RECORD"}), + ], +) def test_get_mode_type(value, expected_result): assert schema_generator._get_mode_type(value) == expected_result + """ TEST get_element_schema """ - value_schema_dict_1 = {} -value_elem_key_1 = 'k_1' +value_elem_key_1 = "k_1" value_elem_value_1 = 1000 -value_parents_1 = '' -expected_r_element_schema_1 = {"k_1": {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}} - -value_schema_dict_2 = {'p_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'p_1'}} -value_elem_key_2 = 'k_1' +value_parents_1 = None +expected_r_element_schema_1 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT") +} + +value_schema_dict_2 = { + ("p_1",): ElementSchema(name="p_1", mode="NULLABLE", type="FLOAT") +} +value_elem_key_2 = "k_1" value_elem_value_2 = True -value_parents_2 = 'p_1' -expected_r_element_schema_2 = {'p_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'p_1'}, - 'p_1.k_1': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'k_1'}} +value_parents_2 = ("p_1",) +expected_r_element_schema_2 = { + ("p_1",): ElementSchema(name="p_1", mode="NULLABLE", type="FLOAT"), + ( + "p_1", + "k_1", + ): ElementSchema(name="k_1", mode="NULLABLE", type="BOOLEAN"), +} value_schema_dict_3 = {} -value_elem_key_3 = 'k_1' -value_elem_value_3 = {'kk_1': 'TEST', 'kk_2': 1000, 'kk_3': [1, 2, 3, 4], 'kk_4': [{'kkk_1': True}, {'kkk_2': False}]} -value_parents_3 = '' -expected_r_element_schema_3 = {'k_1': {'mode': 'NULLABLE', 'type': 'RECORD', 'name': 'k_1'}, - 'k_1.kk_1': {'mode': 'NULLABLE', 'type': 'STRING', 'name': 'kk_1'}, - 'k_1.kk_2': {'mode': 'NULLABLE', 'type': 'FLOAT', 'name': 'kk_2'}, - 'k_1.kk_3': {'mode': 'REPEATED', 'type': 'FLOAT', 'name': 'kk_3'}, - 'k_1.kk_4': {'mode': 'REPEATED', 'type': 'RECORD', 'name': 'kk_4'}, - 'k_1.kk_4.kkk_1': {'mode': 'NULLABLE', 'type': 'BOOLEAN', 'name': 'kkk_1'}, - 'k_1.kk_4.kkk_2': {'mode': 'NULLABLE', 'type': 'BOOLEAN', 'name': 'kkk_2'} - } +value_elem_key_3 = "k_1" +value_elem_value_3 = { + "kk_1": "TEST", + "kk_2": 1000, + "kk_3": [1, 2, 3, 4], + "kk_4": [{"kkk_1": True}, {"kkk_2": False}], +} +value_parents_3 = None +expected_r_element_schema_3 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="RECORD"), + ( + "k_1", + "kk_1", + ): ElementSchema(name="kk_1", mode="NULLABLE", type="STRING"), + ( + "k_1", + "kk_2", + ): ElementSchema(name="kk_2", mode="NULLABLE", type="FLOAT"), + ( + "k_1", + "kk_3", + ): ElementSchema(name="kk_3", mode="REPEATED", type="FLOAT"), + ( + "k_1", + "kk_4", + ): ElementSchema(name="kk_4", mode="REPEATED", type="RECORD"), + ( + "k_1", + "kk_4", + "kkk_1", + ): ElementSchema(name="kkk_1", mode="NULLABLE", type="BOOLEAN"), + ( + "k_1", + "kk_4", + "kkk_2", + ): ElementSchema(name="kkk_2", mode="NULLABLE", type="BOOLEAN"), +} value_schema_dict_4 = {} -value_elem_key_4 = 'k_1' -value_elem_value_4 = {'kk_1': [{'kkk_1': True}, - {'kkk_2': False}, - {'kkk_3': [ - {'kkkk_1': 10}, - {'kkkk_2': 5} - ]} - ]} -value_parents_4 = '' -expected_r_element_schema_4 = {'k_1': {'mode': 'NULLABLE', 'type': 'RECORD', 'name': 'k_1'}, - 'k_1.kk_1': {'mode': 'REPEATED', 'type': 'RECORD', 'name': 'kk_1'}, - 'k_1.kk_1.kkk_1': {'mode': 'NULLABLE', 'type': 'BOOLEAN', 'name': 'kkk_1'}, - 'k_1.kk_1.kkk_2': {'mode': 'NULLABLE', 'type': 'BOOLEAN', 'name': 'kkk_2'}, - 'k_1.kk_1.kkk_3': {'mode': 'REPEATED', 'type': 'RECORD', 'name': 'kkk_3'}, - 'k_1.kk_1.kkk_3.kkkk_1': {'mode': 'NULLABLE', 'type': 'FLOAT', 'name': 'kkkk_1'}, - 'k_1.kk_1.kkk_3.kkkk_2': {'mode': 'NULLABLE', 'type': 'FLOAT', 'name': 'kkkk_2'}, - } +value_elem_key_4 = "k_1" +value_elem_value_4 = { + "kk_1": [ + {"kkk_1": True}, + {"kkk_2": False}, + {"kkk_3": [{"kkkk_1": 10}, {"kkkk_2": 5}]}, + ] +} +value_parents_4 = None +expected_r_element_schema_4 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="RECORD"), + ( + "k_1", + "kk_1", + ): ElementSchema(name="kk_1", mode="REPEATED", type="RECORD"), + ( + "k_1", + "kk_1", + "kkk_1", + ): ElementSchema(name="kkk_1", mode="NULLABLE", type="BOOLEAN"), + ( + "k_1", + "kk_1", + "kkk_2", + ): ElementSchema(name="kkk_2", mode="NULLABLE", type="BOOLEAN"), + ( + "k_1", + "kk_1", + "kkk_3", + ): ElementSchema(name="kkk_3", mode="REPEATED", type="RECORD"), + ( + "k_1", + "kk_1", + "kkk_3", + "kkkk_1", + ): ElementSchema(name="kkkk_1", mode="NULLABLE", type="FLOAT"), + ( + "k_1", + "kk_1", + "kkk_3", + "kkkk_2", + ): ElementSchema(name="kkkk_2", mode="NULLABLE", type="FLOAT"), +} value_schema_dict_5 = {} -value_elem_key_5 = 'k_1' -value_elem_value_5 = {'kk_1': [], - 'kk_2': {}, - 'kk_3': {'kkk_1': {}, - 'kkk_2': []}} -value_parents_5 = '' -expected_r_element_schema_5 = {'k_1': {'mode': 'NULLABLE', 'name': 'k_1', 'type': 'RECORD'}, - 'k_1.kk_3': {'mode': 'NULLABLE', 'name': 'kk_3', 'type': 'RECORD'}} - -@pytest.mark.parametrize("value_schema_dict, value_elem_k, value_elem_v, value_parents, " - "expected_result", - [(value_schema_dict_1, value_elem_key_1, value_elem_value_1, value_parents_1, - expected_r_element_schema_1), - (value_schema_dict_2, value_elem_key_2, value_elem_value_2, value_parents_2, - expected_r_element_schema_2), - (value_schema_dict_3, value_elem_key_3, value_elem_value_3, value_parents_3, - expected_r_element_schema_3), - (value_schema_dict_4, value_elem_key_4, value_elem_value_4, value_parents_4, - expected_r_element_schema_4), - (value_schema_dict_5, value_elem_key_5, value_elem_value_5, value_parents_5, - expected_r_element_schema_5), - ]) -def test_get_element_schema(value_schema_dict, value_elem_k, value_elem_v, value_parents, expected_result): +value_elem_key_5 = "k_1" +value_elem_value_5 = {"kk_1": [], "kk_2": {}, "kk_3": {"kkk_1": {}, "kkk_2": []}} +value_parents_5 = None +expected_r_element_schema_5 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="RECORD"), + ( + "k_1", + "kk_3", + ): ElementSchema(name="kk_3", mode="NULLABLE", type="RECORD"), +} + + +@pytest.mark.parametrize( + "value_schema_dict, value_elem_k, value_elem_v, value_parents, " "expected_result", + [ + ( + value_schema_dict_1, + value_elem_key_1, + value_elem_value_1, + value_parents_1, + expected_r_element_schema_1, + ), + ( + value_schema_dict_2, + value_elem_key_2, + value_elem_value_2, + value_parents_2, + expected_r_element_schema_2, + ), + ( + value_schema_dict_3, + value_elem_key_3, + value_elem_value_3, + value_parents_3, + expected_r_element_schema_3, + ), + ( + value_schema_dict_4, + value_elem_key_4, + value_elem_value_4, + value_parents_4, + expected_r_element_schema_4, + ), + ( + value_schema_dict_5, + value_elem_key_5, + value_elem_value_5, + value_parents_5, + expected_r_element_schema_5, + ), + ], +) +def test_get_element_schema( + value_schema_dict, value_elem_k, value_elem_v, value_parents, expected_result +): schema_generator = SchemaGenerator() schema_generator.schema_columns_dict = value_schema_dict schema_generator._get_element_schema(value_elem_k, value_elem_v, value_parents) assert schema_generator.schema_columns_dict == expected_result + """ TEST update_schema_columns """ @@ -143,171 +272,252 @@ def test_get_element_schema(value_schema_dict, value_elem_k, value_elem_v, value schema_dict = dict() value_batch_schema_1 = [{"k_1": 10}] -expected_r_batch_schema_1 = {'k_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}} +expected_r_batch_schema_1 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT") +} value_batch_schema_2 = [{"k_1": 10, "k_2": True, "k_3": "test"}, {"k_1": 5, "k_4": 1.4}] -expected_r_batch_schema_2 = {'k_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}, - 'k_2': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'k_2'}, - 'k_3': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'k_3'}, - 'k_4': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_4'}} - -value_batch_schema_3 = [{"k_1": 10, - "k_2": True, - "k_3": { - "kk_1": 45, - "kk_2": "test", - }}, - {"k_1": 5, - "k_2": False, - "k_3": { - "kk_1": 2, - "kk_2": "testing", - "kk_3": False - }}] -expected_r_batch_schema_3 = {'k_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}, - 'k_2': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'k_2'}, - 'k_3': {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_3'}, - 'k_3.kk_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kk_1'}, - 'k_3.kk_2': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_2'}, - 'k_3.kk_3': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'kk_3'} - } - -value_batch_schema_4 = [{"k_1": 10, - "k_2": False, - "k_3": "10b", - "k_4": None, - "k_5": "string", - "k_6": {"kk_3": 10} - }, - {"k_1": "10a", - "k_2": "False", - "k_3": 10, - "k_4": {"kk_1": 2}, - "k_5": {"kk_2": "hello"}, - "k_6": {"kk_3": "10c"} - }] -expected_r_batch_schema_4 = {'k_1': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'k_1'}, - 'k_2': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'k_2'}, - 'k_3': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'k_3'}, - 'k_4': {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_4'}, - 'k_4.kk_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kk_1'}, - 'k_5': {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_5'}, - 'k_5.kk_2': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_2'}, - 'k_6': {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_6'}, - 'k_6.kk_3': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_3'}, - } +expected_r_batch_schema_2 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT"), + ("k_2",): ElementSchema(name="k_2", mode="NULLABLE", type="BOOLEAN"), + ("k_3",): ElementSchema(name="k_3", mode="NULLABLE", type="STRING"), + ("k_4",): ElementSchema(name="k_4", mode="NULLABLE", type="FLOAT"), +} + +value_batch_schema_3 = [ + { + "k_1": 10, + "k_2": True, + "k_3": { + "kk_1": 45, + "kk_2": "test", + }, + }, + {"k_1": 5, "k_2": False, "k_3": {"kk_1": 2, "kk_2": "testing", "kk_3": False}}, +] +expected_r_batch_schema_3 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT"), + ("k_2",): ElementSchema(name="k_2", mode="NULLABLE", type="BOOLEAN"), + ("k_3",): ElementSchema(name="k_3", mode="NULLABLE", type="RECORD"), + ("k_3", "kk_1"): ElementSchema(name="kk_1", mode="NULLABLE", type="FLOAT"), + ("k_3", "kk_2"): ElementSchema(name="kk_2", mode="NULLABLE", type="STRING"), + ("k_3", "kk_3"): ElementSchema(name="kk_3", mode="NULLABLE", type="BOOLEAN"), +} + +value_batch_schema_4 = [ + { + "k_1": 10, + "k_2": False, + "k_3": "10b", + "k_4": None, + "k_5": "string", + "k_6": {"kk_3": 10}, + }, + { + "k_1": "10a", + "k_2": "False", + "k_3": 10, + "k_4": {"kk_1": 2}, + "k_5": {"kk_2": "hello"}, + "k_6": {"kk_3": "10c"}, + }, +] +expected_r_batch_schema_4 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="STRING"), + ("k_2",): ElementSchema(name="k_2", mode="NULLABLE", type="STRING"), + ("k_3",): ElementSchema(name="k_3", mode="NULLABLE", type="STRING"), + ("k_4",): ElementSchema(name="k_4", mode="NULLABLE", type="RECORD"), + ("k_4", "kk_1"): ElementSchema(name="kk_1", mode="NULLABLE", type="FLOAT"), + ("k_5",): ElementSchema(name="k_5", mode="NULLABLE", type="RECORD"), + ("k_5", "kk_2"): ElementSchema(name="kk_2", mode="NULLABLE", type="STRING"), + ("k_6",): ElementSchema(name="k_6", mode="NULLABLE", type="RECORD"), + ("k_6", "kk_3"): ElementSchema(name="kk_3", mode="NULLABLE", type="STRING"), +} value_batch_schema_5 = [{"k_1": None}] -expected_r_batch_schema_5 = {} # func returns immediately - -@pytest.mark.parametrize("value, expected_result", [(value_batch_schema_1, expected_r_batch_schema_1), - (value_batch_schema_2, expected_r_batch_schema_2), - (value_batch_schema_3, expected_r_batch_schema_3), - (value_batch_schema_4, expected_r_batch_schema_4), - (value_batch_schema_5, expected_r_batch_schema_5)]) +expected_r_batch_schema_5 = {} # func returns immediately + + +@pytest.mark.parametrize( + "value, expected_result", + [ + (value_batch_schema_1, expected_r_batch_schema_1), + (value_batch_schema_2, expected_r_batch_schema_2), + (value_batch_schema_3, expected_r_batch_schema_3), + (value_batch_schema_4, expected_r_batch_schema_4), + (value_batch_schema_5, expected_r_batch_schema_5), + ], +) def test_get_batch_schema(value, expected_result): schema_generator = SchemaGenerator() schema_generator.update_schema_columns(value) assert schema_generator.schema_columns_dict == expected_result + """ TEST update_schema_columns_dict """ -@pytest.mark.parametrize("value", [("NULL"), \ - (None)]) + + +@pytest.mark.parametrize("value", [("NULL"), (None)]) def test_update_schema_columns_dict_exception(value): schema_generator = SchemaGenerator() - elem_name = 'k_1' - element_schema = {"mode": "NULLABLE", "type": value} + elem_name = ("k_1",) + element_schema = ElementSchema(name="k_1", mode="NULLABLE", type=value) with pytest.raises(Exception) as e_info: schema_generator._update_schema_columns_dict(elem_name, element_schema) + """ TEST construct_nesting_dict """ -value_nesting_dict_1 = {'col_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'col_1'}, - 'col_2': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'col_2'}, - 'col_1.col_3': {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'col_1.col_3'}, - 'col_1.col_3.col_5': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'col_1.col_3.col_5'}, - 'col_1.col_4': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'col_1.col_4'} - } - -expected_r_nesting_dict_1 = {1: ['col_1', 'col_2'], 2: ['col_1.col_3', 'col_1.col_4'], 3: ['col_1.col_3.col_5']} -@pytest.mark.parametrize("value, expected_result", [(value_nesting_dict_1, expected_r_nesting_dict_1)]) +value_nesting_dict_1 = { + ("col_1",): {"mode": "NULLABLE", "type": "FLOAT", "name": "col_1"}, + ("col_2",): {"mode": "NULLABLE", "type": "BOOLEAN", "name": "col_2"}, + ("col_1", "col_3"): {"mode": "NULLABLE", "type": "RECORD", "name": "col_1.col_3"}, + ("col_1", "col_3", "col_5"): { + "mode": "NULLABLE", + "type": "FLOAT", + "name": "col_1.col_3.col_5", + }, + ("col_1", "col_4"): {"mode": "NULLABLE", "type": "STRING", "name": "col_1.col_4"}, +} + +expected_r_nesting_dict_1 = { + 1: [("col_1",), ("col_2",)], + 2: [("col_1", "col_3"), ("col_1", "col_4")], + 3: [("col_1", "col_3", "col_5")], +} + + +@pytest.mark.parametrize( + "value, expected_result", [(value_nesting_dict_1, expected_r_nesting_dict_1)] +) def test_construct_nesting_dict(value, expected_result): assert schema_generator._construct_nesting_dict(value) == expected_result + """ TEST construct_bq_schema """ -value_construct_bq_schema_1 = {'k_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}} -expected_r_construct_bq_schema_1 = [{'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}] - -value_construct_bq_schema_2 = { 'k_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}, - 'k_2': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'k_2'}, - 'k_3': {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_3'}, - 'k_3.kk_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kk_1'}, - 'k_3.kk_2': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_2'}, - 'k_3.kk_3': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'kk_3'} - } -expected_r_construct_bq_schema_2 = [{'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}, - {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'k_2'}, - {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_3', 'fields': - [{'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kk_1'}, - {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_2'}, - {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'kk_3'}] - }] -value_construct_bq_schema_3 = { 'k_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}, - 'k_3.kk_1': {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kk_1'}, - 'k_2': {'mode': 'NULLABLE', 'type': 'RECORD', 'name': 'k_2'}, - 'k_3': {'mode': 'NULLABLE', 'type': 'RECORD', 'name': 'k_3'}, - 'k_2.kk_4.kkk_1' : {'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kkk_1'}, - 'k_3.kk_2': {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_2'}, - 'k_3.kk_3': {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'kk_3'}, - 'k_2.kk_4': {'mode': 'NULLABLE', 'type': 'RECORD', 'name': 'kk_4'}, - } -expected_r_construct_bq_schema_3 = [{'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'k_1'}, - {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_2', 'fields': - [{'mode': 'NULLABLE', 'type': "RECORD", 'name': 'kk_4', "fields": - [{'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kkk_1'}]}]}, - {'mode': 'NULLABLE', 'type': "RECORD", 'name': 'k_3', 'fields': - [{'mode': 'NULLABLE', 'type': "FLOAT", 'name': 'kk_1'}, - {'mode': 'NULLABLE', 'type': "STRING", 'name': 'kk_2'}, - {'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'kk_3'}] - }] - -value_construct_bq_schema_4 = {'k_1': {'mode': 'NULLABLE', 'name': 'k_1', 'type': 'RECORD'}, - 'k_1.kk_3': {'mode': 'NULLABLE', 'name': 'k_1.kk_3', 'type': 'RECORD'}} +value_construct_bq_schema_1 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT") +} +expected_r_construct_bq_schema_1 = [ + {"mode": "NULLABLE", "type": "FLOAT", "name": "k_1"} +] + +value_construct_bq_schema_2 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT"), + ("k_2",): ElementSchema(name="k_2", mode="NULLABLE", type="BOOLEAN"), + ("k_3",): ElementSchema(name="k_3", mode="NULLABLE", type="RECORD"), + ("k_3", "kk_1"): ElementSchema(name="kk_1", mode="NULLABLE", type="FLOAT"), + ("k_3", "kk_2"): ElementSchema(name="kk_2", mode="NULLABLE", type="STRING"), + ("k_3", "kk_3"): ElementSchema(name="kk_3", mode="NULLABLE", type="BOOLEAN"), +} +expected_r_construct_bq_schema_2 = [ + {"mode": "NULLABLE", "type": "FLOAT", "name": "k_1"}, + {"mode": "NULLABLE", "type": "BOOLEAN", "name": "k_2"}, + { + "mode": "NULLABLE", + "type": "RECORD", + "name": "k_3", + "fields": [ + {"mode": "NULLABLE", "type": "FLOAT", "name": "kk_1"}, + {"mode": "NULLABLE", "type": "STRING", "name": "kk_2"}, + {"mode": "NULLABLE", "type": "BOOLEAN", "name": "kk_3"}, + ], + }, +] +value_construct_bq_schema_3 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="FLOAT"), + ("k_3", "kk_1"): ElementSchema(name="kk_1", mode="NULLABLE", type="FLOAT"), + ("k_2",): ElementSchema(name="k_2", mode="NULLABLE", type="RECORD"), + ("k_3",): ElementSchema(name="k_3", mode="NULLABLE", type="RECORD"), + ("k_2", "kk_4", "kkk_1"): ElementSchema( + name="kkk_1", mode="NULLABLE", type="FLOAT" + ), + ("k_3", "kk_2"): ElementSchema(name="kk_2", mode="NULLABLE", type="STRING"), + ("k_3", "kk_3"): ElementSchema(name="kk_3", mode="NULLABLE", type="BOOLEAN"), + ("k_2", "kk_4"): ElementSchema(name="kk_4", mode="NULLABLE", type="RECORD"), +} +expected_r_construct_bq_schema_3 = [ + {"mode": "NULLABLE", "type": "FLOAT", "name": "k_1"}, + { + "mode": "NULLABLE", + "type": "RECORD", + "name": "k_2", + "fields": [ + { + "mode": "NULLABLE", + "type": "RECORD", + "name": "kk_4", + "fields": [{"mode": "NULLABLE", "type": "FLOAT", "name": "kkk_1"}], + } + ], + }, + { + "mode": "NULLABLE", + "type": "RECORD", + "name": "k_3", + "fields": [ + {"mode": "NULLABLE", "type": "FLOAT", "name": "kk_1"}, + {"mode": "NULLABLE", "type": "STRING", "name": "kk_2"}, + {"mode": "NULLABLE", "type": "BOOLEAN", "name": "kk_3"}, + ], + }, +] + +value_construct_bq_schema_4 = { + ("k_1",): ElementSchema(name="k_1", mode="NULLABLE", type="RECORD"), + ("k_1", "kk_3"): ElementSchema(name="k_1.kk_3", mode="NULLABLE", type="RECORD"), +} expected_r_construct_bq_schema_4 = [] -@pytest.mark.parametrize("value, expected_result", [(value_construct_bq_schema_1, expected_r_construct_bq_schema_1), - (value_construct_bq_schema_2, expected_r_construct_bq_schema_2), - (value_construct_bq_schema_3, expected_r_construct_bq_schema_3), - (value_construct_bq_schema_4, expected_r_construct_bq_schema_4)]) + +@pytest.mark.parametrize( + "value, expected_result", + [ + (value_construct_bq_schema_1, expected_r_construct_bq_schema_1), + (value_construct_bq_schema_2, expected_r_construct_bq_schema_2), + (value_construct_bq_schema_3, expected_r_construct_bq_schema_3), + (value_construct_bq_schema_4, expected_r_construct_bq_schema_4), + ], +) def test_construct_bq_schema(value, expected_result): schema_generator = SchemaGenerator() schema_generator._construct_bq_schema(value) - assert schema_generator.bq_consumable_schema == expected_result + assert [ + item.to_bq_dict() for item in schema_generator.bq_consumable_schema + ] == expected_result + """ TEST batch_to_bq_schema """ # WIP. change the test data to something that has no context -value_batch_to_bq_schema_0 = [{'k_1': []}, {'k_2': True}] -expected_r_batch_to_bq_schema_0 = [{'mode': 'NULLABLE', 'type': "BOOLEAN", 'name': 'k_2'}] -with open('tests/test_data.json', 'r') as f: +value_batch_to_bq_schema_0 = [{"k_1": []}, {"k_2": True}] +expected_r_batch_to_bq_schema_0 = [ + {"mode": "NULLABLE", "type": "BOOLEAN", "name": "k_2"} +] +with open("tests/test_data.json", "r") as f: test_values = json.load(f) value_batch_to_bq_schema_1 = test_values["value_batch_to_bq_schema_1"] -expected_r_batch_to_bq_schema_1= test_values["expected_r_batch_to_bq_schema_1"] +expected_r_batch_to_bq_schema_1 = test_values["expected_r_batch_to_bq_schema_1"] value_batch_to_bq_schema_2 = test_values["value_batch_to_bq_schema_2"] expected_r_batch_to_bq_schema_2 = test_values["expected_r_batch_to_bq_schema_2"] -@pytest.mark.parametrize("value, expected_result", [(value_batch_to_bq_schema_0, expected_r_batch_to_bq_schema_0), - (value_batch_to_bq_schema_1, expected_r_batch_to_bq_schema_1), - (value_batch_to_bq_schema_2, expected_r_batch_to_bq_schema_2)]) +@pytest.mark.parametrize( + "value, expected_result", + [ + (value_batch_to_bq_schema_0, expected_r_batch_to_bq_schema_0), + (value_batch_to_bq_schema_1, expected_r_batch_to_bq_schema_1), + (value_batch_to_bq_schema_2, expected_r_batch_to_bq_schema_2), + ], +) def test_batch_to_bq_schema(value, expected_result): assert batch_to_bq_schema(value) == expected_result From 8f570d31b69c0a003ddee32adb6d765365b4105e Mon Sep 17 00:00:00 2001 From: Alex Liao Date: Thu, 4 May 2023 04:42:06 -0400 Subject: [PATCH 2/2] Fix int type --- bq_schema_generator/schema_generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bq_schema_generator/schema_generator.py b/bq_schema_generator/schema_generator.py index 945359a..9de81b6 100644 --- a/bq_schema_generator/schema_generator.py +++ b/bq_schema_generator/schema_generator.py @@ -3,14 +3,14 @@ import logging AllowedTypes = Union[dict, bool, float, int, str, list] -AllowedBQDataType = Literal["BOOLEAN", "INT", "FLOAT", "STRING", "RECORD", "NULL"] +AllowedBQDataType = Literal["BOOLEAN", "INTEGER", "FLOAT", "STRING", "RECORD", "NULL"] BQMode = Literal["NULLABLE", "REQUIRED", "REPEATED"] INT_MODE_TYPE_TO_BQ_DATA_TYPE: dict[type, AllowedBQDataType] = { dict: "RECORD", bool: "BOOLEAN", float: "FLOAT", - int: "INT", + int: "INTEGER", str: "STRING", type(None): "NULL", } @@ -23,7 +23,7 @@ # i.e. right side can overwrite left side INT_MODE_TYPE_HIERARCHY: dict[AllowedBQDataType, list[AllowedBQDataType]] = { "BOOLEAN": ["STRING", "RECORD"], - "INT": ["FLOAT", "STRING", "RECORD"], + "INTEGER": ["FLOAT", "STRING", "RECORD"], "FLOAT": ["STRING", "RECORD"], "STRING": ["RECORD"], "RECORD": [],