Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix resource level max_table_nesting and normalizer performance tuning #2026

Merged
merged 6 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 29 additions & 7 deletions dlt/common/normalizers/json/relational.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _reset(self) -> None:
# self.primary_keys = Dict[str, ]

def _flatten(
self, table: str, dict_row: DictStrAny, _r_lvl: int
self, table: str, dict_row: DictStrAny, parent_path: Tuple[str, ...], _r_lvl: int
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is probably the most often called function in our code. maybe we can skip passing additional argument here or do something else to make it faster?

  • count the recursion level down (r_lvl) with the initial value initialized to max nesting per table. so we do not unnest when below _r_lvl
  • pass just parent table name, should be faster than tuple

) -> Tuple[DictStrAny, Dict[Tuple[str, ...], Sequence[Any]]]:
out_rec_row: DictStrAny = {}
out_rec_list: Dict[Tuple[str, ...], Sequence[Any]] = {}
Expand All @@ -117,7 +117,7 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) -
# for lists and dicts we must check if type is possibly nested
if isinstance(v, (dict, list)):
if not self._is_nested_type(
self.schema, table, nested_name, self.max_nesting, __r_lvl
self.schema, table, nested_name, self.max_nesting, parent_path, __r_lvl
):
# TODO: if schema contains table {table}__{nested_name} then convert v into single element list
if isinstance(v, dict):
Expand Down Expand Up @@ -268,7 +268,7 @@ def _normalize_row(
schema = self.schema
table = schema.naming.shorten_fragments(*parent_path, *ident_path)
# flatten current row and extract all lists to recur into
flattened_row, lists = self._flatten(table, dict_row, _r_lvl)
flattened_row, lists = self._flatten(table, dict_row, parent_path, _r_lvl)
# always extend row
DataItemNormalizer._extend_row(extend, flattened_row)
# infer record hash or leave existing primary key if present
Expand Down Expand Up @@ -423,10 +423,25 @@ def _normalize_prop(
)

@staticmethod
def _get_table_nesting_level(schema: Schema, table_name: str) -> Optional[int]:
def _get_table_nesting_level(
schema: Schema, table_name: str, parent_path: Tuple[str, ...]
) -> Optional[int]:
"""gets table nesting level, will inherit from parent if not set"""

# try go get table directly
table = schema.tables.get(table_name)
max_nesting = None

if table and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))):
return max_nesting

# if table is not found, try to get it from root path
if max_nesting is None and parent_path:
table = schema.tables.get(parent_path[0])
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we check intermediary tables here too? so if parent path is ["one", "two"] should we also check table "one__two" or is it always correct to use the top table? I'm not sure there is a case where there would be settings in a intermediary table.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need only parent table. the overall fix is good but maybe we could optimize this block a little? it is always top level table that has this setting so maybe we should pass root table as table_name here? then no changes are needed


if table:
return table.get("x-normalizer", {}).get("max_nesting") # type: ignore
return cast(int, table.get("x-normalizer", {}).get("max_nesting"))

return None

@staticmethod
Expand All @@ -440,13 +455,20 @@ def _get_primary_key(schema: Schema, table_name: str) -> List[str]:
@staticmethod
@lru_cache(maxsize=None)
def _is_nested_type(
schema: Schema, table_name: str, field_name: str, max_nesting: int, _r_lvl: int
schema: Schema,
table_name: str,
field_name: str,
max_nesting: int,
parent_path: Tuple[str, ...],
_r_lvl: int,
) -> bool:
"""For those paths the nested objects should be left in place.
Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster
"""
# turn everything at the recursion level into nested type
max_table_nesting = DataItemNormalizer._get_table_nesting_level(schema, table_name)
max_table_nesting = DataItemNormalizer._get_table_nesting_level(
schema, table_name, parent_path
)
if max_table_nesting is not None:
max_nesting = max_table_nesting

Expand Down
12 changes: 6 additions & 6 deletions tests/common/normalizers/test_json_relational.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_flatten_fix_field_name(norm: RelationalNormalizer) -> None:
"f 2": [],
"f!3": {"f4": "a", "f-5": "b", "f*6": {"c": 7, "c v": 8, "c x": []}},
}
flattened_row, lists = norm._flatten("mock_table", row, 0)
flattened_row, lists = norm._flatten("mock_table", row, (), 0)
assert "f_1" in flattened_row
# assert "f_2" in flattened_row
assert "f_3__f4" in flattened_row
Expand Down Expand Up @@ -62,11 +62,11 @@ def test_preserve_json_value(norm: RelationalNormalizer) -> None:
)
)
row_1 = {"value": 1}
flattened_row, _ = norm._flatten("with_json", row_1, 0)
flattened_row, _ = norm._flatten("with_json", row_1, (), 0)
assert flattened_row["value"] == 1

row_2 = {"value": {"json": True}}
flattened_row, _ = norm._flatten("with_json", row_2, 0)
flattened_row, _ = norm._flatten("with_json", row_2, (), 0)
assert flattened_row["value"] == row_2["value"]
# json value is not flattened
assert "value__json" not in flattened_row
Expand All @@ -78,11 +78,11 @@ def test_preserve_json_value_with_hint(norm: RelationalNormalizer) -> None:
norm.schema._compile_settings()

row_1 = {"value": 1}
flattened_row, _ = norm._flatten("any_table", row_1, 0)
flattened_row, _ = norm._flatten("any_table", row_1, (), 0)
assert flattened_row["value"] == 1

row_2 = {"value": {"json": True}}
flattened_row, _ = norm._flatten("any_table", row_2, 0)
flattened_row, _ = norm._flatten("any_table", row_2, (), 0)
assert flattened_row["value"] == row_2["value"]
# json value is not flattened
assert "value__json" not in flattened_row
Expand Down Expand Up @@ -884,7 +884,7 @@ def test_caching_perf(norm: RelationalNormalizer) -> None:
table["x-normalizer"] = {}
start = time()
for _ in range(100000):
norm._is_nested_type(norm.schema, "test", "field", 0, 0)
norm._is_nested_type(norm.schema, "test", "field", 0, (), 0)
# norm._get_table_nesting_level(norm.schema, "test")
print(f"{time() - start}")

Expand Down
Loading
Loading