-
Notifications
You must be signed in to change notification settings - Fork 173
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix resource level max_table_nesting and normalizer performance tuning #2026
Changes from 2 commits
179fcb1
23582ce
2ad9591
4606b47
a54a171
8246012
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,7 +96,7 @@ def _reset(self) -> None: | |
# self.primary_keys = Dict[str, ] | ||
|
||
def _flatten( | ||
self, table: str, dict_row: DictStrAny, _r_lvl: int | ||
self, table: str, dict_row: DictStrAny, parent_path: Tuple[str, ...], _r_lvl: int | ||
) -> Tuple[DictStrAny, Dict[Tuple[str, ...], Sequence[Any]]]: | ||
out_rec_row: DictStrAny = {} | ||
out_rec_list: Dict[Tuple[str, ...], Sequence[Any]] = {} | ||
|
@@ -117,7 +117,7 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - | |
# for lists and dicts we must check if type is possibly nested | ||
if isinstance(v, (dict, list)): | ||
if not self._is_nested_type( | ||
self.schema, table, nested_name, self.max_nesting, __r_lvl | ||
self.schema, table, nested_name, self.max_nesting, parent_path, __r_lvl | ||
): | ||
# TODO: if schema contains table {table}__{nested_name} then convert v into single element list | ||
if isinstance(v, dict): | ||
|
@@ -268,7 +268,7 @@ def _normalize_row( | |
schema = self.schema | ||
table = schema.naming.shorten_fragments(*parent_path, *ident_path) | ||
# flatten current row and extract all lists to recur into | ||
flattened_row, lists = self._flatten(table, dict_row, _r_lvl) | ||
flattened_row, lists = self._flatten(table, dict_row, parent_path, _r_lvl) | ||
# always extend row | ||
DataItemNormalizer._extend_row(extend, flattened_row) | ||
# infer record hash or leave existing primary key if present | ||
|
@@ -423,10 +423,25 @@ def _normalize_prop( | |
) | ||
|
||
@staticmethod | ||
def _get_table_nesting_level(schema: Schema, table_name: str) -> Optional[int]: | ||
def _get_table_nesting_level( | ||
schema: Schema, table_name: str, parent_path: Tuple[str, ...] | ||
) -> Optional[int]: | ||
"""gets table nesting level, will inherit from parent if not set""" | ||
|
||
# try go get table directly | ||
table = schema.tables.get(table_name) | ||
max_nesting = None | ||
|
||
if table and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))): | ||
return max_nesting | ||
|
||
# if table is not found, try to get it from root path | ||
if max_nesting is None and parent_path: | ||
table = schema.tables.get(parent_path[0]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we check intermediary tables here too? so if parent path is ["one", "two"] should we also check table "one__two" or is it always correct to use the top table? I'm not sure there is a case where there would be settings in a intermediary table. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we need only parent table. the overall fix is good but maybe we could optimize this block a little? it is always top level table that has this setting so maybe we should pass root table as table_name here? then no changes are needed |
||
|
||
if table: | ||
return table.get("x-normalizer", {}).get("max_nesting") # type: ignore | ||
return cast(int, table.get("x-normalizer", {}).get("max_nesting")) | ||
|
||
return None | ||
|
||
@staticmethod | ||
|
@@ -440,13 +455,20 @@ def _get_primary_key(schema: Schema, table_name: str) -> List[str]: | |
@staticmethod | ||
@lru_cache(maxsize=None) | ||
def _is_nested_type( | ||
schema: Schema, table_name: str, field_name: str, max_nesting: int, _r_lvl: int | ||
schema: Schema, | ||
table_name: str, | ||
field_name: str, | ||
max_nesting: int, | ||
parent_path: Tuple[str, ...], | ||
_r_lvl: int, | ||
) -> bool: | ||
"""For those paths the nested objects should be left in place. | ||
Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster | ||
""" | ||
# turn everything at the recursion level into nested type | ||
max_table_nesting = DataItemNormalizer._get_table_nesting_level(schema, table_name) | ||
max_table_nesting = DataItemNormalizer._get_table_nesting_level( | ||
schema, table_name, parent_path | ||
) | ||
if max_table_nesting is not None: | ||
max_nesting = max_table_nesting | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is probably the most often called function in our code. maybe we can skip passing additional argument here or do something else to make it faster?