From ffb6ef3bab85902a1fe96e2022eb2fafa94497bd Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 16:59:11 +0000 Subject: [PATCH 01/12] feat: add `where` to nplikes --- src/awkward/_nplikes/array_module.py | 3 +++ src/awkward/_nplikes/numpylike.py | 4 ++++ src/awkward/_nplikes/typetracer.py | 9 ++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/awkward/_nplikes/array_module.py b/src/awkward/_nplikes/array_module.py index 3572b2f1d9..143a0f655e 100644 --- a/src/awkward/_nplikes/array_module.py +++ b/src/awkward/_nplikes/array_module.py @@ -186,6 +186,9 @@ def regularize_index_for_length( def nonzero(self, x: ArrayLike) -> tuple[ArrayLike, ...]: return self._module.nonzero(x) + def where(self, condition: ArrayLike, x1: ArrayLike, x2: ArrayLike) -> ArrayLike: + return self._module.where(condition, x1, x2) + def unique_values(self, x: ArrayLike) -> ArrayLike: return self._module.unique( x, diff --git a/src/awkward/_nplikes/numpylike.py b/src/awkward/_nplikes/numpylike.py index 0d6178e623..4a2ca1313d 100644 --- a/src/awkward/_nplikes/numpylike.py +++ b/src/awkward/_nplikes/numpylike.py @@ -396,6 +396,10 @@ def reshape( def nonzero(self, x: ArrayLike) -> tuple[ArrayLike, ...]: ... + @abstractmethod + def where(self, condition: ArrayLike, x1: ArrayLike, x2: ArrayLike) -> ArrayLike: + ... + @abstractmethod def unique_values(self, x: ArrayLike) -> ArrayLike: ... diff --git a/src/awkward/_nplikes/typetracer.py b/src/awkward/_nplikes/typetracer.py index 190256b2c8..e6f46221f8 100644 --- a/src/awkward/_nplikes/typetracer.py +++ b/src/awkward/_nplikes/typetracer.py @@ -1031,9 +1031,16 @@ def nonzero(self, x: ArrayLike) -> tuple[TypeTracerArray, ...]: try_touch_data(x) return (TypeTracerArray._new(np.int64, (unknown_length,)),) * len(x.shape) + def where( + self, condition: ArrayLike, x1: ArrayLike, x2: ArrayLike + ) -> TypeTracerArray: + condition, x1, x2 = self.broadcast_arrays(condition, x1, x2) + result_dtype = numpy.result_type(x1, x2) + return TypeTracerArray._new(result_dtype, shape=condition.shape) + def unique_values(self, x: ArrayLike) -> TypeTracerArray: try_touch_data(x) - return TypeTracerArray._new(x.dtype, shape=(None,)) + return TypeTracerArray._new(x.dtype, shape=(unknown_length,)) def concat(self, arrays, *, axis: int | None = 0) -> TypeTracerArray: if axis is None: From a46ae701be3cff83264df774efb22a314b9411dd Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 16:59:25 +0000 Subject: [PATCH 02/12] fix: support option-of-record in union-of-record --- .../operations/ak_merge_union_of_records.py | 310 ++++++++++++------ 1 file changed, 202 insertions(+), 108 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 3694909496..06d242a467 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -2,7 +2,7 @@ import awkward as ak -from awkward._nplikes.numpylike import NumpyMetadata +from awkward._nplikes.numpylike import ArrayLike, NumpyMetadata np = NumpyMetadata.instance() cpu = ak._backends.NumpyBackend.instance() @@ -41,32 +41,103 @@ def _impl(array, axis, highlevel, behavior): behavior = ak._util.behavior_of(array, behavior=behavior) layout = ak.to_layout(array, allow_record=False) - def apply_displace_index(layout, backend, **kwargs): - if layout.is_record: - return layout - elif layout.is_option and layout.content.is_record: - raise ak._errors.wrap_error( - TypeError( - "optional records cannot be merged by this function. First call `ak.merge_option_of_records` " - "to convert these into records of options." + def invert_record_union( + tags: ArrayLike, index: ArrayLike, contents, *, backend + ) -> ak.contents.RecordArray: + index_nplike = backend.index_nplike + # First, find all ordered fields, regularising any index-of-record + # such that we have record-of-index + seen_fields = set() + all_fields = [] + for content in contents: + # Find new fields + for field in content.fields: + if field not in seen_fields: + seen_fields.add(field) + all_fields.append(field) + + # Build unions for each field + outer_field_contents = [] + for field in all_fields: + field_tags = index_nplike.asarray(tags, copy=True) + field_index = index_nplike.asarray(index, copy=True) + + # Build contents for union representing current field + field_contents = [c.content(field) for c in contents if c.has_field(field)] + + # Find the best location for option type. + # We will potentially have fewer contents in this per-field union + # than the original outer union-of-records, because some recordarrays + # may not have the given field. + tag_for_missing = 0 + for i, content in enumerate(field_contents): + if content.is_option: + tag_for_missing = i + break + + # If at least one recordarray doesn't have this field, we add + # a special option + if len(field_contents) < len(contents): + # Make the tagged content an option, growing by one to ensure we + # have a known `None` value to index into + tagged_content = field_contents[tag_for_missing] + indexedoption_index = backend.index_nplike.arange( + tagged_content.length + 1, dtype=np.int64 ) + indexedoption_index[ + index_nplike.shape_item_as_index(tagged_content.length) + ] = -1 + field_contents[ + tag_for_missing + ] = ak.contents.IndexedOptionArray.simplified( + ak.index.Index64(indexedoption_index), tagged_content + ) + + # Index of None values in tagged content (content with extra None item at end) + index_missing = index_nplike.shape_item_as_index( + field_contents[tag_for_missing].length - 1 ) - elif layout.is_indexed and layout.content.is_record: - record = layout.content - # Transpose index-of-record to record-of-index - return ak.contents.RecordArray( - [ - ak.contents.IndexedArray.simplified( - layout.index, c, parameters=layout._parameters - ) - for c in record.contents - ], - record.fields, - record.length, - backend=backend, + # Now build contents for union, by looping over outermost index + # Overwrite tags to adjust for new contents length + # and use the tagged content for any missing values + k = 0 + for j, content in enumerate(contents): + tag_is_j = field_tags == j + + if content.has_field(field): + # Rewrite tags to account for missing fields + field_tags[tag_is_j] = k + k += 1 + + else: + # Rewrite tags to point to option content + field_tags[tag_is_j] = tag_for_missing + # Point each value to missing value + field_index[tag_is_j] = index_missing + + outer_field_contents.append( + ak.contents.UnionArray.simplified( + ak.index.Index8(field_tags), + ak.index.Index64(field_index), + field_contents, + ) ) - else: - raise ak._errors.wrap_error(TypeError(layout)) + return ak.contents.RecordArray( + outer_field_contents, all_fields, backend=backend + ) + + def compact_option_index(index: ArrayLike, *, backend) -> ArrayLike: + # Find dense (outer) index into non-null items. + # This is in trivial order: the re-arranging is done by the union (below) + is_none = index < 0 + num_none = backend.index_nplike.count_nonzero(is_none) + dense_index = backend.index_nplike.empty(len(index), dtype=index.dtype) + dense_index[is_none] = -1 + dense_index[~is_none] = backend.index_nplike.arange( + len(index) - num_none, + dtype=index.dtype, + ) + return dense_index def apply(layout, depth, backend, **kwargs): posaxis = ak._util.maybe_posaxis(layout, axis, depth) @@ -75,93 +146,116 @@ def apply(layout, depth, backend, **kwargs): np.AxisError(f"axis={axis} exceeds the depth of this array ({depth})") ) elif depth == posaxis + 1 and layout.is_union: - if all(x.is_record for x in layout.contents): - # First, find all ordered fields, regularising any index-of-record - # such that we have record-of-index - seen_fields = set() - all_fields = [] - regularised_contents = [] - for content in layout.contents: - # Ensure that we have record-of-index - regularised_content = ak._do.recursively_apply( - content, apply_displace_index - ) - regularised_contents.append(regularised_content) - - # Find new fields - for field in regularised_content.fields: - if field not in seen_fields: - seen_fields.add(field) - all_fields.append(field) - - # Build unions for each field - outer_field_contents = [] - for field in all_fields: - field_tags = backend.index_nplike.asarray(layout.tags, copy=True) - field_index = backend.index_nplike.asarray(layout.index, copy=True) - - # Build contents for union representing current field - field_contents = [ - c.content(field) - for c in regularised_contents - if c.has_field(field) - ] - - # Find the best location for option type. - # We will potentially have fewer contents in this per-field union - # than the original outer union-of-records, because some recordarrays - # may not have the given field. - tag_for_missing = 0 - for i, content in enumerate(field_contents): - if content.is_option: - tag_for_missing = i - break - - # If at least one recordarray doesn't have this field, we add - # a special option - if len(field_contents) < len(regularised_contents): - # Make the tagged content an option, growing by one to ensure we - # have a known `None` value to index into - tagged_content = field_contents[tag_for_missing] - indexedoption_index = backend.index_nplike.arange( - tagged_content.length + 1, dtype=np.int64 - ) - indexedoption_index[tagged_content.length] = -1 - field_contents[ - tag_for_missing - ] = ak.contents.IndexedOptionArray.simplified( - ak.index.Index64(indexedoption_index), tagged_content + if not all( + x.is_record or x.is_indexed or x.is_option for x in layout.contents + ): + return + + # Any option types need to be re-written + if any(x.is_option for x in layout.contents): + # We'll create an outermost indexed-option type, which re-instates the missing values + outer_option_index = backend.index_nplike.empty( + layout.length, dtype=np.int64 + ) + + # We'll rebuild the union to include only the non-null items. + next_union_dense_index_parts = [] + next_contents = [] + next_tags_sparse = backend.index_nplike.asarray(layout.tags, copy=True) + for tag, content in enumerate(layout.contents): + is_this_tag = backend.index_nplike.asarray(layout.tags) == tag + + # Union arrays for this content + tag_index = backend.index_nplike.asarray(layout.index)[is_this_tag] + + # Optimisation + if isinstance(content, ak.contents.UnmaskedArray) or not ( + content.is_option or content.is_indexed + ): + next_contents.append(content) + next_union_dense_index_parts.append(tag_index) + + else: + content = content.to_IndexedOptionArray64() + + # Find dense (outer) index into non-null items. + outer_option_index[is_this_tag] = compact_option_index( + backend.index_nplike.asarray(content.index), backend=backend ) - # Now build contents for union, by looping over outermost index - # Overwrite tags to adjust for new contents length - # and use the tagged content for any missing values - k = 0 - for j, content in enumerate(regularised_contents): - tag_is_j = field_tags == j - - if content.has_field(field): - # Rewrite tags to account for missing fields - field_tags[tag_is_j] = k - k += 1 - - else: - # Rewrite tags to point to option content - field_tags[tag_is_j] = tag_for_missing - # Point each value to missing value - field_index[tag_is_j] = ( - field_contents[tag_for_missing].length - 1 - ) - - outer_field_contents.append( - ak.contents.UnionArray.simplified( - ak.index.Index8(field_tags), - ak.index.Index64(field_index), - field_contents, + # Now find the inner index that actually re-arranges the (non-null) items + merged_index = backend.index_nplike.asarray(content.index)[ + tag_index + ] + is_non_null = merged_index >= 0 + next_union_dense_index_parts.append(merged_index[is_non_null]) + # Mask out tags of items that are missing + next_tags_sparse[is_this_tag] = backend.index_nplike.where( + is_non_null, tag, -1 ) - ) - return ak.contents.RecordArray( - outer_field_contents, all_fields, backend=backend + + # outer_index has same length as layout, so union index should align + # union items that are null are set to -1, and those that are not are densified + next_contents.append(content.content) + + # Find length of new tags array + total_length = 0 + for array in next_union_dense_index_parts: + total_length += array.size + + # Ignore missing items for inner union + next_tags = next_tags_sparse[next_tags_sparse >= 0] + # Inject new dense index + next_index = backend.index_nplike.empty(total_length, dtype=np.int64) + for tag, index in enumerate(next_union_dense_index_parts): + next_index[next_tags == tag] = index + + next_content = invert_record_union( + next_tags, next_index, next_contents, backend=backend + ) + return ak.contents.IndexedOptionArray( + ak.index.Index64(outer_option_index), + next_content, + ) + + # Any index types need to be re-written + elif any(x.is_indexed for x in layout.contents): + # We'll create an outermost indexed-option type, which re-instates the missing values + current_index = backend.index_nplike.asarray(layout.index) + next_index = backend.index_nplike.empty( + current_index.size, dtype=np.int64 + ) + + # We'll rebuild the union to include only the non-null items. + next_contents = [] + for tag, content in enumerate(layout.contents): + is_this_tag = backend.index_nplike.asarray(layout.tags) == tag + + # Rewrite union index of indexed types + if content.is_indexed: + content_index = backend.index_nplike.asarray(content.index) + next_index[is_this_tag] = current_index[is_this_tag][ + content_index + ] + next_contents.append(content.content) + + else: + next_index[is_this_tag] = current_index[is_this_tag] + next_contents.append(content) + + return invert_record_union( + backend.index_nplike.asarray(layout.tags), + next_index, + next_contents, + backend=backend, + ) + + else: + return invert_record_union( + backend.index_nplike.asarray(layout.tags), + backend.index_nplike.asarray(layout.index), + layout.contents, + backend=backend, ) out = ak._do.recursively_apply(layout, apply) From c891d1ec3b90282b9b37d31456ecff6e24f2538a Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 17:01:36 +0000 Subject: [PATCH 03/12] fix: handle index in option branch --- src/awkward/operations/ak_merge_union_of_records.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 06d242a467..36755b2950 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -176,7 +176,8 @@ def apply(layout, depth, backend, **kwargs): next_union_dense_index_parts.append(tag_index) else: - content = content.to_IndexedOptionArray64() + if content.is_option: + content = content.to_IndexedOptionArray64() # Find dense (outer) index into non-null items. outer_option_index[is_this_tag] = compact_option_index( From fa4cd6588897a6c65835d3ed9fea65553963681b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 17:11:21 +0000 Subject: [PATCH 04/12] fix: handle index in option branch --- .../operations/ak_merge_union_of_records.py | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 36755b2950..ecff5a66a0 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -45,8 +45,7 @@ def invert_record_union( tags: ArrayLike, index: ArrayLike, contents, *, backend ) -> ak.contents.RecordArray: index_nplike = backend.index_nplike - # First, find all ordered fields, regularising any index-of-record - # such that we have record-of-index + # First, create an ordered list containing the union of all fields seen_fields = set() all_fields = [] for content in contents: @@ -159,7 +158,7 @@ def apply(layout, depth, backend, **kwargs): ) # We'll rebuild the union to include only the non-null items. - next_union_dense_index_parts = [] + inner_union_index_parts = [] next_contents = [] next_tags_sparse = backend.index_nplike.asarray(layout.tags, copy=True) for tag, content in enumerate(layout.contents): @@ -168,28 +167,28 @@ def apply(layout, depth, backend, **kwargs): # Union arrays for this content tag_index = backend.index_nplike.asarray(layout.index)[is_this_tag] - # Optimisation + # For trivial partitions, we just include them as-is if isinstance(content, ak.contents.UnmaskedArray) or not ( content.is_option or content.is_indexed ): next_contents.append(content) - next_union_dense_index_parts.append(tag_index) + inner_union_index_parts.append(tag_index) else: + # Let's work with indexed option types for ease if content.is_option: content = content.to_IndexedOptionArray64() - # Find dense (outer) index into non-null items. + # Find dense index into non-null items of this content for the outer optiontype. + content_index = backend.index_nplike.asarray(content.index) outer_option_index[is_this_tag] = compact_option_index( - backend.index_nplike.asarray(content.index), backend=backend + content_index, backend=backend ) # Now find the inner index that actually re-arranges the (non-null) items - merged_index = backend.index_nplike.asarray(content.index)[ - tag_index - ] + merged_index = content_index[tag_index] is_non_null = merged_index >= 0 - next_union_dense_index_parts.append(merged_index[is_non_null]) + inner_union_index_parts.append(merged_index[is_non_null]) # Mask out tags of items that are missing next_tags_sparse[is_this_tag] = backend.index_nplike.where( is_non_null, tag, -1 @@ -199,24 +198,24 @@ def apply(layout, depth, backend, **kwargs): # union items that are null are set to -1, and those that are not are densified next_contents.append(content.content) - # Find length of new tags array + # Find length of the new (dense) tags array of the inner union total_length = 0 - for array in next_union_dense_index_parts: + for array in inner_union_index_parts: total_length += array.size - # Ignore missing items for inner union + # Ignore missing items for inner union, creating a dense array of tags next_tags = next_tags_sparse[next_tags_sparse >= 0] - # Inject new dense index + # Build dense index from parts for each tag next_index = backend.index_nplike.empty(total_length, dtype=np.int64) - for tag, index in enumerate(next_union_dense_index_parts): - next_index[next_tags == tag] = index + for tag, content_index in enumerate(inner_union_index_parts): + next_index[next_tags == tag] = content_index - next_content = invert_record_union( - next_tags, next_index, next_contents, backend=backend - ) + # Return option around record of unions return ak.contents.IndexedOptionArray( ak.index.Index64(outer_option_index), - next_content, + invert_record_union( + next_tags, next_index, next_contents, backend=backend + ), ) # Any index types need to be re-written From f4215a2c3ee9077fa219c004a3ace94849411b7b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 17:46:12 +0000 Subject: [PATCH 05/12] fix: bugs ;) --- .../operations/ak_merge_union_of_records.py | 30 ++-- ...test_2236_merge_union_of_records_option.py | 136 ++++++++++++++++++ 2 files changed, 153 insertions(+), 13 deletions(-) create mode 100644 tests/test_2236_merge_union_of_records_option.py diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index ecff5a66a0..453f70164f 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -153,7 +153,7 @@ def apply(layout, depth, backend, **kwargs): # Any option types need to be re-written if any(x.is_option for x in layout.contents): # We'll create an outermost indexed-option type, which re-instates the missing values - outer_option_index = backend.index_nplike.empty( + outer_option_index = backend.index_nplike.arange( layout.length, dtype=np.int64 ) @@ -168,24 +168,16 @@ def apply(layout, depth, backend, **kwargs): tag_index = backend.index_nplike.asarray(layout.index)[is_this_tag] # For trivial partitions, we just include them as-is - if isinstance(content, ak.contents.UnmaskedArray) or not ( - content.is_option or content.is_indexed - ): - next_contents.append(content) + if isinstance(content, ak.contents.UnmaskedArray): + next_contents.append(content.content) inner_union_index_parts.append(tag_index) - - else: + elif content.is_option or content.is_indexed: # Let's work with indexed option types for ease if content.is_option: content = content.to_IndexedOptionArray64() - # Find dense index into non-null items of this content for the outer optiontype. - content_index = backend.index_nplike.asarray(content.index) - outer_option_index[is_this_tag] = compact_option_index( - content_index, backend=backend - ) - # Now find the inner index that actually re-arranges the (non-null) items + content_index = backend.index_nplike.asarray(content.index) merged_index = content_index[tag_index] is_non_null = merged_index >= 0 inner_union_index_parts.append(merged_index[is_non_null]) @@ -194,9 +186,21 @@ def apply(layout, depth, backend, **kwargs): is_non_null, tag, -1 ) + # Find dense index into non-null items of this content for the outer optiontype. + outer_option_index[is_this_tag] = backend.index_nplike.where( + is_non_null, outer_option_index[is_this_tag], -1 + ) + # outer_index has same length as layout, so union index should align # union items that are null are set to -1, and those that are not are densified next_contents.append(content.content) + else: + next_contents.append(content) + inner_union_index_parts.append(tag_index) + + outer_option_index = compact_option_index( + outer_option_index, backend=backend + ) # Find length of the new (dense) tags array of the inner union total_length = 0 diff --git a/tests/test_2236_merge_union_of_records_option.py b/tests/test_2236_merge_union_of_records_option.py new file mode 100644 index 0000000000..d7d068351a --- /dev/null +++ b/tests/test_2236_merge_union_of_records_option.py @@ -0,0 +1,136 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import numpy as np # noqa: F401 + +import awkward as ak + + +def test_indexed(): + x = ak.to_layout([{"a": 1, "b": 2}]) + y = ak.contents.IndexedArray( + ak.index.Index64([0]), ak.to_layout([{"c": 3, "b": 5}]) + ) + + z = ak.concatenate((x, y)) + + assert z.tolist() == [ + {"a": 1, "b": 2}, + {"c": 3, "b": 5}, + ] + assert z.type == ak.types.ArrayType( + ak.types.UnionType( + [ + ak.types.RecordType( + [ak.types.NumpyType("int64"), ak.types.NumpyType("int64")], + ["a", "b"], + ), + ak.types.RecordType( + [ak.types.NumpyType("int64"), ak.types.NumpyType("int64")], + ["c", "b"], + ), + ] + ), + 2, + ) + + w = ak.merge_union_of_records(z) + assert w.type == ak.types.ArrayType( + ak.types.RecordType( + [ + ak.types.OptionType(ak.types.NumpyType("int64")), + ak.types.NumpyType("int64"), + ak.types.OptionType(ak.types.NumpyType("int64")), + ], + ["a", "b", "c"], + ), + 2, + ) + assert w.tolist() == [{"a": 1, "b": 2, "c": None}, {"a": None, "b": 5, "c": 3}] + + +def test_option(): + x = ak.to_layout([{"a": 1, "b": 2}]) + y = ak.to_layout([{"c": 3, "b": 5}, None]) + + z = ak.concatenate((x, y)) + + assert z.tolist() == [{"a": 1, "b": 2}, {"c": 3, "b": 5}, None] + assert z.type == ak.types.ArrayType( + ak.types.UnionType( + [ + ak.types.RecordType( + [ak.types.NumpyType("int64"), ak.types.NumpyType("int64")], + ["a", "b"], + ), + ak.types.OptionType( + ak.types.RecordType( + [ak.types.NumpyType("int64"), ak.types.NumpyType("int64")], + ["c", "b"], + ) + ), + ] + ), + 3, + ) + + w = ak.merge_union_of_records(z) + assert w.type == ak.types.ArrayType( + ak.types.OptionType( + ak.types.RecordType( + [ + ak.types.OptionType(ak.types.NumpyType("int64")), + ak.types.NumpyType("int64"), + ak.types.OptionType(ak.types.NumpyType("int64")), + ], + ["a", "b", "c"], + ) + ), + 3, + ) + assert w.tolist() == [ + {"a": 1, "b": 2, "c": None}, + {"a": None, "b": 5, "c": 3}, + None, + ] + + +def test_option_unmasked(): + x = ak.to_layout([{"a": 1, "b": 2}]) + y = ak.contents.UnmaskedArray(ak.to_layout([{"c": 3, "b": 5}])) + + z = ak.concatenate((x, y)) + + assert z.tolist() == [{"a": 1, "b": 2}, {"c": 3, "b": 5}] + assert z.type == ak.types.ArrayType( + ak.types.UnionType( + [ + ak.types.RecordType( + [ak.types.NumpyType("int64"), ak.types.NumpyType("int64")], + ["a", "b"], + ), + ak.types.OptionType( + ak.types.RecordType( + [ak.types.NumpyType("int64"), ak.types.NumpyType("int64")], + ["c", "b"], + ) + ), + ] + ), + 2, + ) + + w = ak.merge_union_of_records(z) + assert w.type == ak.types.ArrayType( + ak.types.OptionType( + ak.types.RecordType( + [ + ak.types.OptionType(ak.types.NumpyType("int64")), + ak.types.NumpyType("int64"), + ak.types.OptionType(ak.types.NumpyType("int64")), + ], + ["a", "b", "c"], + ) + ), + 2, + ) + assert w.tolist() == [{"a": 1, "b": 2, "c": None}, {"a": None, "b": 5, "c": 3}] From a46e0d43cb2f2ec23cd209a6c117f6610e19e133 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 17:53:03 +0000 Subject: [PATCH 06/12] refactor: simplify logic --- .../operations/ak_merge_union_of_records.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 453f70164f..cb05187733 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -153,7 +153,9 @@ def apply(layout, depth, backend, **kwargs): # Any option types need to be re-written if any(x.is_option for x in layout.contents): # We'll create an outermost indexed-option type, which re-instates the missing values - outer_option_index = backend.index_nplike.arange( + # The values of this don't matter; we'll compute them as a dense index (0 ... N) + # where N is the number of non-null items + outer_option_dense_index = backend.index_nplike.empty( layout.length, dtype=np.int64 ) @@ -167,10 +169,11 @@ def apply(layout, depth, backend, **kwargs): # Union arrays for this content tag_index = backend.index_nplike.asarray(layout.index)[is_this_tag] - # For trivial partitions, we just include them as-is + # For unmasked arrays, we can directly take the content if isinstance(content, ak.contents.UnmaskedArray): next_contents.append(content.content) inner_union_index_parts.append(tag_index) + # Otherwise, we need to rebuild the index elif content.is_option or content.is_indexed: # Let's work with indexed option types for ease if content.is_option: @@ -187,19 +190,21 @@ def apply(layout, depth, backend, **kwargs): ) # Find dense index into non-null items of this content for the outer optiontype. - outer_option_index[is_this_tag] = backend.index_nplike.where( - is_non_null, outer_option_index[is_this_tag], -1 - ) + outer_option_dense_index[ + is_this_tag + ] = backend.index_nplike.where(is_non_null, 0, -1) # outer_index has same length as layout, so union index should align # union items that are null are set to -1, and those that are not are densified next_contents.append(content.content) + # Non-indexed/option types are trivially included as-is else: next_contents.append(content) inner_union_index_parts.append(tag_index) - outer_option_index = compact_option_index( - outer_option_index, backend=backend + # Drop the options from the outer most index + outer_option_dense_index = compact_option_index( + outer_option_dense_index, backend=backend ) # Find length of the new (dense) tags array of the inner union @@ -216,7 +221,7 @@ def apply(layout, depth, backend, **kwargs): # Return option around record of unions return ak.contents.IndexedOptionArray( - ak.index.Index64(outer_option_index), + ak.index.Index64(outer_option_dense_index), invert_record_union( next_tags, next_index, next_contents, backend=backend ), From 545cfcafc6491b3734ce10d489181d9ae27e148c Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 17:54:46 +0000 Subject: [PATCH 07/12] docs: more comments --- src/awkward/operations/ak_merge_union_of_records.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index cb05187733..ce6cce61b3 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -154,7 +154,8 @@ def apply(layout, depth, backend, **kwargs): if any(x.is_option for x in layout.contents): # We'll create an outermost indexed-option type, which re-instates the missing values # The values of this don't matter; we'll compute them as a dense index (0 ... N) - # where N is the number of non-null items + # where N is the number of non-null items. This indexes into the record-of-unions, which + # already handles the item ordering outer_option_dense_index = backend.index_nplike.empty( layout.length, dtype=np.int64 ) @@ -202,7 +203,8 @@ def apply(layout, depth, backend, **kwargs): next_contents.append(content) inner_union_index_parts.append(tag_index) - # Drop the options from the outer most index + # Until this point, `outer_option_dense_index` contains either zero or -1. + # We now compute a proper index starting at 0. outer_option_dense_index = compact_option_index( outer_option_dense_index, backend=backend ) From a1884f904964ccffb435d5642277c31882b02ea0 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 18:03:43 +0000 Subject: [PATCH 08/12] docs: more comments! --- .../operations/ak_merge_union_of_records.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index ce6cce61b3..397872699c 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -152,14 +152,6 @@ def apply(layout, depth, backend, **kwargs): # Any option types need to be re-written if any(x.is_option for x in layout.contents): - # We'll create an outermost indexed-option type, which re-instates the missing values - # The values of this don't matter; we'll compute them as a dense index (0 ... N) - # where N is the number of non-null items. This indexes into the record-of-unions, which - # already handles the item ordering - outer_option_dense_index = backend.index_nplike.empty( - layout.length, dtype=np.int64 - ) - # We'll rebuild the union to include only the non-null items. inner_union_index_parts = [] next_contents = [] @@ -180,7 +172,7 @@ def apply(layout, depth, backend, **kwargs): if content.is_option: content = content.to_IndexedOptionArray64() - # Now find the inner index that actually re-arranges the (non-null) items + # First, find the inner index that actually re-arranges the (non-null) items content_index = backend.index_nplike.asarray(content.index) merged_index = content_index[tag_index] is_non_null = merged_index >= 0 @@ -190,23 +182,19 @@ def apply(layout, depth, backend, **kwargs): is_non_null, tag, -1 ) - # Find dense index into non-null items of this content for the outer optiontype. - outer_option_dense_index[ - is_this_tag - ] = backend.index_nplike.where(is_non_null, 0, -1) - - # outer_index has same length as layout, so union index should align - # union items that are null are set to -1, and those that are not are densified + # The length of this index/option content is irrelevant; the union provides the length next_contents.append(content.content) # Non-indexed/option types are trivially included as-is else: next_contents.append(content) inner_union_index_parts.append(tag_index) - # Until this point, `outer_option_dense_index` contains either zero or -1. - # We now compute a proper index starting at 0. + # We'll create an outermost indexed-option type, which re-instates the missing values. + # This should have the same length as the original union, and its index should be "dense" + # (contiguous, monotonic integers; or -1). Therefore, we can directly compute it from the "sparse" + # tags index, which has the same length as the original union, and has only missing items set to -1. outer_option_dense_index = compact_option_index( - outer_option_dense_index, backend=backend + next_tags_sparse, backend=backend ) # Find length of the new (dense) tags array of the inner union From 8ff8548b11d018b9bc0d873c5d89583ae07607b5 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 13 Feb 2023 18:07:05 +0000 Subject: [PATCH 09/12] fix: use of `len(index)` --- src/awkward/operations/ak_merge_union_of_records.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 397872699c..159190cfdd 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -130,10 +130,10 @@ def compact_option_index(index: ArrayLike, *, backend) -> ArrayLike: # This is in trivial order: the re-arranging is done by the union (below) is_none = index < 0 num_none = backend.index_nplike.count_nonzero(is_none) - dense_index = backend.index_nplike.empty(len(index), dtype=index.dtype) + dense_index = backend.index_nplike.empty(index.size, dtype=index.dtype) dense_index[is_none] = -1 dense_index[~is_none] = backend.index_nplike.arange( - len(index) - num_none, + index.size - num_none, dtype=index.dtype, ) return dense_index From d2e22d965cb3ce25ad6409e2b012f31493e78a02 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 14 Feb 2023 08:29:42 +0000 Subject: [PATCH 10/12] refactor: drop need to compute length --- src/awkward/operations/ak_merge_union_of_records.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 159190cfdd..b03a4e573f 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -197,15 +197,10 @@ def apply(layout, depth, backend, **kwargs): next_tags_sparse, backend=backend ) - # Find length of the new (dense) tags array of the inner union - total_length = 0 - for array in inner_union_index_parts: - total_length += array.size - # Ignore missing items for inner union, creating a dense array of tags next_tags = next_tags_sparse[next_tags_sparse >= 0] # Build dense index from parts for each tag - next_index = backend.index_nplike.empty(total_length, dtype=np.int64) + next_index = backend.index_nplike.empty(next_tags.size, dtype=np.int64) for tag, content_index in enumerate(inner_union_index_parts): next_index[next_tags == tag] = content_index From 9cd2fdfa3e103e2ed48d8af98ab6a66280349def Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 14 Feb 2023 08:31:07 +0000 Subject: [PATCH 11/12] fix: invert indexing order --- src/awkward/operations/ak_merge_union_of_records.py | 4 ++-- tests/test_2236_merge_union_of_records_option.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index b03a4e573f..5f8fd663f5 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -228,8 +228,8 @@ def apply(layout, depth, backend, **kwargs): # Rewrite union index of indexed types if content.is_indexed: content_index = backend.index_nplike.asarray(content.index) - next_index[is_this_tag] = current_index[is_this_tag][ - content_index + next_index[is_this_tag] = content_index[ + current_index[is_this_tag] ] next_contents.append(content.content) diff --git a/tests/test_2236_merge_union_of_records_option.py b/tests/test_2236_merge_union_of_records_option.py index d7d068351a..3ae0d34fac 100644 --- a/tests/test_2236_merge_union_of_records_option.py +++ b/tests/test_2236_merge_union_of_records_option.py @@ -8,7 +8,7 @@ def test_indexed(): x = ak.to_layout([{"a": 1, "b": 2}]) y = ak.contents.IndexedArray( - ak.index.Index64([0]), ak.to_layout([{"c": 3, "b": 5}]) + ak.index.Index64([1]), ak.to_layout([{"c": 13, "b": 15}, {"c": 3, "b": 5}]) ) z = ak.concatenate((x, y)) From f384f765324627ab9d5b6a171e6f10dd496cdd1b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 14 Feb 2023 08:35:38 +0000 Subject: [PATCH 12/12] doc: describe behaviour of options --- src/awkward/operations/ak_merge_union_of_records.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/awkward/operations/ak_merge_union_of_records.py b/src/awkward/operations/ak_merge_union_of_records.py index 5f8fd663f5..a4c1673fbc 100644 --- a/src/awkward/operations/ak_merge_union_of_records.py +++ b/src/awkward/operations/ak_merge_union_of_records.py @@ -24,11 +24,21 @@ def merge_union_of_records(array, axis=-1, *, highlevel=True, behavior=None): Simplifies unions of records, e.g. >>> array = ak.concatenate(([{"a": 1}], [{"b": 2}])) + >>> array + into records of options, i.e. >>> ak.merge_union_of_records(array) + + Missing records are preserved in the result, e.g. + + >>> array = ak.concatenate(([{"a": 1}], [{"b": 2}, None])) + >>> array + + >>> ak.merge_union_of_records(array) + """ with ak._errors.OperationErrorContext( "ak.merge_union_of_records",