More sort and finalize fixes #1799

Open: wants to merge 72 commits into base: master
Changes from 60 commits
65d151a
Fix having empty dataframe in staged writes
Aug 14, 2024
ddf9a96
Throw exception when there is an empty staging area
Aug 14, 2024
2bede7c
Fix failing test
Aug 14, 2024
07c2dd1
Fix compilation errors
Aug 14, 2024
19250a1
Apply fixes for empty dfs in staged writes with vanila finalize
Aug 15, 2024
3f85d61
Address review comments
Aug 16, 2024
10cf515
Unify finalize methods (#1767)
vasil-pashov Aug 20, 2024
8db2b78
Add comments for unreachable code as per review request
Aug 20, 2024
ff546ee
Add comments as per review request
Aug 21, 2024
39baaf2
Throw when trying to compact unordered incomplete segments with final…
Aug 21, 2024
583ef39
More fixes for sort_and_finalize
Aug 22, 2024
546a860
Fixes for sort and finalize
Aug 29, 2024
585cb07
Add tests for schema mismatch in finalize_staged_data
Aug 29, 2024
4da043a
Merge branch 'master' into sort-and-finalize-sorting
Aug 29, 2024
e5fdb1c
Fix errors from merge commit
Aug 29, 2024
e621b9f
Fix CI compilation error
Aug 29, 2024
2aa8497
Fix compilation errors in tests
Aug 29, 2024
9ec7fca
Fix failing C++ tests
Aug 30, 2024
f672906
Fix compilation errors
Aug 30, 2024
4f0d36b
Merge branch 'master' into sort-and-finalize-sorting
Aug 30, 2024
6b09f3c
Fix failing c++ tests
Aug 30, 2024
91cca50
Add fixes for dynamic schema and staged writes
Aug 30, 2024
3405b3a
Fixing failing tests
Sep 1, 2024
32bfcc7
* Make it possible to have columns in the staged segments with common…
Sep 2, 2024
2462d1f
Fix C++ tests
Sep 3, 2024
176a102
Fix index end when there are no rows
Sep 3, 2024
60cd544
Strict static schema unit tests for sort and finalize
Sep 4, 2024
7dec6c8
Fix static schema hypothesis test
Sep 4, 2024
57005f7
Add dynamic schema testing
Sep 4, 2024
a6f0618
Merge branch 'master' into more-sort-and-finalize-fixes
vasil-pashov Sep 4, 2024
befbe67
Fix tests
Sep 4, 2024
e2f57a9
Fix tests
Sep 5, 2024
ca12dc7
Fix int checking in hypothesis
Sep 5, 2024
e7f86de
Fix a dynamic schema bug
Sep 9, 2024
d18d9cb
Bring back validate_index for incompletes
Sep 9, 2024
5c11843
Add assertion for null value reducer when a column cannot be found in…
Sep 9, 2024
a21c8b3
Use std::adjacent_find to compare all staged stream descriptors are t…
Sep 9, 2024
7a1d888
Earlier assert that staged stream descriptor is the same as the one o…
Sep 9, 2024
ba58f6e
Comment the return value of read_incompletes_to_pipeline
Sep 9, 2024
c159946
Remove check_incomplete_descriptor_match which is redundant
Sep 9, 2024
bd02dcc
Use internal::check instead of util::check
Sep 9, 2024
1db047e
Rework incomplete keys RAII
Sep 9, 2024
9ffc788
Remove checks which are no longer needed
Sep 9, 2024
fc39ccc
Merge branch 'master' into sort-and-finalize-sorting
Sep 9, 2024
9ebe329
Bring back the check if !segments.empty()
Sep 10, 2024
b8420ad
Move stream descriptor far out of for loop in merge.hpp
Sep 10, 2024
2cd017b
Generate 4 cols in test_write_parallel_sort_merge
Sep 10, 2024
2a09829
Check values in test_repeating_index_values
Sep 10, 2024
14ab952
Fix duplicated col_1 in test_appending_reordered_column_set_throws
Sep 10, 2024
819d2a5
Fix typo in test case name test_type_mismatch_in_staged_segments_thro…
Sep 10, 2024
ddb83a6
Fix duplicated col_1 in test_staged_segments_cant_be_reordered
Sep 10, 2024
f7c7746
Explicit dtypes for all DFs in TestStreamDescriptorMismatchOnFinalize…
Sep 10, 2024
0336091
Move test_two_columns_with_different_dtypes to nonreg tests
Sep 10, 2024
bff0fdd
More test cases for NaT values
Sep 10, 2024
7b4ae50
Type promotion checks for sort merge and dynamic schema
Sep 10, 2024
1d7e76d
Move v1 TestFinalizeStagedDataStaticSchemaMismatch to test_parallel.py
Sep 10, 2024
495ec6d
Move TestFinalizeWithEmptySegments for v1 API in test_parallel.py
Sep 10, 2024
aae39b2
Remove has_common_valid_type from merge function
Sep 11, 2024
215cd3b
Simplify merge types checks ad fix date_and_time
Sep 11, 2024
98aef2b
Rework hypothesis tests for sort and finalize. TBD: dynnamic schema a…
Sep 12, 2024
5b98440
Rework dynamic schema sort and finalize append tests
Sep 13, 2024
5ef9514
Fix hypothesis tests
Sep 13, 2024
beac0bb
Fix merge clause test
Sep 13, 2024
d30124c
Fix a typo
Sep 13, 2024
575c07e
Allow for UNKNOWN sorted value in do_compact
Sep 16, 2024
5edea0e
Move no append keys assertion in the funcitons that assert errors to …
Sep 17, 2024
8cd0ac3
Use ScalarTypeInfo
Sep 17, 2024
d76e0e3
Fix test name
Sep 17, 2024
82cc9e9
Fix compilation errors on linux
Sep 17, 2024
006f78a
Remove empty columns from staged segments instead of filling them with
Sep 18, 2024
cef88e4
Do not try dropping empty colummns when static schema is used in merg…
Sep 19, 2024
8999834
Merge branch 'master' into more-sort-and-finalize-fixes
vasil-pashov Sep 19, 2024
1 change: 1 addition & 0 deletions cpp/arcticdb/CMakeLists.txt
@@ -491,6 +491,7 @@ set(arcticdb_srcs
util/offset_string.cpp
util/sparse_utils.cpp
util/string_utils.cpp
util/timer.cpp
util/trace.cpp
util/type_handler.cpp
version/local_versioned_engine.cpp
11 changes: 8 additions & 3 deletions cpp/arcticdb/column_store/column.hpp
@@ -277,10 +277,15 @@ class Column {
return TypedColumnIterator<TagType, const RawType>(*this, false);
}

template<class T, std::enable_if_t<std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
template<typename T>
requires std::integral<T> || std::floating_point<T>
void set_scalar(ssize_t row_offset, T val) {
util::check(sizeof(T) == get_type_size(type_.data_type()), "Type mismatch in set_scalar, expected {}",
get_type_size(type_.data_type()));
util::check(
sizeof(T) == get_type_size(type_.data_type()),
"Type mismatch in set_scalar, expected {} byte scalar got {} byte scalar",
get_type_size(type_.data_type()),
sizeof(T)
);

auto prev_logical_row = last_logical_row_;
last_logical_row_ = row_offset;
8 changes: 5 additions & 3 deletions cpp/arcticdb/column_store/memory_segment.hpp
@@ -74,13 +74,15 @@ class SegmentInMemory {
impl_->end_row();
}

template<class T, std::enable_if_t<std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
template<typename T>
requires std::integral<T> || std::floating_point<T>
void set_scalar(position_t idx, T val) {
impl_->set_scalar(idx, val);
}

template<class T, std::enable_if_t<std::is_same_v<std::decay_t<T>, std::string>, int> = 0>
void set_scalar(position_t idx, T val) {
template<typename T>
requires std::same_as<std::decay_t<T>, std::string>
void set_scalar(position_t idx, const T& val) {
impl_->set_string(idx, val);
}

48 changes: 25 additions & 23 deletions cpp/arcticdb/column_store/memory_segment_impl.hpp
@@ -67,31 +67,31 @@ class SegmentInMemoryImpl {

template<class Callable>
auto visit(Callable &&c) const {
return entity::visit_field(parent_->descriptor().field(column_id_), [that=this, c=std::forward<Callable>(c)](auto type_desc_tag) {
return entity::visit_field(parent_->descriptor().field(column_id_), [this, c=std::forward<Callable>(c)](auto type_desc_tag) {
using RawType = typename std::decay_t<decltype(type_desc_tag)>::DataTypeTag::raw_type;
return c(that->parent_->scalar_at<RawType>(that->row_id_, that->column_id_));
return c(parent_->scalar_at<RawType>(row_id_, column_id_));
});
}

template<class Callable>
auto visit_string(Callable &&c) const {
return entity::visit_field(parent_->descriptor().field(column_id_), [that=this, c = std::forward<Callable>(c)](auto type_desc_tag) {
return entity::visit_field(parent_->descriptor().field(column_id_), [this, c = std::forward<Callable>(c)](auto type_desc_tag) {
using DTT = typename std::decay_t<decltype(type_desc_tag)>::DataTypeTag;
if constexpr(is_sequence_type(DTT::data_type))
return c(that->parent_->string_at(that->row_id_, position_t(that->column_id_)));
return c(parent_->string_at(row_id_, position_t(column_id_)));
});
}

template<class Callable>
auto visit_field(Callable &&c) const {
const auto& field = parent_->descriptor().field(column_id_);
return entity::visit_field(parent_->descriptor().field(column_id_), [&field, that=this, c = std::forward<Callable>(c)](auto type_desc_tag) {
return entity::visit_field(field, [&field, this, c = std::forward<Callable>(c)](auto type_desc_tag) {
using DataTypeTag = typename std::decay_t<decltype(type_desc_tag)>::DataTypeTag;
using RawType = typename DataTypeTag::raw_type;
if constexpr (is_sequence_type(DataTypeTag::data_type))
return c(that->parent_->string_at(that->row_id_, position_t(that->column_id_)), std::string_view{field.name()}, field.type());
return c(parent_->string_at(row_id_, position_t(column_id_)), std::string_view{field.name()}, type_desc_tag);
else if constexpr (is_numeric_type(DataTypeTag::data_type) || is_bool_type(DataTypeTag::data_type))
return c(that->parent_->scalar_at<RawType>(that->row_id_, that->column_id_), std::string_view{field.name()}, field.type());
return c(parent_->scalar_at<RawType>(row_id_, column_id_), std::string_view{field.name()}, type_desc_tag);
else if constexpr(is_empty_type(DataTypeTag::data_type))
internal::raise<ErrorCode::E_ASSERTION_FAILURE>("visit_field does not support empty-type columns");
else
@@ -101,9 +101,9 @@ class SegmentInMemoryImpl {

template<class Callable>
auto visit(Callable &&c) {
return entity::visit_field(parent_->descriptor().field(column_id_), [that=this, c=std::forward<Callable>(c)](auto type_desc_tag) {
return entity::visit_field(parent_->descriptor().field(column_id_), [this, c=std::forward<Callable>(c)](auto type_desc_tag) {
using RawType = typename std::decay_t<decltype(type_desc_tag)>::DataTypeTag::raw_type;
return c(that->parent_->reference_at<RawType>(that->row_id_, that->column_id_));
return c(parent_->reference_at<RawType>(row_id_, column_id_));
});
}

@@ -454,18 +454,21 @@ class SegmentInMemoryImpl {
});
}

template<class T, std::enable_if_t<std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
void set_scalar(position_t idx, T val) {
template<class T>
requires std::integral<T> || std::floating_point<T>
void set_scalar(position_t idx, T val) {
ARCTICDB_TRACE(log::version(), "Segment setting scalar {} at row {} column {}", val, row_id_ + 1, idx);
column(idx).set_scalar(row_id_ + 1, val);
}

template<class T, std::enable_if_t<std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
void set_external_block(position_t idx, T *val, size_t size) {
template<class T>
requires std::integral<T> || std::floating_point<T>
void set_external_block(position_t idx, T *val, size_t size) {
column_unchecked(idx).set_external_block(row_id_ + 1, val, size);
}

template<class T, std::enable_if_t<std::is_integral_v<T> || std::is_floating_point_v<T>, int> = 0>
template<class T>
requires std::integral<T> || std::floating_point<T>
void set_sparse_block(position_t idx, T *val, size_t rows_to_write) {
column_unchecked(idx).set_sparse_block(row_id_ + 1, val, rows_to_write);
}
@@ -478,23 +478,22 @@ class SegmentInMemoryImpl {
column_unchecked(idx).set_sparse_block(std::move(buffer), std::move(shapes), std::move(bitset));
}

template<class T, std::enable_if_t<std::is_same_v<std::decay_t<T>, std::string>, int> = 0>
void set_scalar(position_t idx, T val) {
template<class T>
requires std::same_as<std::decay_t<T>, std::string>
void set_scalar(position_t idx, const T& val) {
set_string(idx, val);
}

template<class T, template<class> class Tensor, std::enable_if_t<
std::is_integral_v<T> || std::is_floating_point_v<T>,
int> = 0>
void set_array(position_t pos, Tensor<T> &val) {
template<class T, template<class> class Tensor>
requires std::integral<T> || std::floating_point<T>
void set_array(position_t pos, Tensor<T> &val) {
magic_.check();
ARCTICDB_SAMPLE(MemorySegmentSetArray, 0)
column_unchecked(pos).set_array(row_id_ + 1, val);
}

template<class T, std::enable_if_t<
std::is_integral_v<T> || std::is_floating_point_v<T>,
int> = 0>
template<class T>
requires std::integral<T> || std::floating_point<T>
void set_array(position_t pos, py::array_t<T>& val) {
magic_.check();
ARCTICDB_SAMPLE(MemorySegmentSetArray, 0)
21 changes: 19 additions & 2 deletions cpp/arcticdb/entity/merge_descriptors.cpp
@@ -13,7 +13,7 @@
namespace arcticdb {
StreamDescriptor merge_descriptors(
const StreamDescriptor &original,
const std::vector<std::shared_ptr<FieldCollection>> &entries,
std::span<const std::shared_ptr<FieldCollection>> entries,
const std::unordered_set<std::string_view> &filtered_set,
const std::optional<IndexDescriptorImpl>& default_index) {
using namespace arcticdb::stream;
@@ -34,6 +34,7 @@ StreamDescriptor merge_descriptors(
merged_fields.emplace_back(idx.name());
merged_fields_map.try_emplace(idx.name(), TypeDescriptor{typename IndexType::TypeDescTag{}});
});
index = default_index_type_from_descriptor(*default_index);
} else {
util::raise_rte("Descriptor has uninitialized index and no default supplied");
}
@@ -71,7 +72,12 @@ StreamDescriptor merge_descriptors(
if(new_descriptor) {
merged_fields_map[field.name()] = *new_descriptor;
} else {
util::raise_rte("No valid common type between {} and {} for column {}", existing_type_desc, type_desc, field.name());
schema::raise<ErrorCode::E_DESCRIPTOR_MISMATCH>(
"No valid common type between {} and {} for column {}",
existing_type_desc,
type_desc,
field.name()
);
}
}
} else {
@@ -99,6 +105,17 @@ StreamDescriptor merge_descriptors(
return merge_descriptors(original, entries, filtered_set, default_index);
}

StreamDescriptor merge_descriptors(
const StreamDescriptor& original,
std::span<const std::shared_ptr<FieldCollection>> entries,
const std::optional<std::vector<std::string>>& filtered_columns,
const std::optional<IndexDescriptorImpl>& default_index) {
std::unordered_set<std::string_view> filtered_set = filtered_columns.has_value()
? std::unordered_set<std::string_view>(filtered_columns->begin(), filtered_columns->end())
: std::unordered_set<std::string_view>{};
return merge_descriptors(original, entries, filtered_set, default_index);
}

StreamDescriptor merge_descriptors(
const StreamDescriptor &original,
const std::vector<pipelines::SliceAndKey> &entries,
9 changes: 8 additions & 1 deletion cpp/arcticdb/entity/merge_descriptors.hpp
@@ -7,11 +7,12 @@

#include <arcticdb/entity/stream_descriptor.hpp>
#include <arcticdb/pipeline/frame_slice.hpp>
#include <span>

namespace arcticdb {
StreamDescriptor merge_descriptors(
const StreamDescriptor &original,
const std::vector<std::shared_ptr<FieldCollection>> &entries,
std::span<const std::shared_ptr<FieldCollection>> entries,
const std::unordered_set<std::string_view> &filtered_set,
const std::optional<IndexDescriptorImpl>& default_index);

@@ -21,6 +22,12 @@ entity::StreamDescriptor merge_descriptors(
const std::optional<std::vector<std::string>> &filtered_columns,
const std::optional<entity::IndexDescriptorImpl>& default_index = std::nullopt);

entity::StreamDescriptor merge_descriptors(
const entity::StreamDescriptor& original,
std::span<const std::shared_ptr<FieldCollection>> entries,
const std::optional<std::vector<std::string>>& filtered_columns,
const std::optional<entity::IndexDescriptorImpl>& default_index = std::nullopt);

entity::StreamDescriptor merge_descriptors(
const entity::StreamDescriptor &original,
const std::vector<pipelines::SliceAndKey> &entries,
4 changes: 2 additions & 2 deletions cpp/arcticdb/entity/types.hpp
@@ -441,11 +441,11 @@ struct TypeDescriptor {
template<typename Callable>
constexpr auto visit_tag(Callable &&callable) const;

bool operator==(const TypeDescriptor &o) const {
[[nodiscard]] constexpr bool operator==(const TypeDescriptor& o) const {
return data_type_ == o.data_type_ && dimension_ == o.dimension_;
}

bool operator!=(const TypeDescriptor &o) const {
[[nodiscard]] constexpr bool operator!=(const TypeDescriptor& o) const {
return !(*this == o);
}

3 changes: 2 additions & 1 deletion cpp/arcticdb/pipeline/index_utils.cpp
@@ -92,9 +92,10 @@ TimeseriesDescriptor get_merged_tsd(
}
else if (dynamic_schema) {
// In case of dynamic schema
const std::array fields_ptr = {new_frame->desc.fields_ptr()};
merged_descriptor = merge_descriptors(
existing_descriptor,
std::vector<std::shared_ptr<FieldCollection>>{new_frame->desc.fields_ptr()},
fields_ptr,
{}
);
} else {
4 changes: 4 additions & 0 deletions cpp/arcticdb/pipeline/pipeline_context.hpp
@@ -108,6 +108,9 @@ struct PipelineContext : public std::enable_shared_from_this<PipelineContext> {
// written in, desc_ will be modified such that the return matches what's requested, and this'll be set to the
// original value. It's only set in this edge case.
std::optional<StreamDescriptor> orig_desc_;
// When there are staged segments this holds the combined stream descriptor for all of them.
// It can differ from desc_ when dynamic schema is used; otherwise the two must be the same.
std::optional<StreamDescriptor> staged_descriptor_;
StreamId stream_id_;
VersionId version_id_ = 0;
size_t total_rows_ = 0;
@@ -200,6 +203,7 @@ struct PipelineContext : public std::enable_shared_from_this<PipelineContext> {
swap(left.segment_descriptors_, right.segment_descriptors_);
swap(left.filter_columns_set_, right.filter_columns_set_);
swap(left.compacted_, right.compacted_);
swap(left.staged_descriptor_, right.staged_descriptor_);
}

using iterator = PipelineContextIterator<PipelineContextRow>;
4 changes: 3 additions & 1 deletion cpp/arcticdb/pipeline/read_frame.cpp
@@ -601,7 +601,7 @@ struct ReduceColumnTask : async::BaseTask {
} else {
column.default_initialize_rows(0, frame_.row_count(), false);
}
} else {
} else if (column_data != slice_map_->columns_.end()) {
if(dynamic_schema_) {
NullValueReducer null_reducer{column, context_, frame_, shared_data_, handler_data_};
for (const auto &row : column_data->second) {
@@ -623,6 +623,8 @@ struct ReduceColumnTask : async::BaseTask {

column.set_inflated(frame_.row_count());
}
} else if (!dynamic_schema_ && column_data == slice_map_->columns_.end() && is_sequence_type(column.type().data_type())) {
internal::raise<ErrorCode::E_ASSERTION_FAILURE>("Column with index {} is not in static schema slice map.", column_index_);
}
return folly::Unit{};
}