Attempt to fix #7703 (#7706)

Merged · 9 commits · Aug 1, 2023

src/autoschedulers/anderson2021/GPULoopInfo.cpp (8 changes: 4 additions & 4 deletions)

@@ -88,18 +88,18 @@ int64_t GPULoopInfo::get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const {
     return extents;
 }
 
-std::unique_ptr<ThreadInfo> GPULoopInfo::create_thread_info() {
+const ThreadInfo *GPULoopInfo::create_thread_info() {
     internal_assert(at_or_inside_block());
     internal_assert(at_or_inside_thread());
     internal_assert(thread_info == nullptr) << "create_thread_info() should not be called twice";
 
     auto max_thread_counts = current_block_loop->get_union_thread_counts(nullptr);
-    std::unique_ptr<ThreadInfo> new_thread_info = std::make_unique<ThreadInfo>(
+    thread_info = std::make_shared<const ThreadInfo>(
         current_thread_loop->vectorized_loop_index,
         current_thread_loop->size,
         current_thread_loop->stage->loop,
         max_thread_counts);
-    thread_info = new_thread_info.get();
-    return new_thread_info;
+    return thread_info.get();
 }
 
 }  // namespace Autoscheduler
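
The net effect of this change is a create-once, read-many contract: create_thread_info() may be called at most once per GPULoopInfo, the GPULoopInfo itself now owns the ThreadInfo, and callers only ever receive a non-owning raw pointer. Below is a minimal standalone sketch of that contract; ThreadInfoStub, LoopInfoStub, and num_threads are hypothetical stand-ins, not the actual Halide classes.

#include <cassert>
#include <memory>

// Stand-in for Autoscheduler::ThreadInfo (hypothetical stub).
struct ThreadInfoStub {
    int num_threads = 32;
};

// Stand-in for Autoscheduler::GPULoopInfo (hypothetical stub).
struct LoopInfoStub {
    // Create exactly once; asserts if called twice, mirroring create_thread_info().
    const ThreadInfoStub *create_thread_info() {
        assert(thread_info == nullptr && "create_thread_info() should not be called twice");
        thread_info = std::make_shared<const ThreadInfoStub>();
        return thread_info.get();
    }

    // Returns nullptr until create_thread_info() has been called,
    // mirroring get_thread_info().
    const ThreadInfoStub *get_thread_info() const {
        return thread_info.get();
    }

private:
    std::shared_ptr<const ThreadInfoStub> thread_info;
};

int main() {
    LoopInfoStub info;
    assert(info.get_thread_info() == nullptr);  // legal before creation: null, no assert
    (void)info.create_thread_info();            // create exactly once
    assert(info.get_thread_info()->num_threads == 32);
    return 0;
}

Returning a raw pointer rather than the shared_ptr keeps the ownership details private while leaving existing call sites unchanged.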

src/autoschedulers/anderson2021/GPULoopInfo.h (18 changes: 16 additions & 2 deletions)

@@ -7,6 +7,7 @@
  * hierarchy of blocks, threads, etc. Useful when computing GPU features
  */
 
+#include <memory>
 #include <vector>
 
 #include "Halide.h"
@@ -30,7 +31,6 @@ struct GPULoopInfo {
     int64_t num_blocks = 1;
     int64_t total_outer_serial_extents = 1;
     int64_t total_inner_serial_extents = 1;
-    const ThreadInfo *thread_info = nullptr;
 
     void update(const Target &target, const LoopNest *loop);
@@ -42,9 +42,23 @@ struct GPULoopInfo {
     std::vector<int64_t> get_inner_serial_loop_extents(const LoopNest *loop_nest) const;
 
-    std::unique_ptr<ThreadInfo> create_thread_info();
+    // assert-fails if create_thread_info() has *already* been called.
+    const ThreadInfo *create_thread_info();
+
+    // Note: if create_thread_info() has not been called yet, this will return nullptr.
+    // (Note that this is an unusual but legitimate situation, so it should *not*
+    // assert-fail if the value is null.)
+    const ThreadInfo *get_thread_info() const {
+        return thread_info.get();
+    }
 
     int64_t get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const;
 
+private:
+    // This is a shared_ptr mainly to allow for an automatic copy ctor to be generated --
+    // it's shared between different GPULoopInfo instances, but that is never visible to
+    // the outside world.
+    std::shared_ptr<const ThreadInfo> thread_info;
 };
 
 }  // namespace Autoscheduler
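
The comment on the new private member carries the central design decision: a std::unique_ptr member would implicitly delete GPULoopInfo's compiler-generated copy constructor, while a std::shared_ptr<const ThreadInfo> member keeps the struct copyable by value, with every copy observing the same immutable ThreadInfo. The following self-contained illustration uses only standard C++; Info, WithUnique, and WithShared are made-up names for the sketch.

#include <memory>

struct Info {
    int num_threads = 32;
};

// A unique_ptr member implicitly deletes the copy constructor.
struct WithUnique {
    std::unique_ptr<Info> p;
};

// A shared_ptr member keeps the compiler-generated copy constructor,
// and all copies share one immutable Info.
struct WithShared {
    std::shared_ptr<const Info> p;
};

int main() {
    WithShared a;
    a.p = std::make_shared<const Info>();
    WithShared b = a;  // OK: compiler-generated copy ctor; b.p and a.p alias
    // WithUnique u;
    // WithUnique v = u;  // error: use of deleted copy constructor
    return (b.p == a.p) ? 0 : 1;
}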

src/autoschedulers/anderson2021/LoopNest.cpp (30 changes: 14 additions & 16 deletions)

@@ -843,8 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
         return;
     }
 
-    internal_assert(gpu_loop_info.thread_info != nullptr);
-    const ThreadInfo *thread_info = gpu_loop_info.thread_info;
+    internal_assert(gpu_loop_info.get_thread_info() != nullptr);
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
     bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;
 
     size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
@@ -1243,7 +1243,7 @@ bool LoopNest::has_thread_loop_descendant() const {
 }
 
 void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const {
-    const ThreadInfo *thread_info = gpu_loop_info.thread_info;
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
     features.warp_lane_utilization = thread_info->warp_lane_utilization();
     features.num_active_warps_per_block = thread_info->num_active_warps_per_block;
     features.idle_lane_wastage = thread_info->idle_lane_wastage();
@@ -1267,7 +1267,9 @@ void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params &params
     auto active_block_hardware_limit = get_active_block_hardware_limit(params);
     auto active_warp_hardware_limit = get_active_warp_hardware_limit(params);
 
-    int64_t num_warps_per_block = gpu_loop_info.thread_info->num_warps_per_block;
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
+    internal_assert(thread_info != nullptr);
+    int64_t num_warps_per_block = thread_info->num_warps_per_block;
 
     int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)params.parallelism);
@@ -1713,10 +1715,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                                 bool verbose) const {
 
     gpu_loop_info.update(target, this);
-    std::unique_ptr<ThreadInfo> thread_info;
 
     if (is_gpu_thread(target)) {
-        thread_info = gpu_loop_info.create_thread_info();
+        (void)gpu_loop_info.create_thread_info();
     }
 
     int64_t working_set_here = 0;
@@ -1843,7 +1844,6 @@ void LoopNest::compute_features(const FunctionDAG &dag,
         }
 
         c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose);
-
        if (use_memoized_features) {
             c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id);
             c->memoize_features(c->features[hash_of_producers], features);
@@ -2363,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                     e->producer,
                     producer_store_bounds,
                     producer_has_been_scheduled,
-                    gpu_loop_info.thread_info,
+                    gpu_loop_info.get_thread_info(),
                     shared_mem_loads,
                     points_accessed,
                     verbose);
@@ -2394,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                     e->producer,
                     producer_store_bounds,
                     producer_has_been_scheduled,
-                    gpu_loop_info.thread_info,
+                    gpu_loop_info.get_thread_info(),
                     global_mem_loads,
                     points_accessed,
                     verbose);
@@ -2434,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                     e->producer,
                     producer_store_bounds,
                     producer_has_been_scheduled,
-                    gpu_loop_info.thread_info,
+                    gpu_loop_info.get_thread_info(),
                     local_mem_loads,
                     points_accessed,
                     verbose);
@@ -2707,7 +2707,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
             inlined_feat.outer_parallelism = parallelism;
             inlined_feat.num_blocks = parallelism;
 
-            internal_assert(is_scalar() || gpu_loop_info.thread_info);
+            internal_assert(is_scalar() || gpu_loop_info.get_thread_info());
 
             auto num_warps_per_block = it.value();
             auto num_threads_per_block = 1;
@@ -2716,8 +2716,8 @@ void LoopNest::compute_features(const FunctionDAG &dag,
             // be surrounded by block/thread/serial loops so there's no need to take
            // them into account when computing these features
            if (!is_scalar()) {
-                num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
-                num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+                num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.get_thread_info()->num_warps_per_block * inlined_feat.num_blocks;
+                num_threads_per_block = gpu_loop_info.get_thread_info()->num_threads;
             }
             inlined_feat.num_warps_per_block += num_warps_per_block;
             inlined_feat.num_threads_per_block += num_threads_per_block;
@@ -4028,11 +4028,9 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const No
 
 double LoopNest::max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const {
     gpu_loop_info.update(target, this);
-    std::unique_ptr<ThreadInfo> thread_info;
 
     if (is_gpu_thread(target)) {
-        thread_info = gpu_loop_info.create_thread_info();
-
+        const ThreadInfo *thread_info = gpu_loop_info.create_thread_info();
         return thread_info->idle_lane_wastage();
     }
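
One caller that depends on that copyability is max_idle_lane_wastage() above, which takes its GPULoopInfo parameter by value: each call mutates a private copy via update() and create_thread_info(), leaving the caller's object untouched. An illustrative sketch under the same stub-type assumptions as before (not the real LoopNest/GPULoopInfo API):

#include <cassert>
#include <memory>

// Hypothetical stubs again; not the Halide classes.
struct ThreadInfoStub {
    double idle_lane_wastage() const {
        return 0.25;  // placeholder value
    }
};

struct LoopInfoStub {
    const ThreadInfoStub *create_thread_info() {
        assert(thread_info == nullptr);
        thread_info = std::make_shared<const ThreadInfoStub>();
        return thread_info.get();
    }

private:
    std::shared_ptr<const ThreadInfoStub> thread_info;
};

// Analogous to LoopNest::max_idle_lane_wastage(): the by-value parameter is
// a private copy, so creating a ThreadInfo here never mutates the caller's object.
double max_idle_lane_wastage_stub(LoopInfoStub info) {
    return info.create_thread_info()->idle_lane_wastage();
}

int main() {
    LoopInfoStub caller_copy;
    double w = max_idle_lane_wastage_stub(caller_copy);
    (void)caller_copy.create_thread_info();  // still fine: the caller's copy was untouched
    return (w == 0.25) ? 0 : 1;
}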

src/autoschedulers/anderson2021/SearchSpace.cpp (2 changes: 1 addition & 1 deletion)

@@ -353,7 +353,7 @@ void SearchSpace::generate_children(const IntrusivePtr<State> &state,
             new_root->copy_from(*root);
             const auto &nodes = compute_root_nodes.get(node);
             for (const auto &n : nodes) {
-                const auto *compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{});
+                const auto *compute_root_loop = deep_copy_loop_nest(n, NoOpMutator{});
                 new_root->children.emplace_back(compute_root_loop);
             }
             new_root->store_at.insert(node);