Attempt to fix halide#7703 (halide#7706)
* Attempt to fix halide#7703

* fixes

* Update LoopNest.cpp

* Update GPULoopInfo.h

* Fixes.

* clang-tidy
steven-johnson authored and ardier committed Mar 3, 2024
1 parent 6ac92a7 commit 6a33c9b
Showing 4 changed files with 35 additions and 23 deletions.
src/autoschedulers/anderson2021/GPULoopInfo.cpp (4 additions & 4 deletions)
@@ -88,18 +88,18 @@ int64_t GPULoopInfo::get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const {
     return extents;
 }
 
-std::unique_ptr<ThreadInfo> GPULoopInfo::create_thread_info() {
+const ThreadInfo *GPULoopInfo::create_thread_info() {
     internal_assert(at_or_inside_block());
     internal_assert(at_or_inside_thread());
     internal_assert(thread_info == nullptr) << "create_thread_info() should not be called twice";
 
     auto max_thread_counts = current_block_loop->get_union_thread_counts(nullptr);
-    std::unique_ptr<ThreadInfo> new_thread_info = std::make_unique<ThreadInfo>(
+    thread_info = std::make_shared<const ThreadInfo>(
         current_thread_loop->vectorized_loop_index,
         current_thread_loop->size,
         current_thread_loop->stage->loop,
         max_thread_counts);
-    thread_info = new_thread_info.get();
-    return new_thread_info;
+    return thread_info.get();
 }
 
 } // namespace Autoscheduler
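The net effect on the API: create_thread_info() now stores the ThreadInfo inside the GPULoopInfo itself and returns a non-owning pointer, and a second call trips the internal_assert. A minimal sketch of the intended call pattern (the wrapper function below is illustrative, not repository code):

    void example_usage(GPULoopInfo &gpu_loop_info) {
        // First (and only) call: builds the ThreadInfo and stores it inside
        // gpu_loop_info; the returned pointer is non-owning.
        const ThreadInfo *ti = gpu_loop_info.create_thread_info();

        // Later code re-fetches the same pointer. This returns nullptr if
        // create_thread_info() was never called, which callers must tolerate.
        const ThreadInfo *same = gpu_loop_info.get_thread_info();
        internal_assert(ti == same);

        // A second create_thread_info() call would assert-fail:
        //     gpu_loop_info.create_thread_info();  // "should not be called twice"
    }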
src/autoschedulers/anderson2021/GPULoopInfo.h (16 additions & 2 deletions)
@@ -7,6 +7,7 @@
  * hierarchy of blocks, threads, etc. Useful when computing GPU features
  */
 
+#include <memory>
 #include <vector>
 
 #include "Halide.h"
@@ -30,7 +31,6 @@ struct GPULoopInfo {
     int64_t num_blocks = 1;
     int64_t total_outer_serial_extents = 1;
     int64_t total_inner_serial_extents = 1;
-    const ThreadInfo *thread_info = nullptr;
 
     void update(const Target &target, const LoopNest *loop);
 
@@ -42,9 +42,23 @@
 
     std::vector<int64_t> get_inner_serial_loop_extents(const LoopNest *loop_nest) const;
 
-    std::unique_ptr<ThreadInfo> create_thread_info();
+    // assert-fails if create_thread_info() has *already* been called.
+    const ThreadInfo *create_thread_info();
+
+    // Note: if create_thread_info() has not been called yet, this will return nullptr.
+    // (Note that this is an unusual but legitimate situation, so it should *not*
+    // assert-fail if the value is null.)
+    const ThreadInfo *get_thread_info() const {
+        return thread_info.get();
+    }
 
     int64_t get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const;
 
+private:
+    // This is a shared_ptr mainly to allow for an automatic copy ctor to be generated --
+    // it's shared between different GPULoopInfo instances, but that is never visible to
+    // the outside world.
+    std::shared_ptr<const ThreadInfo> thread_info;
 };
 
 } // namespace Autoscheduler
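The private member's comment is the heart of the fix: a shared_ptr member keeps the struct copyable, while a unique_ptr member would delete the implicit copy constructor and a raw pointer would leave copies pointing at storage they don't control. A self-contained sketch of that trade-off, with hypothetical names (nothing here is repository code):

    #include <cassert>
    #include <memory>

    struct Payload { int x = 42; };

    struct WithUnique {
        std::unique_ptr<const Payload> p;  // move-only: implicit copy ctor is deleted
    };

    struct WithShared {
        std::shared_ptr<const Payload> p;  // copyable: all copies co-own one Payload
    };

    int main() {
        WithShared a;
        a.p = std::make_shared<const Payload>();
        WithShared b = a;  // compiler-generated copy ctor works; b shares a's Payload
        assert(a.p == b.p);

        // WithUnique u;
        // WithUnique v = u;  // would not compile: unique_ptr is not copyable
        return 0;
    }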
src/autoschedulers/anderson2021/LoopNest.cpp (14 additions & 16 deletions)
@@ -843,8 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
         return;
     }
 
-    internal_assert(gpu_loop_info.thread_info != nullptr);
-    const ThreadInfo *thread_info = gpu_loop_info.thread_info;
+    internal_assert(gpu_loop_info.get_thread_info() != nullptr);
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
     bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;
 
     size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
@@ -1243,7 +1243,7 @@ bool LoopNest::has_thread_loop_descendant() const {
 }
 
 void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const {
-    const ThreadInfo *thread_info = gpu_loop_info.thread_info;
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
     features.warp_lane_utilization = thread_info->warp_lane_utilization();
     features.num_active_warps_per_block = thread_info->num_active_warps_per_block;
     features.idle_lane_wastage = thread_info->idle_lane_wastage();
@@ -1267,7 +1267,9 @@ void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params &params
     auto active_block_hardware_limit = get_active_block_hardware_limit(params);
     auto active_warp_hardware_limit = get_active_warp_hardware_limit(params);
 
-    int64_t num_warps_per_block = gpu_loop_info.thread_info->num_warps_per_block;
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
+    internal_assert(thread_info != nullptr);
+    int64_t num_warps_per_block = thread_info->num_warps_per_block;
 
     int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)params.parallelism);
@@ -1713,10 +1715,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                                 bool verbose) const {
 
     gpu_loop_info.update(target, this);
-    std::unique_ptr<ThreadInfo> thread_info;
 
     if (is_gpu_thread(target)) {
-        thread_info = gpu_loop_info.create_thread_info();
+        (void)gpu_loop_info.create_thread_info();
     }
 
     int64_t working_set_here = 0;
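The (void) cast in compute_features makes the discarded result explicit: the ThreadInfo now lives inside gpu_loop_info, so the returned pointer isn't needed at this call site. A tiny sketch of the idiom with a hypothetical function (create_thread_info itself isn't shown as [[nodiscard]] in this diff; the attribute below just makes the warning concrete):

    #include <memory>

    // Hypothetical stand-in: fills `slot` and also returns the raw pointer.
    [[nodiscard]] static const int *prepare(std::shared_ptr<const int> &slot) {
        slot = std::make_shared<const int>(7);
        return slot.get();
    }

    int main() {
        std::shared_ptr<const int> slot;
        (void)prepare(slot);  // cast to void: the side effect (filling `slot`)
                              // is the point; the pointer can be re-fetched later
        return (slot && *slot == 7) ? 0 : 1;
    }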
@@ -1843,7 +1844,6 @@ void LoopNest::compute_features(const FunctionDAG &dag,
         }
 
         c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose);
-
         if (use_memoized_features) {
             c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id);
             c->memoize_features(c->features[hash_of_producers], features);
@@ -2363,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                         e->producer,
                         producer_store_bounds,
                         producer_has_been_scheduled,
-                        gpu_loop_info.thread_info,
+                        gpu_loop_info.get_thread_info(),
                         shared_mem_loads,
                         points_accessed,
                         verbose);
@@ -2394,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                         e->producer,
                         producer_store_bounds,
                         producer_has_been_scheduled,
-                        gpu_loop_info.thread_info,
+                        gpu_loop_info.get_thread_info(),
                         global_mem_loads,
                         points_accessed,
                         verbose);
@@ -2434,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                         e->producer,
                         producer_store_bounds,
                         producer_has_been_scheduled,
-                        gpu_loop_info.thread_info,
+                        gpu_loop_info.get_thread_info(),
                         local_mem_loads,
                         points_accessed,
                         verbose);
@@ -2707,7 +2707,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
             inlined_feat.outer_parallelism = parallelism;
             inlined_feat.num_blocks = parallelism;
 
-            internal_assert(is_scalar() || gpu_loop_info.thread_info);
+            internal_assert(is_scalar() || gpu_loop_info.get_thread_info());
 
             auto num_warps_per_block = it.value();
             auto num_threads_per_block = 1;
@@ -2716,8 +2716,8 @@
             // be surrounded by block/thread/serial loops so there's no need to take
             // them into account when computing these features
             if (!is_scalar()) {
-                num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
-                num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+                num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.get_thread_info()->num_warps_per_block * inlined_feat.num_blocks;
+                num_threads_per_block = gpu_loop_info.get_thread_info()->num_threads;
             }
             inlined_feat.num_warps_per_block += num_warps_per_block;
             inlined_feat.num_threads_per_block += num_threads_per_block;
@@ -4028,11 +4028,9 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const No
 
 double LoopNest::max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const {
     gpu_loop_info.update(target, this);
-    std::unique_ptr<ThreadInfo> thread_info;
 
     if (is_gpu_thread(target)) {
-        thread_info = gpu_loop_info.create_thread_info();
-
+        const ThreadInfo *thread_info = gpu_loop_info.create_thread_info();
         return thread_info->idle_lane_wastage();
     }
 
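max_idle_lane_wastage still takes its GPULoopInfo by value, so every call copies the struct; the shared_ptr member is what makes that copy safe. A standalone sketch of the hazard class being closed off, with hypothetical names (not repository code):

    #include <cassert>
    #include <memory>

    struct ThreadInfoDemo { int num_threads = 32; };

    // Fragile shape: a raw, non-owning member pointing at caller-scoped storage.
    struct InfoRaw { const ThreadInfoDemo *ti = nullptr; };

    // Safe shape: the member co-owns the object, so copies can outlive the creator.
    struct InfoShared { std::shared_ptr<const ThreadInfoDemo> ti; };

    InfoRaw make_raw() {
        auto owner = std::make_unique<ThreadInfoDemo>();
        InfoRaw info;
        info.ti = owner.get();
        return info;  // BUG: `owner` dies here; info.ti dangles in the caller
    }

    InfoShared make_shared_info() {
        InfoShared info;
        info.ti = std::make_shared<const ThreadInfoDemo>();
        return info;  // fine: ownership travels with the copy
    }

    int main() {
        InfoShared ok = make_shared_info();
        assert(ok.ti->num_threads == 32);  // valid: shared ownership kept it alive

        InfoRaw bad = make_raw();
        (void)bad;  // dereferencing bad.ti here would be a use-after-free
        return 0;
    }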
src/autoschedulers/anderson2021/SearchSpace.cpp (1 addition & 1 deletion)
@@ -353,7 +353,7 @@ void SearchSpace::generate_children(const IntrusivePtr<State> &state,
         new_root->copy_from(*root);
         const auto &nodes = compute_root_nodes.get(node);
         for (const auto &n : nodes) {
-            const auto *compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{});
+            const auto *compute_root_loop = deep_copy_loop_nest(n, NoOpMutator{});
             new_root->children.emplace_back(compute_root_loop);
         }
         new_root->store_at.insert(node);
