Attempt to fix #7703 (#7706)

Merged · 9 commits · Aug 1, 2023

src/autoschedulers/anderson2021/GPULoopInfo.cpp (8 changes: 4 additions & 4 deletions)

@@ -88,18 +88,18 @@ int64_t GPULoopInfo::get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const {
     return extents;
 }
 
-std::unique_ptr<ThreadInfo> GPULoopInfo::create_thread_info() {
+const ThreadInfo *GPULoopInfo::create_thread_info() {
     internal_assert(at_or_inside_block());
     internal_assert(at_or_inside_thread());
     internal_assert(thread_info == nullptr) << "create_thread_info() should not be called twice";
 
     auto max_thread_counts = current_block_loop->get_union_thread_counts(nullptr);
-    std::unique_ptr<ThreadInfo> new_thread_info = std::make_unique<ThreadInfo>(
+    thread_info = std::make_shared<const ThreadInfo>(
         current_thread_loop->vectorized_loop_index,
         current_thread_loop->size,
         current_thread_loop->stage->loop,
         max_thread_counts);
-    thread_info = new_thread_info.get();
-    return new_thread_info;
+    return thread_info.get();
 }
 
 }  // namespace Autoscheduler
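
The net effect of this change is a create-once, read-many contract: create_thread_info() may be called at most once per GPULoopInfo, the GPULoopInfo itself now owns the ThreadInfo, and callers only ever receive a non-owning raw pointer. Below is a minimal standalone sketch of that contract; ThreadInfoStub, LoopInfoStub, and num_threads are hypothetical stand-ins, not the actual Halide classes.

#include <cassert>
#include <memory>

// Stand-in for Autoscheduler::ThreadInfo (hypothetical stub).
struct ThreadInfoStub {
    int num_threads = 32;
};

// Stand-in for Autoscheduler::GPULoopInfo (hypothetical stub).
struct LoopInfoStub {
    // Create exactly once; asserts if called twice, mirroring create_thread_info().
    const ThreadInfoStub *create_thread_info() {
        assert(thread_info == nullptr && "create_thread_info() should not be called twice");
        thread_info = std::make_shared<const ThreadInfoStub>();
        return thread_info.get();
    }

    // Returns nullptr until create_thread_info() has been called,
    // mirroring get_thread_info().
    const ThreadInfoStub *get_thread_info() const {
        return thread_info.get();
    }

private:
    std::shared_ptr<const ThreadInfoStub> thread_info;
};

int main() {
    LoopInfoStub info;
    assert(info.get_thread_info() == nullptr);  // legal before creation: null, no assert
    (void)info.create_thread_info();            // create exactly once
    assert(info.get_thread_info()->num_threads == 32);
    return 0;
}

Returning a raw pointer rather than the shared_ptr keeps the ownership details private while leaving existing call sites unchanged.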

src/autoschedulers/anderson2021/GPULoopInfo.h (18 changes: 16 additions & 2 deletions)

@@ -7,6 +7,7 @@
  * hierarchy of blocks, threads, etc. Useful when computing GPU features
  */
 
+#include <memory>
 #include <vector>
 
 #include "Halide.h"
@@ -30,7 +31,6 @@ struct GPULoopInfo {
     int64_t num_blocks = 1;
     int64_t total_outer_serial_extents = 1;
     int64_t total_inner_serial_extents = 1;
-    const ThreadInfo *thread_info = nullptr;
 
     void update(const Target &target, const LoopNest *loop);
@@ -42,9 +42,23 @@ struct GPULoopInfo {
     std::vector<int64_t> get_inner_serial_loop_extents(const LoopNest *loop_nest) const;
 
-    std::unique_ptr<ThreadInfo> create_thread_info();
+    // assert-fails if create_thread_info() has *already* been called.
+    const ThreadInfo *create_thread_info();
+
+    // Note: if create_thread_info() has not been called yet, this will return nullptr.
+    // (Note that this is an unusual but legitimate situation, so it should *not*
+    // assert-fail if the value is null.)
+    const ThreadInfo *get_thread_info() const {
+        return thread_info.get();
+    }
 
     int64_t get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const;
 
+private:
+    // This is a shared_ptr mainly to allow for an automatic copy ctor to be generated --
+    // it's shared between different GPULoopInfo instances, but that is never visible to
+    // the outside world.
+    std::shared_ptr<const ThreadInfo> thread_info;
 };
 
 }  // namespace Autoscheduler
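
The comment on the new private member carries the central design decision: a std::unique_ptr member would implicitly delete GPULoopInfo's compiler-generated copy constructor, while a std::shared_ptr<const ThreadInfo> member keeps the struct copyable by value, with every copy observing the same immutable ThreadInfo. The following self-contained illustration uses only standard C++; Info, WithUnique, and WithShared are made-up names for the sketch.

#include <memory>

struct Info {
    int num_threads = 32;
};

// A unique_ptr member implicitly deletes the copy constructor.
struct WithUnique {
    std::unique_ptr<Info> p;
};

// A shared_ptr member keeps the compiler-generated copy constructor,
// and all copies share one immutable Info.
struct WithShared {
    std::shared_ptr<const Info> p;
};

int main() {
    WithShared a;
    a.p = std::make_shared<const Info>();
    WithShared b = a;  // OK: compiler-generated copy ctor; b.p and a.p alias
    // WithUnique u;
    // WithUnique v = u;  // error: use of deleted copy constructor
    return (b.p == a.p) ? 0 : 1;
}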

src/autoschedulers/anderson2021/LoopNest.cpp (30 changes: 14 additions & 16 deletions)

@@ -843,8 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
         return;
     }
 
-    internal_assert(gpu_loop_info.thread_info != nullptr);
-    const ThreadInfo *thread_info = gpu_loop_info.thread_info;
+    internal_assert(gpu_loop_info.get_thread_info() != nullptr);
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
     bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;
 
     size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
@@ -1243,7 +1243,7 @@ bool LoopNest::has_thread_loop_descendant() const {
 }
 
 void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const {
-    const ThreadInfo *thread_info = gpu_loop_info.thread_info;
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
     features.warp_lane_utilization = thread_info->warp_lane_utilization();
     features.num_active_warps_per_block = thread_info->num_active_warps_per_block;
     features.idle_lane_wastage = thread_info->idle_lane_wastage();
@@ -1267,7 +1267,9 @@ void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params &params
     auto active_block_hardware_limit = get_active_block_hardware_limit(params);
     auto active_warp_hardware_limit = get_active_warp_hardware_limit(params);
 
-    int64_t num_warps_per_block = gpu_loop_info.thread_info->num_warps_per_block;
+    const ThreadInfo *thread_info = gpu_loop_info.get_thread_info();
+    internal_assert(thread_info != nullptr);
+    int64_t num_warps_per_block = thread_info->num_warps_per_block;
 
     int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)params.parallelism);
@@ -1713,10 +1715,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                                 bool verbose) const {
 
     gpu_loop_info.update(target, this);
-    std::unique_ptr<ThreadInfo> thread_info;
 
     if (is_gpu_thread(target)) {
-        thread_info = gpu_loop_info.create_thread_info();
+        (void)gpu_loop_info.create_thread_info();
     }
 
     int64_t working_set_here = 0;
@@ -1843,7 +1844,6 @@ void LoopNest::compute_features(const FunctionDAG &dag,
         }
 
         c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose);
-
        if (use_memoized_features) {
             c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id);
             c->memoize_features(c->features[hash_of_producers], features);
@@ -2363,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                     e->producer,
                     producer_store_bounds,
                     producer_has_been_scheduled,
-                    gpu_loop_info.thread_info,
+                    gpu_loop_info.get_thread_info(),
                     shared_mem_loads,
                     points_accessed,
                     verbose);
@@ -2394,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                     e->producer,
                     producer_store_bounds,
                     producer_has_been_scheduled,
-                    gpu_loop_info.thread_info,
+                    gpu_loop_info.get_thread_info(),
                     global_mem_loads,
                     points_accessed,
                     verbose);
@@ -2434,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
                     e->producer,
                     producer_store_bounds,
                     producer_has_been_scheduled,
-                    gpu_loop_info.thread_info,
+                    gpu_loop_info.get_thread_info(),
                     local_mem_loads,
                     points_accessed,
                     verbose);
@@ -2707,7 +2707,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
             inlined_feat.outer_parallelism = parallelism;
             inlined_feat.num_blocks = parallelism;
 
-            internal_assert(is_scalar() || gpu_loop_info.thread_info);
+            internal_assert(is_scalar() || gpu_loop_info.get_thread_info());
 
             auto num_warps_per_block = it.value();
             auto num_threads_per_block = 1;
@@ -2716,8 +2716,8 @@ void LoopNest::compute_features(const FunctionDAG &dag,
             // be surrounded by block/thread/serial loops so there's no need to take
            // them into account when computing these features
            if (!is_scalar()) {
-                num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
-                num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+                num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.get_thread_info()->num_warps_per_block * inlined_feat.num_blocks;
+                num_threads_per_block = gpu_loop_info.get_thread_info()->num_threads;
             }
             inlined_feat.num_warps_per_block += num_warps_per_block;
             inlined_feat.num_threads_per_block += num_threads_per_block;
@@ -4028,11 +4028,9 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const No
 
 double LoopNest::max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const {
     gpu_loop_info.update(target, this);
-    std::unique_ptr<ThreadInfo> thread_info;
 
     if (is_gpu_thread(target)) {
-        thread_info = gpu_loop_info.create_thread_info();
-
+        const ThreadInfo *thread_info = gpu_loop_info.create_thread_info();
         return thread_info->idle_lane_wastage();
     }
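
One caller that depends on that copyability is max_idle_lane_wastage() above, which takes its GPULoopInfo parameter by value: each call mutates a private copy via update() and create_thread_info(), leaving the caller's object untouched. An illustrative sketch under the same stub-type assumptions as before (not the real LoopNest/GPULoopInfo API):

#include <cassert>
#include <memory>

// Hypothetical stubs again; not the Halide classes.
struct ThreadInfoStub {
    double idle_lane_wastage() const {
        return 0.25;  // placeholder value
    }
};

struct LoopInfoStub {
    const ThreadInfoStub *create_thread_info() {
        assert(thread_info == nullptr);
        thread_info = std::make_shared<const ThreadInfoStub>();
        return thread_info.get();
    }

private:
    std::shared_ptr<const ThreadInfoStub> thread_info;
};

// Analogous to LoopNest::max_idle_lane_wastage(): the by-value parameter is
// a private copy, so creating a ThreadInfo here never mutates the caller's object.
double max_idle_lane_wastage_stub(LoopInfoStub info) {
    return info.create_thread_info()->idle_lane_wastage();
}

int main() {
    LoopInfoStub caller_copy;
    double w = max_idle_lane_wastage_stub(caller_copy);
    (void)caller_copy.create_thread_info();  // still fine: the caller's copy was untouched
    return (w == 0.25) ? 0 : 1;
}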

src/autoschedulers/anderson2021/SearchSpace.cpp (2 changes: 1 addition & 1 deletion)

@@ -353,7 +353,7 @@ void SearchSpace::generate_children(const IntrusivePtr<State> &state,
             new_root->copy_from(*root);
             const auto &nodes = compute_root_nodes.get(node);
             for (const auto &n : nodes) {
-                const auto *compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{});
+                const auto *compute_root_loop = deep_copy_loop_nest(n, NoOpMutator{});
                 new_root->children.emplace_back(compute_root_loop);
             }
             new_root->store_at.insert(node);