Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GPU depredication/scalarization #6669

Merged
merged 11 commits into from
Apr 1, 2022
15 changes: 7 additions & 8 deletions src/CodeGen_D3D12Compute_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include <sstream>
#include <utility>

#include "CodeGen_C.h"
#include "CodeGen_D3D12Compute_Dev.h"
#include "CodeGen_GPU_Dev.h"
#include "CodeGen_Internal.h"
Expand Down Expand Up @@ -62,10 +61,10 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
protected:
friend struct StoragePackUnpack;

class CodeGen_D3D12Compute_C : public CodeGen_C {
class CodeGen_D3D12Compute_C : public CodeGen_GPU_C {
public:
CodeGen_D3D12Compute_C(std::ostream &s, const Target &t)
: CodeGen_C(s, t) {
: CodeGen_GPU_C(s, t) {
integer_suffix_style = IntegerSuffixStyle::HLSL;
}
void add_kernel(Stmt stmt,
Expand All @@ -88,7 +87,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {

std::string print_assignment(Type t, const std::string &rhs) override;

using CodeGen_C::visit;
using CodeGen_GPU_C::visit;
void visit(const Evaluate *op) override;
void visit(const Min *) override;
void visit(const Max *) override;
Expand Down Expand Up @@ -303,7 +302,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) {

if (!is_gpu_var(loop->name)) {
user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n";
CodeGen_C::visit(loop);
CodeGen_GPU_C::visit(loop);
return;
}

Expand Down Expand Up @@ -380,7 +379,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Call *op) {
// directly.
stream << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")";
} else {
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}
}

Expand Down Expand Up @@ -815,7 +814,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Free *op) {

string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_assignment(Type type, const string &rhs) {
string rhs_modified = print_reinforced_cast(type, rhs);
return CodeGen_C::print_assignment(type, rhs_modified);
return CodeGen_GPU_C::print_assignment(type, rhs_modified);
}

string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_vanilla_cast(Type type, const string &value_expr) {
Expand Down Expand Up @@ -964,7 +963,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op)
// have seen division-by-zero shader warnings, and we postulated that it
// could be indirectly related to compiler assumptions on signed integer
// overflow when float_from_bits() is called, but we don't know for sure
return CodeGen_C::visit(op);
return CodeGen_GPU_C::visit(op);
}

void CodeGen_D3D12Compute_Dev::add_kernel(Stmt s,
Expand Down
68 changes: 60 additions & 8 deletions src/CodeGen_GPU_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,7 @@ class ScalarizePredicatedLoadStore : public IRMutator {
mutate(extract_lane(s->index, ln)),
s->param,
const_true(),
// TODO: alignment needs to be changed
s->alignment)));
s->alignment + ln)));
}
return Block::make(scalar_stmts);
} else {
Expand All @@ -127,12 +126,23 @@ class ScalarizePredicatedLoadStore : public IRMutator {

Expr visit(const Load *op) override {
if (!is_const_one(op->predicate)) {
Expr load_expr = Load::make(op->type, op->name, op->index, op->image,
op->param, const_true(op->type.lanes()), op->alignment);
Expr pred_load = Call::make(load_expr.type(),
Call::if_then_else,
{op->predicate, load_expr},
Internal::Call::PureIntrinsic);
std::vector<Expr> lane_values;
for (int ln = 0; ln < op->type.lanes(); ln++) {
Expr load_expr = Load::make(op->type.element_of(),
op->name,
extract_lane(op->index, ln),
op->image,
op->param,
const_true(),
op->alignment + ln);
lane_values.push_back(Call::make(load_expr.type(),
Call::if_then_else,
{extract_lane(op->predicate, ln),
load_expr,
make_zero(op->type.element_of())},
Internal::Call::PureIntrinsic));
}
Expr pred_load = Shuffle::make_concat(lane_values);
return pred_load;
} else {
return op;
Expand All @@ -147,5 +157,47 @@ Stmt CodeGen_GPU_Dev::scalarize_predicated_loads_stores(Stmt &s) {
return sps.mutate(s);
}

void CodeGen_GPU_C::visit(const Shuffle *op) {
if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
internal_assert(!op->vectors.empty());
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= 0 && i < max_index);
}

std::vector<std::string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

std::string src = vecs[0];
std::ostringstream rhs;
std::string storage_name = unique_name('_');
if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) {
rhs << "(" << print_type(op->type) << ")(";
} else {
rhs << "{";
}
for (int i : op->indices) {
rhs << vecs[i];
if (i < (int)(op->indices.size() - 1)) {
rhs << ", ";
}
}
if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) {
rhs << ")";
} else {
rhs << "}";
}
print_assignment(op->type, rhs.str());
}
}

} // namespace Internal
} // namespace Halide
27 changes: 25 additions & 2 deletions src/CodeGen_GPU_Dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <string>
#include <vector>

#include "CodeGen_C.h"
#include "DeviceArgument.h"
#include "Expr.h"

Expand Down Expand Up @@ -73,15 +74,37 @@ struct CodeGen_GPU_Dev {
static Stmt scalarize_predicated_loads_stores(Stmt &s);

/** A mask describing which type of memory fence to use for the gpu_thread_barrier()
* intrinsic. Not all GPUs APIs support all types.
*/
* intrinsic. Not all GPUs APIs support all types.
*/
enum MemoryFenceType {
// NOTE(review): Device and Shared use distinct bits (1 and 2), which
// suggests the values are meant to be OR-ed together - confirm against
// the code that consumes this enum.
None = 0, // No fence required (just a sync)
Device = 1, // Device/global memory fence
Shared = 2 // Threadgroup/shared memory fence
};
};

/** Shared base for the GPU backends that emit C-like shader source.
 * It extends CodeGen_C with the pieces common to those backends;
 * each backend derives from this class and specializes it further. */
class CodeGen_GPU_C : public CodeGen_C {
public:
    /** How an immediate (literal) vector is spelled in the generated
     * source. OpenCL's spelling differs from the C-like one, so each
     * backend selects the style it needs. */
    enum class VectorDeclarationStyle {
        CLikeSyntax = 0,
        OpenCLSyntax = 1
    };

    /** Construct a code generator that writes to stream s for target t. */
    CodeGen_GPU_C(std::ostream &s, Target t)
        : CodeGen_C(s, t) {}

protected:
    /** Style used when printing immediate vectors. Defaults to the
     * C-like form; derived backends may overwrite it. */
    VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax;

    // Keep the inherited visit() overloads visible alongside the
    // override declared below.
    using CodeGen_C::visit;
    void visit(const Shuffle *op) override;
};

} // namespace Internal
} // namespace Halide

Expand Down
11 changes: 5 additions & 6 deletions src/CodeGen_Metal_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <sstream>
#include <utility>

#include "CodeGen_C.h"
#include "CodeGen_GPU_Dev.h"
#include "CodeGen_Internal.h"
#include "CodeGen_Metal_Dev.h"
Expand Down Expand Up @@ -50,17 +49,17 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
}

protected:
class CodeGen_Metal_C : public CodeGen_C {
class CodeGen_Metal_C : public CodeGen_GPU_C {
public:
CodeGen_Metal_C(std::ostream &s, const Target &t)
: CodeGen_C(s, t) {
: CodeGen_GPU_C(s, t) {
}
void add_kernel(const Stmt &stmt,
const std::string &name,
const std::vector<DeviceArgument> &args);

protected:
using CodeGen_C::visit;
using CodeGen_GPU_C::visit;
std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
// Vectors in Metal come in two varieties, regular and packed.
// For storage allocations and pointers used in address arithmetic,
Expand Down Expand Up @@ -267,7 +266,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) {

} else {
user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside Metal kernel\n";
CodeGen_C::visit(loop);
CodeGen_GPU_C::visit(loop);
}
}

Expand Down Expand Up @@ -321,7 +320,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Call *op) {
stream << ");\n";
print_assignment(op->type, "0");
} else {
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}
}

Expand Down
35 changes: 24 additions & 11 deletions src/CodeGen_OpenCL_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <utility>

#include "CSE.h"
#include "CodeGen_C.h"
#include "CodeGen_GPU_Dev.h"
#include "CodeGen_Internal.h"
#include "CodeGen_OpenCL_Dev.h"
Expand Down Expand Up @@ -55,18 +54,19 @@ class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev {
}

protected:
class CodeGen_OpenCL_C : public CodeGen_C {
class CodeGen_OpenCL_C : public CodeGen_GPU_C {
public:
CodeGen_OpenCL_C(std::ostream &s, Target t)
: CodeGen_C(s, t) {
: CodeGen_GPU_C(s, t) {
integer_suffix_style = IntegerSuffixStyle::OpenCL;
vector_declaration_style = VectorDeclarationStyle::OpenCLSyntax;
}
void add_kernel(Stmt stmt,
const std::string &name,
const std::vector<DeviceArgument> &args);

protected:
using CodeGen_C::visit;
using CodeGen_GPU_C::visit;
std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override;
std::string print_reinterpret(Type type, const Expr &e) override;
std::string print_extern_call(const Call *op) override;
Expand Down Expand Up @@ -223,7 +223,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) {

} else {
user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n";
CodeGen_C::visit(loop);
CodeGen_GPU_C::visit(loop);
}
}

Expand Down Expand Up @@ -351,7 +351,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
print_assignment(op->type, a0 + " >> " + a1);
}
} else {
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}
} else if (op->is_intrinsic(Call::image_load)) {
// image_load(<image name>, <buffer>, <x>, <x-extent>, <y>,
Expand Down Expand Up @@ -455,7 +455,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
stream << write_image.str();
}
} else {
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}
}

Expand Down Expand Up @@ -743,7 +743,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Cast *op) {
if (op->type.is_vector()) {
print_assignment(op->type, "convert_" + print_type(op->type) + "(" + print_expr(op->value) + ")");
} else {
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}
}

Expand All @@ -755,7 +755,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Select *op) {
equiv.accept(this);
return;
}
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Allocate *op) {
Expand Down Expand Up @@ -858,8 +858,14 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
}
stream << ");\n";
}
} else if (op->is_extract_element()) {
// OpenCL requires using .s<n> format for extracting an element
ostringstream rhs;
rhs << print_expr(op->vectors[0]);
rhs << ".s" << op->indices[0];
print_assignment(op->type, rhs.str());
} else {
internal_error << "Shuffle not implemented.\n";
CodeGen_GPU_C::visit(op);
}
}

Expand All @@ -879,7 +885,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) {

// Issue atomic stores.
ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
CodeGen_C::visit(op);
CodeGen_GPU_C::visit(op);
}

void CodeGen_OpenCL_Dev::add_kernel(Stmt s,
Expand Down Expand Up @@ -926,6 +932,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
debug(2) << "After eliminating bool vectors:\n"
<< s << "\n";

// We need to scalarize/de-predicate any loads/stores, since OpenCL does not
// support predication.
s = scalarize_predicated_loads_stores(s);

debug(2) << "After removing predication: \n"
<< s;

// Figure out which arguments should be passed in __constant.
// Such arguments should be:
// - not written to,
Expand Down
Loading