Fix fundamental confusion about target/tune CPU (#6765)

* Fix fundamental confusion about target/tune CPU

  Sooo. Uh, remember when in #6655 we agreed that we wanted to add support for precisely specifying the CPU for which the code should be *tuned*, but not *targeted*? Aka, similar to clang's `-mtune=` option, which does not affect the ISA set selection? So guess what, that's not what we did, apparently.

  `CodeGen_LLVM::mcpu()` / `halide_mcpu` actually do specify the *target* CPU. It was obvious in retrospect, because e.g. `CodeGen_X86::mattrs()` does not, in fact, ever specify `+avx2`, yet we get AVX2 :) So we've unintentionally added `-march=` support. Oops.

  While I'd like to add `-march=` support, that was not the goal here.

  Fixing this is complicated by the fact that `llvm::Target::createTargetMachine()` only takes a `CPU Target` string; you can't specify a `CPU Tune`. But this is actually a blessing in disguise, because it allows us to fix another bug at the same time:

  There is a problem with Halide's "compile to LLVM IR assembly": a lot of information from the Halide Target is not //really// lowered into the LLVM Module, but is embedded as metadata that is then extracted by Halide's `make_target_machine()`. While that is not a problem in itself, it makes it *impossible* to dump the LLVM IR and manually play with it, because e.g. the CPU [Target] and Attributes (ISA set) are not actually lowered into a form LLVM understands, but live in some Halide-specific metadata.

  So, to fix the first bug, we must lower the CPU Tune into per-function `"tune-cpu"` metadata, and while we are there we might as well lower `"target-cpu"` and `"target-features"` similarly.

* Address review notes
* Hopefully silence bogus issue reported by ancient GCC
* Call `set_function_attributes_from_halide_target_options()` when JIT compiling
* Fix grammar
halide · May 19, 2022 · b5f024f · b5f024f
1 parent 61f6af7
commit b5f024f
Show file tree

Hide file tree

Showing 13 changed files with 137 additions and 72 deletions.
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
@@ -72,7 +72,8 @@ class CodeGen_ARM : public CodeGen_Posix {
     };
     vector<Pattern> casts, calls, averagings, negations;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -1392,7 +1393,7 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const {
     return CodeGen_Posix::upgrade_type_for_storage(t);
 }
 
-string CodeGen_ARM::mcpu() const {
+string CodeGen_ARM::mcpu_target() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {
             return "swift";
@@ -1410,6 +1411,10 @@ string CodeGen_ARM::mcpu() const {
     }
 }
 
+string CodeGen_ARM::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_ARM::mattrs() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {

diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
@@ -42,7 +42,8 @@ class CodeGen_Hexagon : public CodeGen_Posix {
 
     void init_module() override;
 
-    std::string mcpu() const override;
+    std::string mcpu_target() const override;
+    std::string mcpu_tune() const override;
     std::string mattrs() const override;
     int isa_version;
     bool use_soft_float_abi() const override;
@@ -1788,7 +1789,7 @@ Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name,
                                       fn, std::move(args));
 }
 
-string CodeGen_Hexagon::mcpu() const {
+string CodeGen_Hexagon::mcpu_target() const {
     if (target.has_feature(Halide::Target::HVX_v66)) {
         return "hexagonv66";
     } else if (target.has_feature(Halide::Target::HVX_v65)) {
@@ -1798,6 +1799,10 @@ string CodeGen_Hexagon::mcpu() const {
     }
 }
 
+string CodeGen_Hexagon::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_Hexagon::mattrs() const {
     std::stringstream attrs;
     attrs << "+hvx-length128b";

diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp
@@ -590,16 +590,15 @@ bool get_md_string(llvm::Metadata *value, std::string &result) {
     return false;
 }
 
-void get_target_options(const llvm::Module &module, llvm::TargetOptions &options, std::string &mcpu, std::string &mattrs) {
+void get_target_options(const llvm::Module &module, llvm::TargetOptions &options) {
     bool use_soft_float_abi = false;
     get_md_bool(module.getModuleFlag("halide_use_soft_float_abi"), use_soft_float_abi);
-    get_md_string(module.getModuleFlag("halide_mcpu"), mcpu);
-    get_md_string(module.getModuleFlag("halide_mattrs"), mattrs);
     std::string mabi;
     get_md_string(module.getModuleFlag("halide_mabi"), mabi);
     bool use_pic = true;
     get_md_bool(module.getModuleFlag("halide_use_pic"), use_pic);
 
+    // FIXME: can this be migrated into `set_function_attributes_from_halide_target_options()`?
     bool per_instruction_fast_math_flags = false;
     get_md_bool(module.getModuleFlag("halide_per_instruction_fast_math_flags"), per_instruction_fast_math_flags);
 
@@ -629,9 +628,14 @@ void clone_target_options(const llvm::Module &from, llvm::Module &to) {
         to.addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi ? 1 : 0);
     }
 
-    std::string mcpu;
-    if (get_md_string(from.getModuleFlag("halide_mcpu"), mcpu)) {
-        to.addModuleFlag(llvm::Module::Warning, "halide_mcpu", llvm::MDString::get(context, mcpu));
+    std::string mcpu_target;
+    if (get_md_string(from.getModuleFlag("halide_mcpu_target"), mcpu_target)) {
+        to.addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", llvm::MDString::get(context, mcpu_target));
+    }
+
+    std::string mcpu_tune;
+    if (get_md_string(from.getModuleFlag("halide_mcpu_tune"), mcpu_tune)) {
+        to.addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", llvm::MDString::get(context, mcpu_tune));
     }
 
     std::string mattrs;
@@ -657,9 +661,7 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
     internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";
 
     llvm::TargetOptions options;
-    std::string mcpu = "";
-    std::string mattrs = "";
-    get_target_options(module, options, mcpu, mattrs);
+    get_target_options(module, options);
 
     bool use_pic = true;
     get_md_bool(module.getModuleFlag("halide_use_pic"), use_pic);
@@ -668,18 +670,29 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
     get_md_bool(module.getModuleFlag("halide_use_large_code_model"), use_large_code_model);
 
     auto *tm = llvm_target->createTargetMachine(module.getTargetTriple(),
-                                                mcpu, mattrs,
+                                                /*CPU target=*/"", /*Features=*/"",
                                                 options,
                                                 use_pic ? llvm::Reloc::PIC_ : llvm::Reloc::Static,
                                                 use_large_code_model ? llvm::CodeModel::Large : llvm::CodeModel::Small,
                                                 llvm::CodeGenOpt::Aggressive);
     return std::unique_ptr<llvm::TargetMachine>(tm);
 }
 
-void set_function_attributes_for_target(llvm::Function *fn, const Target &t) {
+void set_function_attributes_from_halide_target_options(llvm::Function &fn) {
+    llvm::Module &module = *fn.getParent();
+
+    std::string mcpu_target, mcpu_tune, mattrs;
+    get_md_string(module.getModuleFlag("halide_mcpu_target"), mcpu_target);
+    get_md_string(module.getModuleFlag("halide_mcpu_tune"), mcpu_tune);
+    get_md_string(module.getModuleFlag("halide_mattrs"), mattrs);
+
+    fn.addFnAttr("target-cpu", mcpu_target);
+    fn.addFnAttr("tune-cpu", mcpu_tune);
+    fn.addFnAttr("target-features", mattrs);
+
     // Turn off approximate reciprocals for division. It's too
     // inaccurate even for us.
-    fn->addFnAttr("reciprocal-estimates", "none");
+    fn.addFnAttr("reciprocal-estimates", "none");
 }
 
 void embed_bitcode(llvm::Module *M, const string &halide_command) {

diff --git a/src/CodeGen_Internal.h b/src/CodeGen_Internal.h
@@ -92,17 +92,17 @@ Expr lower_signed_shift_right(const Expr &a, const Expr &b);
 /** Reduce a mux intrinsic to a select tree */
 Expr lower_mux(const Call *mux);
 
-/** Given an llvm::Module, set llvm:TargetOptions, cpu and attr information */
-void get_target_options(const llvm::Module &module, llvm::TargetOptions &options, std::string &mcpu, std::string &mattrs);
+/** Given an llvm::Module, set llvm:TargetOptions information */
+void get_target_options(const llvm::Module &module, llvm::TargetOptions &options);
 
 /** Given two llvm::Modules, clone target options from one to the other */
 void clone_target_options(const llvm::Module &from, llvm::Module &to);
 
 /** Given an llvm::Module, get or create an llvm:TargetMachine */
 std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &module);
 
-/** Set the appropriate llvm Function attributes given a Target. */
-void set_function_attributes_for_target(llvm::Function *, const Target &);
+/** Set the appropriate llvm Function attributes given the Halide Target. */
+void set_function_attributes_from_halide_target_options(llvm::Function &);
 
 /** Save a copy of the llvm IR currently represented by the module as
  * data in the __LLVM,__bitcode section. Emulates clang's

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -455,7 +455,8 @@ void CodeGen_LLVM::init_codegen(const std::string &name, bool any_strict_float)
 
     // Add some target specific info to the module as metadata.
     module->addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi() ? 1 : 0);
-    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu", MDString::get(*context, mcpu()));
+    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", MDString::get(*context, mcpu_target()));
+    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", MDString::get(*context, mcpu_tune()));
     module->addModuleFlag(llvm::Module::Warning, "halide_mattrs", MDString::get(*context, mattrs()));
     module->addModuleFlag(llvm::Module::Warning, "halide_mabi", MDString::get(*context, mabi()));
     module->addModuleFlag(llvm::Module::Warning, "halide_use_pic", use_pic() ? 1 : 0);
@@ -523,7 +524,7 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
         }
         FunctionType *func_t = FunctionType::get(i32_t, arg_types, false);
         function = llvm::Function::Create(func_t, llvm_linkage(f.linkage), names.extern_name, module.get());
-        set_function_attributes_for_target(function, target);
+        set_function_attributes_from_halide_target_options(*function);
 
         // Mark the buffer args as no alias and save indication for add_argv_wrapper if needed
         std::vector<bool> buffer_args(f.args.size());
@@ -564,6 +565,8 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
 }
 
 std::unique_ptr<llvm::Module> CodeGen_LLVM::finish_codegen() {
+    llvm::for_each(*module, set_function_attributes_from_halide_target_options);
+
     // Verify the module is ok
     internal_assert(!verifyModule(*module, &llvm::errs()));
     debug(2) << "Done generating llvm bitcode\n";

diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
@@ -106,11 +106,21 @@ class CodeGen_LLVM : public IRVisitor {
     virtual void end_func(const std::vector<LoweredArgument> &args);
     // @}
 
-    /** What should be passed as -mcpu, -mattrs, and related for
-     * compilation. The architecture-specific code generator should
-     * define these. */
+    /** What should be passed as -mcpu (warning: implies attrs!), -mattrs,
+     *  and related for compilation. The architecture-specific code generator
+     *  should define these.
+     *
+     *  `mcpu_target()` - target this specific CPU, in the sense of the allowed
+     *  ISA sets *and* the CPU-specific tuning/assembly instruction scheduling.
+     *
+     *  `mcpu_tune()` - expect that we will be running on this specific CPU,
+     *  so perform CPU-specific tuning/assembly instruction scheduling, *but*
+     *  DON'T sacrifice the portability, support running on other CPUs, only
+     *  make use of the ISAs that are enabled by `mcpu_target()`+`mattrs()`.
+     */
     // @{
-    virtual std::string mcpu() const = 0;
+    virtual std::string mcpu_target() const = 0;
+    virtual std::string mcpu_tune() const = 0;
     virtual std::string mattrs() const = 0;
     virtual std::string mabi() const;
     virtual bool use_soft_float_abi() const = 0;

diff --git a/src/CodeGen_MIPS.cpp b/src/CodeGen_MIPS.cpp
@@ -19,7 +19,8 @@ class CodeGen_MIPS : public CodeGen_Posix {
 protected:
     using CodeGen_Posix::visit;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -29,14 +30,18 @@ CodeGen_MIPS::CodeGen_MIPS(const Target &t)
     : CodeGen_Posix(t) {
 }
 
-string CodeGen_MIPS::mcpu() const {
+string CodeGen_MIPS::mcpu_target() const {
     if (target.bits == 32) {
         return "";
     } else {
         return "";
     }
 }
 
+string CodeGen_MIPS::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_MIPS::mattrs() const {
     if (target.bits == 32) {
         return "";

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
@@ -91,7 +91,8 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
     // @}
 
     std::string march() const;
-    std::string mcpu() const override;
+    std::string mcpu_target() const override;
+    std::string mcpu_tune() const override;
     std::string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -153,7 +154,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
     // Make our function
     FunctionType *func_t = FunctionType::get(void_t, arg_types, false);
     function = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
-    set_function_attributes_for_target(function, target);
+    set_function_attributes_from_halide_target_options(*function);
 
     // Mark the buffer args as no alias
     for (size_t i = 0; i < args.size(); i++) {
@@ -542,7 +543,7 @@ string CodeGen_PTX_Dev::march() const {
     return "nvptx64";
 }
 
-string CodeGen_PTX_Dev::mcpu() const {
+string CodeGen_PTX_Dev::mcpu_target() const {
     if (target.has_feature(Target::CUDACapability86)) {
         return "sm_86";
     } else if (target.has_feature(Target::CUDACapability80)) {
@@ -566,6 +567,10 @@ string CodeGen_PTX_Dev::mcpu() const {
     }
 }
 
+string CodeGen_PTX_Dev::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_PTX_Dev::mattrs() const {
     if (target.has_feature(Target::CUDACapability86)) {
         return "+ptx71";
@@ -617,7 +622,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
 
     std::unique_ptr<TargetMachine>
         target_machine(llvm_target->createTargetMachine(triple.str(),
-                                                        mcpu(), mattrs(), options,
+                                                        mcpu_target(), mattrs(), options,
                                                         llvm::Reloc::PIC_,
                                                         llvm::CodeModel::Small,
                                                         CodeGenOpt::Aggressive));
@@ -758,7 +763,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         f.write(buffer.data(), buffer.size());
         f.close();
 
-        string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
+        string cmd = "ptxas --gpu-name " + mcpu_target() + " " + ptx.pathname() + " -o " + sass.pathname();
         if (system(cmd.c_str()) == 0) {
             cmd = "nvdisasm " + sass.pathname();
             int ret = system(cmd.c_str());

diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp
@@ -22,7 +22,8 @@ class CodeGen_PowerPC : public CodeGen_Posix {
 protected:
     void init_module() override;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -141,7 +142,7 @@ void CodeGen_PowerPC::visit(const Max *op) {
     return CodeGen_Posix::visit(op);
 }
 
-string CodeGen_PowerPC::mcpu() const {
+string CodeGen_PowerPC::mcpu_target() const {
     if (target.bits == 32) {
         return "ppc32";
     } else {
@@ -155,6 +156,10 @@ string CodeGen_PowerPC::mcpu() const {
     }
 }
 
+string CodeGen_PowerPC::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_PowerPC::mattrs() const {
     string features;
     string separator;

diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp
@@ -19,7 +19,8 @@ class CodeGen_RISCV : public CodeGen_Posix {
 protected:
     using CodeGen_Posix::visit;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     string mabi() const override;
     bool use_soft_float_abi() const override;
@@ -30,10 +31,14 @@ CodeGen_RISCV::CodeGen_RISCV(const Target &t)
     : CodeGen_Posix(t) {
 }
 
-string CodeGen_RISCV::mcpu() const {
+string CodeGen_RISCV::mcpu_target() const {
     return "";
 }
 
+string CodeGen_RISCV::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_RISCV::mattrs() const {
     // Note: the default march is "rv[32|64]imafdc",
     // which includes standard extensions:

diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
@@ -29,7 +29,8 @@ class CodeGen_WebAssembly : public CodeGen_Posix {
 
     void init_module() override;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -256,10 +257,14 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex
     CodeGen_Posix::codegen_vector_reduce(op, init);
 }
 
-string CodeGen_WebAssembly::mcpu() const {
+string CodeGen_WebAssembly::mcpu_target() const {
     return "";
 }
 
+string CodeGen_WebAssembly::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_WebAssembly::mattrs() const {
     std::ostringstream s;
     string sep;