Skip to content

Commit

Permalink
[mlir][gpu] Improving Cubin Serialization with ptxas Compiler
Browse files Browse the repository at this point in the history
This work improves how we compile the generated PTX code using the `ptxas` compiler. Currently, we rely on the driver's JIT API to compile the PTX code. However, this approach has some limitations: it doesn't always produce the same binary output as the ptxas compiler, leading to potential inconsistencies in the generated cubin files.

This work introduces a significant improvement by directly utilizing the ptxas compiler for PTX compilation. By doing so, we can achieve more consistent and reliable results in generating cubin files. Key Benefits:
- Using the ptxas compiler directly ensures that the cubin files generated during the build process remain consistent with CUDA compilation using `nvcc` or `clang`.
- Another advantage of this work is that it allows developers to experiment with different ptxas compilers without needing to change the compiler. Performance varies among ptxas compiler versions, so one can easily try different ptxas compilers.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D155563
  • Loading branch information
grypp committed Jul 24, 2023
1 parent 106bde9 commit 585cbe3
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 28 deletions.
42 changes: 37 additions & 5 deletions mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,32 @@ inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
}

namespace gpu {

/// Options for serializing a GPU kernel module to a CUBIN binary annotation.
struct SerializationToCubinOptions {
  /// LLVM target triple (e.g. "nvptx64-nvidia-cuda").
  std::string triple;

  /// SM architecture of the target GPU (e.g. "sm_35").
  std::string chip;

  /// PTX version to produce (e.g. "+ptx60").
  std::string features;

  /// Optimization level passed to the backend and to the PTX compiler.
  int optLevel = 2;

  /// Dump generated PTX to stderr for debug purposes.
  bool dumpPtx = false;

  /// Compiles the generated PTX with the ptxas compiler. When it is false,
  /// the generated PTX is compiled by the driver's JIT compiler instead.
  bool usePtxas = true;

  /// Parameters to pass to the ptxas compiler. Ignored for the JIT compiler.
  std::string ptxasParams;
};

/// Base pass class to serialize kernel functions through LLVM into
/// user-specified IR and add the resulting blob as module attribute.
class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
Expand Down Expand Up @@ -117,9 +143,18 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
*this, "gpu-binary-annotation",
llvm::cl::desc("Annotation attribute string for GPU binary"),
llvm::cl::init(getDefaultGpuBinaryAnnotation())};

Option<bool> dumpPtx{*this, "dump-ptx",
::llvm::cl::desc("Dump generated PTX"),
llvm::cl::init(false)};

Option<bool> usePtxas{
*this, "use-ptxas",
::llvm::cl::desc("Compile generated PTX by ptxas compiler"),
llvm::cl::init(true)};
Option<std::string> ptxasParams{
*this, "ptxas-params",
::llvm::cl::desc("Parameters to pass ptxas compiler")};
};
} // namespace gpu

Expand All @@ -137,11 +172,8 @@ void registerGpuSerializeToHsacoPass();

/// Create an instance of the GPU kernel function to CUBIN binary serialization
/// pass, configured by \p options (optimization level defaults to 2 there).
std::unique_ptr<Pass>
createGpuSerializeToCubinPass(const gpu::SerializationToCubinOptions &options);

/// Create an instance of the GPU kernel function to HSAco binary serialization
/// pass.
Expand Down
171 changes: 152 additions & 19 deletions mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"

#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
#include "mlir/Pass/Pass.h"
Expand All @@ -36,6 +43,106 @@ static void emitCudaError(const llvm::Twine &expr, const char *buffer,
.concat("]"));
}

static constexpr char kPtxasCompilerName[] = "ptxas";

/// Compiles the given generated PTX code with the ptxas compiler.
///
/// \p smCapability is forwarded as the `-arch` value, \p ptxasParams is a
/// command-line style string of extra flags tokenized and appended to the
/// ptxas invocation, and \p ptxSource is the PTX to compile. When \p dumpPtx
/// is set, the stderr output of ptxas is echoed as a note (useful with `-v`,
/// which reports register and local-memory usage). Returns the cubin image on
/// success; otherwise returns failure and, when possible, stores a
/// human-readable reason in \p *message.
static FailureOr<std::string>
compileWithPtxas(StringRef smCapability, StringRef ptxasParams,
                 StringRef ptxSource, bool dumpPtx, std::string *message) {
  // Step 0. Find the ptxas compiler in PATH.
  std::optional<std::string> ptxasCompiler =
      llvm::sys::Process::FindInEnvPath("PATH", kPtxasCompilerName);
  if (!ptxasCompiler.has_value())
    return failure();

  // Step 1. Create temporary files: the PTX source file, redirect files for
  // stdin/stdout/stderr, and the cubin output file. Bail out if any of them
  // cannot be created rather than silently continuing with an empty path.
  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
  if (llvm::sys::fs::createTemporaryFile("mlir-ptx", "", ptxSourceFile) ||
      llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile) ||
      llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile) ||
      llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile)) {
    *message = std::string("Could not create temporary files for ptxas\n");
    return failure();
  }
  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
  // FileRemovers guarantee cleanup of the temporaries on every exit path.
  llvm::FileRemover stdinRemover(stdinFile.c_str());
  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
  llvm::FileRemover stderrRemover(stderrFile.c_str());
  llvm::FileRemover binRemover(cubinFile.c_str());
  llvm::FileRemover srcRemover(ptxSourceFile.c_str());

  // Step 2. Write the generated PTX into a file, so we can pass it to the
  // ptxas compiler.
  std::error_code ec;
  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
  if (ec) {
    // A raw_fd_ostream whose open failed silently drops writes, so check the
    // error code explicitly; has_error() below would not catch this case.
    *message = "Could not open a temporary file for the generated ptx: " +
               ec.message() + "\n";
    return failure();
  }
  fPtxSource << ptxSource;
  fPtxSource.close();
  if (fPtxSource.has_error()) {
    *message = std::string(
        "Could not write the generated ptx into a temporary file\n");
    return failure();
  }

  // Step 3. Build the ptxas command line.
  std::vector<StringRef> argVector{StringRef("ptxas"), StringRef("-arch"),
                                   smCapability, StringRef(ptxSourceFile),
                                   StringRef("-o"), StringRef(cubinFile)};
#ifdef _WIN32
  auto tokenize = llvm::cl::TokenizeWindowsCommandLine;
#else
  auto tokenize = llvm::cl::TokenizeGNUCommandLine;
#endif // _WIN32
  llvm::BumpPtrAllocator scratchAllocator;
  llvm::StringSaver stringSaver(scratchAllocator);
  SmallVector<const char *> rawArgs;
  tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
  for (const auto *rawArg : rawArgs)
    argVector.emplace_back(rawArg);

  std::optional<StringRef> redirects[] = {
      stdinFile.str(),
      stdoutFile.str(),
      stderrFile.str(),
  };

  // Step 4. Invoke ptxas.
  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(),
                                llvm::ArrayRef<llvm::StringRef>(argVector),
                                /*Env=*/std::nullopt,
                                /*Redirects=*/redirects,
                                /*SecondsToWait=*/0,
                                /*MemoryLimit=*/0,
                                /*ErrMsg=*/message)) {
    if (message->empty()) {
      // ExecuteAndWait gave no reason; surface the compiler's own stderr log.
      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeErrorlog =
          llvm::MemoryBuffer::getFile(stderrFile);
      *message = std::string("Invoking ptxas failed");
      if (maybeErrorlog)
        *message += ": " + maybeErrorlog->get()->getBuffer().str();
      else
        *message += ", see the log file: " + std::string(stderrFile);
    }
    // Keep the stderr log around for post-mortem inspection.
    stderrRemover.releaseFile();
    return failure();
  }

  // Step 5. Dump the output of ptxas if the dump flag is set. This is useful
  // because it shows local memory usage, register usage, and so on.
  if (dumpPtx) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
        llvm::MemoryBuffer::getFile(stderrFile);
    if (maybeFlog) {
      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
    }
  }

  // Step 6. Read the cubin file and return it. It will eventually be written
  // into the executable.
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
      llvm::MemoryBuffer::getFile(cubinFile);
  if (!maybeFcubin) {
    *message = std::string("Could not read cubin file \n");
    return failure();
  }

  return std::string(maybeFcubin->get()->getBuffer());
}

#define RETURN_ON_CUDA_ERROR(expr) \
do { \
if (auto status = (expr)) { \
Expand All @@ -54,11 +161,13 @@ class SerializeToCubinPass

SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
StringRef chip = "sm_35", StringRef features = "+ptx60",
int optLevel = 2, bool dumpPtx = false);
int optLevel = 2, bool dumpPtx = false,
bool usePtxas = true, StringRef ptxasParams = {});

StringRef getArgument() const override { return "gpu-to-cubin"; }
StringRef getDescription() const override {
return "Lower GPU kernel function to CUBIN binary annotations";
return "Lower GPU kernel function to CUBIN binary "
"annotations";
}

private:
Expand All @@ -80,9 +189,10 @@ llvm::once_flag SerializeToCubinPass::initializeBackendOnce;

SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
StringRef features, int optLevel,
bool dumpPtx) {
// No matter how this pass is constructed, ensure that the NVPTX backend
// is initialized exactly once.
bool dumpPtx, bool usePtxas,
StringRef ptxasParams) {
// No matter how this pass is constructed, ensure that
// the NVPTX backend is initialized exactly once.
llvm::call_once(initializeBackendOnce, []() {
// Initialize LLVM NVPTX backend.
LLVMInitializeNVPTXTarget();
Expand All @@ -94,7 +204,9 @@ SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
maybeSetOption(this->triple, triple);
maybeSetOption(this->chip, chip);
maybeSetOption(this->features, features);
maybeSetOption(this->ptxasParams, ptxasParams);
this->dumpPtx = dumpPtx;
this->usePtxas = usePtxas;
if (this->optLevel.getNumOccurrences() == 0)
this->optLevel.setValue(optLevel);
}
Expand All @@ -112,7 +224,8 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {

RETURN_ON_CUDA_ERROR(cuInit(0));

// Linking requires a device context.
// Linking requires a device
// context.
CUdevice device;
RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
CUcontext context;
Expand All @@ -131,9 +244,24 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {

auto kernelName = getOperation().getName().str();
if (dumpPtx) {
llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
llvm::dbgs() << isa << "\n";
llvm::errs() << "// Kernel Name : [" << kernelName << "]\n";
llvm::errs() << isa << "\n";
}

if (usePtxas) {
// Try to compile it with ptxas first.
std::string message;
FailureOr<std::string> maybeCubinImage =
compileWithPtxas(this->chip, ptxasParams, isa, dumpPtx, &message);
if (succeeded(maybeCubinImage)) {
return std::make_unique<std::vector<char>>(
maybeCubinImage.value().begin(), maybeCubinImage.value().end());
}
emitError(loc) << message;
return {};
}

// Fallback to JIT compilation if ptxas fails.
RETURN_ON_CUDA_ERROR(cuLinkAddData(
linkState, CUjitInputType::CU_JIT_INPUT_PTX,
const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
Expand All @@ -150,7 +278,7 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
auto result =
std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);

// This will also destroy the cubin data.
// This will also destroy the cubin data.
RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));

Expand All @@ -159,17 +287,22 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {

// Register pass to serialize GPU kernel functions to a CUBIN binary
// annotation.
void mlir::registerGpuSerializeToCubinPass() {
  PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
    // Initialize LLVM NVPTX backend.
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTargetMC();
    LLVMInitializeNVPTXAsmPrinter();

    return std::make_unique<SerializeToCubinPass>();
  });
}

/// Create the CUBIN serialization pass, forwarding every field of \p options
/// (triple, chip, features, optimization level, PTX dumping, and the
/// ptxas-vs-JIT choice with its extra ptxas parameters) to the pass.
std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(
    const gpu::SerializationToCubinOptions &options) {
  return std::make_unique<SerializeToCubinPass>(
      options.triple, options.chip, options.features, options.optLevel,
      options.dumpPtx, options.usePtxas, options.ptxasParams);
}

#else // MLIR_GPU_TO_CUBIN_PASS_ENABLE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,12 @@ void mlir::sparse_tensor::buildSparseCompiler(
// Finalize GPU code generation.
if (gpuCodegen) {
#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
options.gpuTriple, options.gpuChip, options.gpuFeatures));
gpu::SerializationToCubinOptions cubinOptions;
cubinOptions.triple = options.gpuTriple;
cubinOptions.chip = options.gpuChip;
cubinOptions.features = options.gpuFeatures;
pm.addNestedPass<gpu::GPUModuleOp>(
createGpuSerializeToCubinPass(cubinOptions));
#endif
pm.addPass(createGpuToLLVMConversionPass());
}
Expand Down
8 changes: 6 additions & 2 deletions mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,12 @@ void buildGpuPassPipeline(OpPassManager &pm,
pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());

#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
options.cubinTriple, options.cubinChip, options.cubinFeatures));
gpu::SerializationToCubinOptions cubinOptions;
cubinOptions.triple = options.cubinTriple;
cubinOptions.chip = options.cubinChip;
cubinOptions.features = options.cubinFeatures;
pm.addNestedPass<gpu::GPUModuleOp>(
createGpuSerializeToCubinPass(cubinOptions));
#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
}

Expand Down

0 comments on commit 585cbe3

Please sign in to comment.