diff --git a/CMakeLists.txt b/CMakeLists.txt index 37021746d6..950f28078a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,10 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + add_compile_definitions(_DEBUG) +endif() + project(shadPS4) # Forcing PIE makes sure that the base address is high enough so that it doesn't clash with the PS4 memory. @@ -558,6 +562,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/frontend/structured_control_flow.h src/shader_recompiler/ir/passes/constant_propagation_pass.cpp src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp + src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp diff --git a/src/common/decoder.cpp b/src/common/decoder.cpp index 2499074197..aeaba3ca6d 100644 --- a/src/common/decoder.cpp +++ b/src/common/decoder.cpp @@ -13,6 +13,15 @@ DecoderImpl::DecoderImpl() { DecoderImpl::~DecoderImpl() = default; +std::string DecoderImpl::disassembleInst(ZydisDecodedInstruction& inst, + ZydisDecodedOperand* operands, u64 address) { + const int bufLen = 256; + char szBuffer[bufLen]; + ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible, + szBuffer, sizeof(szBuffer), address, ZYAN_NULL); + return szBuffer; +} + void DecoderImpl::printInstruction(void* code, u64 address) { ZydisDecodedInstruction instruction; ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE]; @@ -27,11 +36,8 @@ void DecoderImpl::printInstruction(void* code, u64 address) { void DecoderImpl::printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, u64 address) { - const int bufLen = 256; - char szBuffer[bufLen]; - ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible, - szBuffer, 
sizeof(szBuffer), address, ZYAN_NULL); - fmt::print("instruction: {}\n", szBuffer); + std::string s = disassembleInst(inst, operands, address); + fmt::print("instruction: {}\n", s); } ZyanStatus DecoderImpl::decodeInstruction(ZydisDecodedInstruction& inst, diff --git a/src/common/decoder.h b/src/common/decoder.h index 1f22195968..a5dadbf193 100644 --- a/src/common/decoder.h +++ b/src/common/decoder.h @@ -14,6 +14,8 @@ class DecoderImpl { DecoderImpl(); ~DecoderImpl(); + std::string disassembleInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, + u64 address); void printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, u64 address); void printInstruction(void* code, u64 address); ZyanStatus decodeInstruction(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, diff --git a/src/common/hash.h b/src/common/hash.h new file mode 100644 index 0000000000..d5cacedd7d --- /dev/null +++ b/src/common/hash.h @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/types.h" + +[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) { + return seed ^ (hash + 0x9e3779b9 + (seed << 12) + (seed >> 4)); +} + +[[nodiscard]] inline u32 HashCombine(const u32 seed, const u32 hash) { + return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} \ No newline at end of file diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 2d48999c0b..064200d991 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/assert.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" 
#include "shader_recompiler/backend/spirv/spirv_emit_context.h" @@ -146,9 +147,14 @@ void EmitGetGotoVariable(EmitContext&) { UNREACHABLE_MSG("Unreachable instruction"); } -Id EmitReadConst(EmitContext& ctx) { - return ctx.u32_zero_value; - UNREACHABLE_MSG("Unreachable instruction"); +Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) { + u32 flatbuf_off_dw = inst->Flags(); + ASSERT(ctx.srt_flatbuf.binding >= 0); + ASSERT(flatbuf_off_dw > 0); + Id index = ctx.ConstU32(flatbuf_off_dw); + auto& buffer = ctx.srt_flatbuf; + const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; + return ctx.OpLoad(ctx.U32[1], ptr); } Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 02b98b3431..12361991a0 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -62,7 +62,7 @@ void EmitSetVectorRegister(EmitContext& ctx); void EmitSetGotoVariable(EmitContext& ctx); void EmitGetGotoVariable(EmitContext& ctx); void EmitSetScc(EmitContext& ctx); -Id EmitReadConst(EmitContext& ctx); +Id EmitReadConst(EmitContext& ctx, IR::Inst* inst); Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index); Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 6581a7a56e..dc404b121d 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -4,12 +4,14 @@ #include "common/assert.h" #include "common/div_ceil.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" +#include 
"shader_recompiler/ir/passes/srt.h" #include "video_core/amdgpu/types.h" #include #include #include +#include namespace Shader::Backend::SPIRV { namespace { @@ -435,14 +437,16 @@ void EmitContext::DefinePushDataBlock() { void EmitContext::DefineBuffers() { boost::container::small_vector type_ids; - const auto define_struct = [&](Id record_array_type, bool is_instance_data) { + const auto define_struct = [&](Id record_array_type, bool is_instance_data, + std::optional explicit_name = {}) { const Id struct_type{TypeStruct(record_array_type)}; if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) { return struct_type; } Decorate(record_array_type, spv::Decoration::ArrayStride, 4); - const auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage) - : fmt::format("{}_cbuf_block_f32", stage); + auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage) + : fmt::format("{}_cbuf_block_f32", stage); + name = explicit_name.value_or(name); Name(struct_type, name); Decorate(struct_type, spv::Decoration::Block); MemberName(struct_type, 0, "data"); @@ -451,6 +455,29 @@ void EmitContext::DefineBuffers() { return struct_type; }; + if (info.has_readconst) { + const Id data_type = U32[1]; + const auto storage_class = spv::StorageClass::Uniform; + const Id pointer_type = TypePointer(storage_class, data_type); + const Id record_array_type{ + TypeArray(U32[1], ConstU32(static_cast(info.flattened_ud_buf.size())))}; + + const Id struct_type{define_struct(record_array_type, false, "srt_flatbuf_ty")}; + + const Id struct_pointer_type{TypePointer(storage_class, struct_type)}; + const Id id{AddGlobalVariable(struct_pointer_type, storage_class)}; + Decorate(id, spv::Decoration::Binding, binding.unified++); + Decorate(id, spv::Decoration::DescriptorSet, 0U); + Name(id, "srt_flatbuf_ubo"); + + srt_flatbuf = { + .id = id, + .binding = binding.buffer++, + .pointer_type = pointer_type, + }; + interfaces.push_back(id); + } + for 
(const auto& desc : info.buffers) { const auto sharp = desc.GetSharp(info); const bool is_storage = desc.IsStorage(sharp); @@ -471,7 +498,7 @@ void EmitContext::DefineBuffers() { if (is_storage && !desc.is_written) { Decorate(id, spv::Decoration::NonWritable); } - Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sgpr_base)); + Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sharp_idx)); buffers.push_back({ .id = id, @@ -495,7 +522,7 @@ void EmitContext::DefineTextureBuffers() { const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; Decorate(id, spv::Decoration::Binding, binding.unified++); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sgpr_base)); + Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sharp_idx)); texture_buffers.push_back({ .id = id, .binding = binding.buffer++, @@ -582,7 +609,7 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { } Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { - const auto image = ctx.info.ReadUd(desc.sgpr_base, desc.dword_offset); + const auto image = ctx.info.ReadUdSharp(desc.sharp_idx); const auto format = desc.is_atomic ? GetFormat(image) : spv::ImageFormat::Unknown; const u32 sampled = desc.is_storage ? 
2 : 1; switch (desc.type) { @@ -618,8 +645,7 @@ void EmitContext::DefineImagesAndSamplers() { const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; Decorate(id, spv::Decoration::Binding, binding.unified++); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}{}_{:02x}", stage, "img", image_desc.sgpr_base, - image_desc.dword_offset)); + Name(id, fmt::format("{}_{}{}", stage, "img", image_desc.sharp_idx)); images.push_back({ .data_types = &data_types, .id = id, @@ -643,8 +669,7 @@ void EmitContext::DefineImagesAndSamplers() { const Id id{AddGlobalVariable(sampler_pointer_type, spv::StorageClass::UniformConstant)}; Decorate(id, spv::Decoration::Binding, binding.unified++); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}{}_{:02x}", stage, "samp", samp_desc.sgpr_base, - samp_desc.dword_offset)); + Name(id, fmt::format("{}_{}{}", stage, "samp", samp_desc.sharp_idx)); samplers.push_back(id); interfaces.push_back(id); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 147b4c8452..fb30a5dd63 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -228,6 +228,7 @@ class EmitContext final : public Sirit::Module { Bindings& binding; boost::container::small_vector buffers; boost::container::small_vector texture_buffers; + BufferDefinition srt_flatbuf; boost::container::small_vector images; boost::container::small_vector samplers; diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index a6f8cafd7a..89426e0805 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -10,6 +10,10 @@ static constexpr u32 SQ_SRC_LITERAL = 0xFF; void Translator::EmitScalarMemory(const 
GcnInst& inst) { switch (inst.opcode) { // SMRD + case Opcode::S_LOAD_DWORD: + return S_LOAD_DWORD(1, inst); + case Opcode::S_LOAD_DWORDX2: + return S_LOAD_DWORD(2, inst); case Opcode::S_LOAD_DWORDX4: return S_LOAD_DWORD(4, inst); case Opcode::S_LOAD_DWORDX8: diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index bae6681cbb..ccce31a245 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -388,7 +388,7 @@ void Translator::EmitFetch(const GcnInst& inst) { IR::VectorReg dst_reg{attrib.dest_vgpr}; // Read the V# of the attribute to figure out component number and type. - const auto buffer = info.ReadUd(attrib.sgpr_base, attrib.dword_offset); + const auto buffer = info.ReadUdReg(attrib.sgpr_base, attrib.dword_offset); for (u32 i = 0; i < 4; i++) { const IR::F32 comp = [&] { switch (buffer.GetSwizzle(i)) { @@ -418,8 +418,7 @@ void Translator::EmitFetch(const GcnInst& inst) { if (step_rate == Info::VsInput::OverStepRate0 || step_rate == Info::VsInput::OverStepRate1) { info.buffers.push_back({ - .sgpr_base = attrib.sgpr_base, - .dword_offset = attrib.dword_offset, + .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4), .used_types = IR::Type::F32, .is_instance_data = true, }); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index e727c8a08d..610090da06 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #pragma once +#include #include #include #include @@ -10,11 +11,13 @@ #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/frontend/copy_shader.h" #include "shader_recompiler/ir/attribute.h" +#include "shader_recompiler/ir/passes/srt.h" #include "shader_recompiler/ir/reg.h" #include "shader_recompiler/ir/type.h" #include "shader_recompiler/params.h" #include 
"shader_recompiler/runtime_info.h" #include "video_core/amdgpu/resource.h" +#include "xbyak/xbyak.h" namespace Shader { @@ -36,8 +39,7 @@ constexpr u32 NUM_TEXTURE_TYPES = 7; struct Info; struct BufferResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; IR::Type used_types; AmdGpu::Buffer inline_cbuf; bool is_gds_buffer{}; @@ -53,8 +55,7 @@ struct BufferResource { using BufferResourceList = boost::container::small_vector; struct TextureBufferResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; AmdGpu::NumberFormat nfmt; bool is_written{}; @@ -63,8 +64,7 @@ struct TextureBufferResource { using TextureBufferResourceList = boost::container::small_vector; struct ImageResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; AmdGpu::ImageType type; AmdGpu::NumberFormat nfmt; bool is_storage{}; @@ -77,8 +77,7 @@ struct ImageResource { using ImageResourceList = boost::container::small_vector; struct SamplerResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; AmdGpu::Sampler inline_sampler{}; u32 associated_image : 4; u32 disable_aniso : 1; @@ -180,6 +179,9 @@ struct Info { ImageResourceList images; SamplerResourceList samplers; + PersistentSrtInfo srt_info; + std::vector flattened_ud_buf; + std::span user_data; Stage stage; @@ -199,14 +201,23 @@ struct Info { bool uses_fp64{}; bool uses_step_rates{}; bool translation_failed{}; // indicates that shader has unsupported instructions + bool has_readconst{}; u8 mrt_mask{0u}; + // just for logging, TODO delete + size_t perm_idx; + explicit Info(Stage stage_, ShaderParams params) : stage{stage_}, pgm_hash{params.hash}, pgm_base{params.Base()}, user_data{params.user_data} {} template - T ReadUd(u32 ptr_index, u32 dword_offset) const noexcept { + inline T ReadUdSharp(u32 sharp_idx) const noexcept { + return *reinterpret_cast(&flattened_ud_buf[sharp_idx]); + } + + template + T ReadUdReg(u32 ptr_index, u32 dword_offset) const noexcept { T data; const u32* base = user_data.data(); if 
(ptr_index != IR::NumScalarRegs) { @@ -228,7 +239,8 @@ struct Info { } void AddBindings(Backend::Bindings& bnd) const { - const auto total_buffers = buffers.size() + texture_buffers.size(); + const auto total_buffers = + buffers.size() + texture_buffers.size() + (has_readconst ? 1 : 0); bnd.buffer += total_buffers; bnd.unified += total_buffers + images.size() + samplers.size(); bnd.user_data += ud_mask.NumRegs(); @@ -245,22 +257,33 @@ struct Info { } return {vertex_offset, instance_offset}; } + + void RefreshFlatBuf() { + flattened_ud_buf.resize(srt_info.flattened_bufsize_dw); + ASSERT(user_data.size() <= NumUserDataRegs); + std::memcpy(flattened_ud_buf.data(), user_data.data(), user_data.size_bytes()); + // Run the JIT program to walk the SRT and write the leaves to a flat buffer + PFN_SrtWalker pfn = srt_info.walker.getCode(); + if (pfn) { + pfn(user_data.data(), flattened_ud_buf.data()); + } + } }; constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept { - return inline_cbuf ? inline_cbuf : info.ReadUd(sgpr_base, dword_offset); + return inline_cbuf ? inline_cbuf : info.ReadUdSharp(sharp_idx); } constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept { - return info.ReadUd(sgpr_base, dword_offset); + return info.ReadUdSharp(sharp_idx); } constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept { - return info.ReadUd(sgpr_base, dword_offset); + return info.ReadUdSharp(sharp_idx); } constexpr AmdGpu::Sampler SamplerResource::GetSharp(const Info& info) const noexcept { - return inline_sampler ? inline_sampler : info.ReadUd(sgpr_base, dword_offset); + return inline_sampler ? 
inline_sampler : info.ReadUdSharp(sharp_idx); } } // namespace Shader diff --git a/src/shader_recompiler/ir/basic_block.cpp b/src/shader_recompiler/ir/basic_block.cpp index 60ba0647ae..426acb2b82 100644 --- a/src/shader_recompiler/ir/basic_block.cpp +++ b/src/shader_recompiler/ir/basic_block.cpp @@ -118,6 +118,10 @@ std::string DumpBlock(const Block& block, const std::map& } else { ret += fmt::format(" {}", op); // '%00000 = ' -> 1 + 5 + 3 = 9 spaces } + + if (op == Opcode::ReadConst) { + ret += fmt::format(" (flags={}) ", inst.Flags()); + } const size_t arg_count{inst.NumArgs()}; for (size_t arg_index = 0; arg_index < arg_count; ++arg_index) { const Value arg{inst.Arg(arg_index)}; diff --git a/src/shader_recompiler/ir/breadth_first_search.h b/src/shader_recompiler/ir/breadth_first_search.h index 0156303f08..b042ae3d68 100644 --- a/src/shader_recompiler/ir/breadth_first_search.h +++ b/src/shader_recompiler/ir/breadth_first_search.h @@ -11,34 +11,37 @@ namespace Shader::IR { -template -auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t { +// Use typename Instruction so the function can be used to return either const or mutable +// Insts depending on the context. +template +auto BreadthFirstSearch(Instruction* inst, Pred&& pred) + -> std::invoke_result_t { // Most often case the instruction is the desired already. 
- if (const std::optional result = pred(inst)) { + if (std::optional result = pred(inst)) { return result; } // Breadth-first search visiting the right most arguments first - boost::container::small_vector visited; - std::queue queue; + boost::container::small_vector visited; + std::queue queue; queue.push(inst); while (!queue.empty()) { // Pop one instruction from the queue - const Inst* const inst{queue.front()}; + Instruction* inst{queue.front()}; queue.pop(); - if (const std::optional result = pred(inst)) { + if (std::optional result = pred(inst)) { // This is the instruction we were looking for return result; } // Visit the right most arguments first for (size_t arg = inst->NumArgs(); arg--;) { - const Value arg_value{inst->Arg(arg)}; + Value arg_value{inst->Arg(arg)}; if (arg_value.IsImmediate()) { continue; } // Queue instruction if it hasn't been visited - const Inst* const arg_inst{arg_value.InstRecursive()}; + Instruction* arg_inst{arg_value.InstRecursive()}; if (std::ranges::find(visited, arg_inst) == visited.end()) { visited.push_back(arg_inst); queue.push(arg_inst); @@ -59,4 +62,13 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred) return BreadthFirstSearch(value.InstRecursive(), pred); } +template +auto BreadthFirstSearch(Value value, Pred&& pred) -> std::invoke_result_t { + if (value.IsImmediate()) { + // Nothing to do with immediates + return std::nullopt; + } + return BreadthFirstSearch(value.InstRecursive(), pred); +} + } // namespace Shader::IR diff --git a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp new file mode 100644 index 0000000000..edea69565d --- /dev/null +++ b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp @@ -0,0 +1,257 @@ + +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include "common/config.h" +#include 
"common/io_file.h" +#include "common/logging/log.h" +#include "common/path_util.h" +#include "common/singleton.h" +#include "shader_recompiler/info.h" +#include "shader_recompiler/ir/breadth_first_search.h" +#include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/passes/srt.h" +#include "shader_recompiler/ir/program.h" +#include "shader_recompiler/ir/reg.h" +#include "shader_recompiler/ir/srt_gvn_table.h" +#include "shader_recompiler/ir/value.h" +#include "src/common/arch.h" +#include "src/common/decoder.h" + +using namespace Xbyak::util; + +// TODO make sure no problems with identity and Insts being used in maps + +static void DumpSrtProgram(const Shader::Info& info, const u8* code, size_t codesize) { +#ifdef ARCH_X86_64 + using namespace Common::FS; + + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); + } + const auto filename = + fmt::format("{}_{:#018x}_{}.srtprogram.txt", info.stage, info.pgm_hash, info.perm_idx); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write, FileType::TextFile}; + + u64 address = reinterpret_cast(code); + u64 code_end = address + codesize; + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + ZyanStatus status = ZYAN_STATUS_SUCCESS; + while (address < code_end && ZYAN_SUCCESS(Common::Decoder::Instance()->decodeInstruction( + instruction, operands, reinterpret_cast(address)))) { + std::string s = + Common::Decoder::Instance()->disassembleInst(instruction, operands, address); + s += "\n"; + file.WriteString(s); + address += instruction.length; + } +#endif +} + +namespace { +class SrtCodegen : public Xbyak::CodeGenerator { +public: + SrtCodegen() : CodeGenerator(1_MB) {} +}; + +using namespace Shader; + +struct PassInfo { + // map offset to inst + using PtrUserList = boost::container::flat_map; + + Optimization::SrtGvnTable gvn_table; + // keys are 
GetUserData or ReadConst instructions that are used as pointers + std::unordered_map pointer_uses; + // GetUserData instructions corresponding to sgpr_base of SRT roots + boost::container::small_flat_map srt_roots; + + // pick a single inst for a given value number + std::unordered_map vn_to_inst; + + // Bumped during codegen to assign offsets to readconsts + u32 dst_off_dw; + + PtrUserList* GetUsesAsPointer(IR::Inst* inst) { + auto it = pointer_uses.find(inst); + if (it != pointer_uses.end()) { + return &it->second; + } + return nullptr; + } + + // Return a single instruction that this instruction is identical to, according + // to value number + // The "original" is arbitrary. Here it's the first instruction found for a given value number + IR::Inst* DeduplicateInstruction(IR::Inst* inst) { + auto it = vn_to_inst.try_emplace(gvn_table.GetValueNumber(inst), inst); + return it.first->second; + } +}; +} // namespace + +namespace Shader::Optimization { + +namespace { + +static inline void PushPtr(Xbyak::CodeGenerator& c, u32 off_dw) { + c.push(rdi); + c.mov(rdi, ptr[rdi + (off_dw << 2)]); + c.mov(r10, 0xFFFFFFFFFFFFULL); + c.and_(rdi, r10); +} + +static inline void PopPtr(Xbyak::CodeGenerator& c) { + c.pop(rdi); +}; + +static void VisitPointer(u32 off_dw, IR::Inst* subtree, PassInfo& pass_info, + Xbyak::CodeGenerator& c) { + PushPtr(c, off_dw); + PassInfo::PtrUserList* use_list = pass_info.GetUsesAsPointer(subtree); + ASSERT(use_list); + + // First copy all the src data from this tree level + // That way, all data that was contiguous in the guest SRT is also contiguous in the + // flattened buffer. + // TODO src and dst are contiguous. 
Optimize with wider loads/stores + // TODO if this subtree is dynamically indexed, don't compact it (keep it sparse) + for (auto [src_off_dw, use] : *use_list) { + c.mov(r10d, ptr[rdi + (src_off_dw << 2)]); + c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r10d); + + use->SetFlags(pass_info.dst_off_dw); + pass_info.dst_off_dw++; + } + + // Then visit any children used as pointers + for (const auto [src_off_dw, use] : *use_list) { + if (pass_info.GetUsesAsPointer(use)) { + VisitPointer(src_off_dw, use, pass_info, c); + } + } + + PopPtr(c); +} + +static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { + Xbyak::CodeGenerator& c = *Common::Singleton::Instance(); + + if (info.srt_info.srt_reservations.empty() && pass_info.srt_roots.empty()) { + return; + } + + pass_info.dst_off_dw = NumUserDataRegs; + + // Special case for V# step rate buffers in fetch shader + for (const auto [sgpr_base, dword_offset, num_dwords] : info.srt_info.srt_reservations) { + // get pointer to V# + c.mov(r10d, ptr[rdi + (sgpr_base << 2)]); + + u32 src_off = dword_offset << 2; + + for (auto j = 0; j < num_dwords; j++) { + c.mov(r11d, ptr[r10d + src_off]); + c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r11d); + + src_off += 4; + ++pass_info.dst_off_dw; + } + } + + ASSERT(pass_info.dst_off_dw == info.srt_info.flattened_bufsize_dw); + + for (const auto& [sgpr_base, root] : pass_info.srt_roots) { + VisitPointer(static_cast(sgpr_base), root, pass_info, c); + } + + c.ret(); + c.ready(); + + size_t codesize = c.getSize(); + info.srt_info.walker = SmallCodeArray(c.getCode(), codesize); + + if (Config::dumpShaders()) { + DumpSrtProgram(info, c.getCode(), codesize); + } + + c.reset(); + + info.srt_info.flattened_bufsize_dw = pass_info.dst_off_dw; +} + +}; // namespace + +void FlattenExtendedUserdataPass(IR::Program& program) { + Shader::Info& info = program.info; + PassInfo pass_info; + + // traverse at end and assign offsets to duplicate readconsts, using + // vn_to_inst as the source + 
boost::container::small_vector<IR::Inst*, 32> all_readconsts; + + for (auto r_it = program.post_order_blocks.rbegin(); r_it != program.post_order_blocks.rend(); + r_it++) { + IR::Block* block = *r_it; + for (IR::Inst& inst : *block) { + if (inst.GetOpcode() == IR::Opcode::ReadConst) { + if (!inst.Arg(1).IsImmediate()) { + LOG_WARNING(Render_Recompiler, "ReadConst has non-immediate offset"); + continue; + } + + all_readconsts.push_back(&inst); + if (pass_info.DeduplicateInstruction(&inst) != &inst) { + // This is a duplicate of a readconst we've already visited + continue; + } + + IR::Inst* ptr_composite = inst.Arg(0).InstRecursive(); + + const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> { + if (inst->GetOpcode() == IR::Opcode::GetUserData || + inst->GetOpcode() == IR::Opcode::ReadConst) { + return inst; + } + return std::nullopt; + }; + auto base0 = IR::BreadthFirstSearch(ptr_composite->Arg(0), pred); + auto base1 = IR::BreadthFirstSearch(ptr_composite->Arg(1), pred); + ASSERT_MSG(base0 && base1, "ReadConst not from constant memory"); + + IR::Inst* ptr_lo = base0.value(); + ptr_lo = pass_info.DeduplicateInstruction(ptr_lo); + + auto ptr_uses_kv = + pass_info.pointer_uses.try_emplace(ptr_lo, PassInfo::PtrUserList{}); + PassInfo::PtrUserList& user_list = ptr_uses_kv.first->second; + + user_list[inst.Arg(1).U32()] = &inst; + + if (ptr_lo->GetOpcode() == IR::Opcode::GetUserData) { + IR::ScalarReg ud_reg = ptr_lo->Arg(0).ScalarReg(); + pass_info.srt_roots[ud_reg] = ptr_lo; + } + } + } + } + + GenerateSrtProgram(info, pass_info); + + // Assign offsets to duplicate readconsts + for (IR::Inst* readconst : all_readconsts) { + ASSERT(pass_info.vn_to_inst.contains(pass_info.gvn_table.GetValueNumber(readconst))); + IR::Inst* original = pass_info.DeduplicateInstruction(readconst); + readconst->SetFlags(original->Flags<u32>()); + } + + info.RefreshFlatBuf(); +} + +} // namespace Shader::Optimization \ No newline at end of file diff --git a/src/shader_recompiler/ir/passes/ir_passes.h 
b/src/shader_recompiler/ir/passes/ir_passes.h index e6e389d15d..7bd47992c8 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -12,6 +12,7 @@ void SsaRewritePass(IR::BlockList& program); void IdentityRemovalPass(IR::BlockList& program); void DeadCodeEliminationPass(IR::Program& program); void ConstantPropagationPass(IR::BlockList& program); +void FlattenExtendedUserdataPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); void LowerSharedMemToRegisters(IR::Program& program); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 0d91baddad..d5b5c5c999 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -13,12 +13,7 @@ namespace Shader::Optimization { namespace { -struct SharpLocation { - u32 sgpr_base; - u32 dword_offset; - - auto operator<=>(const SharpLocation&) const = default; -}; +using SharpLocation = u32; bool IsBufferAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { @@ -155,9 +150,7 @@ class Descriptors { if (desc.is_gds_buffer && existing.is_gds_buffer) { return true; } - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset && - desc.inline_cbuf == existing.inline_cbuf; + return desc.sharp_idx == existing.sharp_idx && desc.inline_cbuf == existing.inline_cbuf; })}; auto& buffer = buffer_resources[index]; buffer.used_types |= desc.used_types; @@ -167,8 +160,7 @@ class Descriptors { u32 Add(const TextureBufferResource& desc) { const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + return desc.sharp_idx == existing.sharp_idx; })}; auto& buffer = texture_buffer_resources[index]; 
buffer.is_written |= desc.is_written; @@ -177,8 +169,7 @@ class Descriptors { u32 Add(const ImageResource& desc) { const u32 index{Add(image_resources, desc, [&desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + return desc.sharp_idx == existing.sharp_idx; })}; auto& image = image_resources[index]; image.is_storage |= desc.is_storage; @@ -187,8 +178,7 @@ class Descriptors { u32 Add(const SamplerResource& desc) { const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + return desc.sharp_idx == existing.sharp_idx; })}; return index; } @@ -259,48 +249,25 @@ std::pair TryDisableAnisoLod0(const IR::Inst* inst) { return {prod2, true}; } -SharpLocation TrackSharp(const IR::Inst* inst) { +SharpLocation TrackSharp(const IR::Inst* inst, const Shader::Info& info) { // Search until we find a potential sharp source. - const auto pred0 = [](const IR::Inst* inst) -> std::optional { + const auto pred = [](const IR::Inst* inst) -> std::optional { if (inst->GetOpcode() == IR::Opcode::GetUserData || inst->GetOpcode() == IR::Opcode::ReadConst) { return inst; } return std::nullopt; }; - const auto result = IR::BreadthFirstSearch(inst, pred0); + const auto result = IR::BreadthFirstSearch(inst, pred); ASSERT_MSG(result, "Unable to track sharp source"); inst = result.value(); - // If its from user data not much else to do. 
if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return SharpLocation{ - .sgpr_base = u32(IR::ScalarReg::Max), - .dword_offset = u32(inst->Arg(0).ScalarReg()), - }; + return static_cast(inst->Arg(0).ScalarReg()); + } else { + ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, + "Sharp load not from constant memory"); + return inst->Flags(); } - ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory"); - - // Retrieve offset from base. - const u32 dword_offset = inst->Arg(1).U32(); - const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive(); - - // Retrieve SGPR pair that holds sbase - const auto pred1 = [](const IR::Inst* inst) -> std::optional { - ASSERT(inst->GetOpcode() != IR::Opcode::ReadConst); - if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return inst->Arg(0).ScalarReg(); - } - return std::nullopt; - }; - const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1); - const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1); - ASSERT_MSG(base0 && base1, "Nested resource loads not supported"); - - // Return retrieved location. - return SharpLocation{ - .sgpr_base = u32(base0.value()), - .dword_offset = dword_offset, - }; } s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, @@ -327,8 +294,7 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, cbuf = std::bit_cast(buffer); // Assign a binding to this sharp. 
return descriptors.Add(BufferResource{ - .sgpr_base = std::numeric_limits::max(), - .dword_offset = 0, + .sharp_idx = std::numeric_limits::max(), .used_types = BufferDataType(inst, cbuf.GetNumberFmt()), .inline_cbuf = cbuf, }); @@ -341,11 +307,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) { IR::Inst* handle = inst.Arg(0).InstRecursive(); IR::Inst* producer = handle->Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer); - buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); + const auto sharp = TrackSharp(producer, info); + buffer = info.ReadUdSharp(sharp); binding = descriptors.Add(BufferResource{ - .sgpr_base = sharp.sgpr_base, - .dword_offset = sharp.dword_offset, + .sharp_idx = sharp, .used_types = BufferDataType(inst, buffer.GetNumberFmt()), .is_written = IsBufferStore(inst), }); @@ -404,11 +369,10 @@ void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { const IR::Inst* handle = inst.Arg(0).InstRecursive(); const IR::Inst* producer = handle->Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer); - const auto buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); + const auto sharp = TrackSharp(producer, info); + const auto buffer = info.ReadUdSharp(sharp); const s32 binding = descriptors.Add(TextureBufferResource{ - .sgpr_base = sharp.sgpr_base, - .dword_offset = sharp.dword_offset, + .sharp_idx = sharp, .nfmt = buffer.GetNumberFmt(), .is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32, }); @@ -456,18 +420,16 @@ void PatchImageSampleInstruction(IR::Block& block, IR::Inst& inst, Info& info, if (handle.IsImmediate()) { LOG_WARNING(Render_Vulkan, "Inline sampler detected"); return descriptors.Add(SamplerResource{ - .sgpr_base = std::numeric_limits::max(), - .dword_offset = 0, + .sharp_idx = std::numeric_limits::max(), .inline_sampler = 
AmdGpu::Sampler{.raw0 = handle.U32()}, }); } // Normal sampler resource. const auto ssharp_handle = handle.InstRecursive(); const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle); - const auto ssharp = TrackSharp(ssharp_ud); + const auto ssharp = TrackSharp(ssharp_ud, info); return descriptors.Add(SamplerResource{ - .sgpr_base = ssharp.sgpr_base, - .dword_offset = ssharp.dword_offset, + .sharp_idx = ssharp, .associated_image = image_binding, .disable_aniso = disable_aniso, }); @@ -647,9 +609,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer; // Read image sharp. - const auto tsharp = TrackSharp(tsharp_handle); + const auto tsharp = TrackSharp(tsharp_handle, info); const auto inst_info = inst.Flags(); - auto image = info.ReadUd(tsharp.sgpr_base, tsharp.dword_offset); + auto image = info.ReadUdSharp(tsharp); if (!image.Valid()) { LOG_ERROR(Render_Vulkan, "Shader compiled with unbound image!"); image = AmdGpu::Image::Null(); @@ -658,8 +620,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const bool is_storage = IsImageStorageInstruction(inst); const auto type = image.IsPartialCubemap() ? AmdGpu::ImageType::Color2DArray : image.GetType(); u32 image_binding = descriptors.Add(ImageResource{ - .sgpr_base = tsharp.sgpr_base, - .dword_offset = tsharp.dword_offset, + .sharp_idx = tsharp, .type = type, .nfmt = image.GetNumberFmt(), .is_storage = is_storage, @@ -763,6 +724,7 @@ void PatchDataRingInstruction(IR::Block& block, IR::Inst& inst, Info& info, void ResourceTrackingPass(IR::Program& program) { // Iterate resource instructions and patch them after finding the sharp. 
auto& info = program.info; + Descriptors descriptors{info}; for (IR::Block* const block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index e995852d51..8b93d72e37 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -63,6 +63,9 @@ void Visit(Info& info, IR::Inst& inst) { case IR::Opcode::LaneId: info.uses_lane_id = true; break; + case IR::Opcode::ReadConst: + info.has_readconst = true; + break; default: break; } diff --git a/src/shader_recompiler/ir/passes/srt.h b/src/shader_recompiler/ir/passes/srt.h new file mode 100644 index 0000000000..95e8902a5c --- /dev/null +++ b/src/shader_recompiler/ir/passes/srt.h @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include "common/alignment.h" +#include "common/assert.h" +#include "common/types.h" +#include "xbyak/xbyak.h" + +namespace Shader { + +using PFN_SrtWalker = void PS4_SYSV_ABI (*)(const u32* /*user_data*/, u32* /*flat_dst*/); + +// Utility for copying a simple relocatable function from a Xbyak code generator to manage memory +// separately +class SmallCodeArray { +public: + SmallCodeArray() : bufsize(0), codebuf(nullptr) {} + SmallCodeArray& operator=(SmallCodeArray&& other) = default; + SmallCodeArray(SmallCodeArray&& other) = default; + + SmallCodeArray& operator=(const SmallCodeArray& other) { + *this = SmallCodeArray(reinterpret_cast<const u8*>(other.codebuf.get()), other.bufsize); + return *this; + } + SmallCodeArray(const SmallCodeArray& other) { + *this = other; + }; + + SmallCodeArray(const u8* code, size_t codesize) : SmallCodeArray() { + size_t pagesize = Xbyak::inner::getPageSize(); + bufsize = 
Common::AlignUp(codesize, pagesize); + if (bufsize > 0) { + auto fn = reinterpret_cast(boost::alignment::aligned_alloc(pagesize, bufsize)); + ASSERT(fn); + codebuf = aligned_unique_ptr(fn); + memcpy(codebuf.get(), code, codesize); + Xbyak::CodeArray::protect(codebuf.get(), bufsize, Xbyak::CodeArray::PROTECT_RE); + } + } + + ~SmallCodeArray() { + if (bufsize > 0) { + Xbyak::CodeArray::protect(codebuf.get(), bufsize, Xbyak::CodeArray::PROTECT_RW); + } + } + + template + F getCode() const { + return reinterpret_cast(codebuf.get()); + } + +private: + using aligned_unique_ptr = std::unique_ptr; + + size_t bufsize; + aligned_unique_ptr codebuf; +}; + +struct PersistentSrtInfo { + // Special case when fetch shader uses step rates. + struct SrtSharpReservation { + u32 sgpr_base; + u32 dword_offset; + u32 num_dwords; + }; + + SmallCodeArray walker; + boost::container::small_vector srt_reservations; + u32 flattened_bufsize_dw = 16; // NumUserDataRegs + + // Special case for fetch shaders because we don't generate IR to read from step rate buffers, + // so we won't see usage with GetUserData/ReadConst. 
+ // Reserve space in the flattened buffer for a sharp ahead of time + u32 ReserveSharp(u32 sgpr_base, u32 dword_offset, u32 num_dwords) { + u32 rv = flattened_bufsize_dw; + srt_reservations.emplace_back(sgpr_base, dword_offset, num_dwords); + flattened_bufsize_dw += num_dwords; + return rv; + } +}; + +} // namespace Shader \ No newline at end of file diff --git a/src/shader_recompiler/ir/srt_gvn_table.h b/src/shader_recompiler/ir/srt_gvn_table.h new file mode 100644 index 0000000000..232ee6152d --- /dev/null +++ b/src/shader_recompiler/ir/srt_gvn_table.h @@ -0,0 +1,157 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include "common/assert.h" +#include "common/hash.h" +#include "common/types.h" +#include "shader_recompiler/ir/breadth_first_search.h" +#include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/value.h" + +namespace Shader::Optimization { + +// Does global value numbering on a subset of instructions that are used +// for loads from shader resource tables. 
+// Inspiration from spirv-opt + +class SrtGvnTable { +public: + using ValueNumberTable = std::unordered_map; + using ValueNum = u32; + + SrtGvnTable() : value_numbers(), next_num(0) {} + + u32 GetValueNumber(IR::Inst* inst) { + return GetValueNumber(IR::Value{inst}); + } + + u32 GetValueNumber(IR::Value v) { + v = v.Resolve(); + if (auto it = value_numbers.find(v); it != value_numbers.end()) { + return it->second; + } + if (auto inst = v.TryInstRecursive()) { + return ComputeInstValueNumber(inst); + } + return NextValueNumber(v); + } + +private: + u32 ComputeInstValueNumber(IR::Inst* inst) { + ASSERT(!value_numbers.contains( + IR::Value(inst))); // Should always be checking before calling this function + + if (inst->MayHaveSideEffects()) { + return NextValueNumber(IR::Value(inst)); + } + + u32 vn; + + switch (inst->GetOpcode()) { + case IR::Opcode::Phi: { + // hack to get to parity with main + // Need to fix ssa_rewrite pass to remove certain phis + std::optional source = TryRemoveTrivialPhi(inst); + if (!source) { + const auto pred = [](IR::Inst* inst) -> std::optional { + if (inst->GetOpcode() == IR::Opcode::GetUserData || + inst->GetOpcode() == IR::Opcode::CompositeConstructU32x2 || + inst->GetOpcode() == IR::Opcode::ReadConst) { + return inst; + } + return std::nullopt; + }; + source = IR::BreadthFirstSearch(inst, pred).transform([](auto inst) { + return IR::Value{inst}; + }); + ASSERT(source); + } + vn = GetValueNumber(source.value()); + value_numbers[IR::Value(inst)] = vn; + break; + } + case IR::Opcode::GetUserData: + case IR::Opcode::CompositeConstructU32x2: + case IR::Opcode::ReadConst: { + InstVector iv = MakeInstVector(inst); + if (auto it = iv_to_vn.find(iv); it != iv_to_vn.end()) { + vn = it->second; + value_numbers[IR::Value(inst)] = vn; + } else { + vn = NextValueNumber(IR::Value(inst)); + iv_to_vn.emplace(std::move(iv), vn); + } + break; + } + default: + vn = NextValueNumber(IR::Value(inst)); + break; + } + + return vn; + } + + u32 
NextValueNumber(IR::Value v) { + u32 rv = next_num++; + value_numbers[v] = rv; + return rv; + } + + ValueNumberTable value_numbers; + u32 next_num; + + using InstVector = boost::container::small_vector; + + InstVector MakeInstVector(IR::Inst* inst) { + ASSERT(inst->GetOpcode() != IR::Opcode::Identity); + InstVector iv; + iv.reserve(2 + inst->NumArgs()); + iv.push_back(static_cast(inst->GetOpcode())); + iv.push_back(inst->Flags()); + for (auto i = 0; i < inst->NumArgs(); i++) { + iv.push_back(GetValueNumber(inst->Arg(i))); + } + return iv; + } + + // Temp workaround for something like this: + // [0000555558a5baf8] %297 = Phi [ %24, {Block $1} ], [ %297, {Block $5} ] (uses: 4) + // [0000555558a4e038] %305 = CompositeConstructU32x2 %297, %296 (uses: 4) + // [0000555558a4e0a8] %306 = ReadConst %305, #0 (uses: 2) + // Should probably be fixed in ssa_rewrite + std::optional TryRemoveTrivialPhi(IR::Inst* phi) { + IR::Value single_source{}; + + for (auto i = 0; i < phi->NumArgs(); i++) { + IR::Value v = phi->Arg(i).Resolve(); + if (v == IR::Value(phi)) { + continue; + } + if (!single_source.IsEmpty() && single_source != v) { + return std::nullopt; + } + single_source = v; + } + + ASSERT(!single_source.IsEmpty()); + phi->ReplaceUsesWith(single_source); + return single_source; + } + + struct HashInstVector { + size_t operator()(const InstVector& iv) const { + u32 h = 0; + for (auto vn : iv) { + h = HashCombine(vn, h); + } + return h; + } + }; + + std::unordered_map iv_to_vn; +}; + +} // namespace Shader::Optimization \ No newline at end of file diff --git a/src/shader_recompiler/ir/value.cpp b/src/shader_recompiler/ir/value.cpp index cf7a70f768..889e99556d 100644 --- a/src/shader_recompiler/ir/value.cpp +++ b/src/shader_recompiler/ir/value.cpp @@ -1,7 +1,9 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include +#include +#include +#include "common/hash.h" #include "shader_recompiler/ir/value.h" namespace 
Shader::IR { @@ -97,3 +99,52 @@ bool Value::operator!=(const Value& other) const { } } // namespace Shader::IR + +namespace std { +std::size_t hash<Shader::IR::Value>::operator()(const Shader::IR::Value& v) const { + using namespace Shader::IR; + + u64 h = HashCombine(static_cast<u64>(v.type), 0ULL); + + switch (v.type) { + case Type::Void: + return h; + case Type::Opaque: + return reinterpret_cast<u64>(v.InstRecursive()); + case Type::ScalarReg: + return HashCombine(static_cast<u64>(v.sreg), h); + case Type::VectorReg: + return HashCombine(static_cast<u64>(v.vreg), h); + case Type::Attribute: + return HashCombine(static_cast<u64>(v.attribute), h); + case Type::U1: + return HashCombine(static_cast<u64>(v.imm_u1), h); + case Type::U8: + return HashCombine(static_cast<u64>(v.imm_u8), h); + case Type::U16: + case Type::F16: + return HashCombine(static_cast<u64>(v.imm_u16), h); + case Type::U32: + case Type::F32: + return HashCombine(static_cast<u64>(v.imm_u32), h); + case Type::U64: + case Type::F64: + return HashCombine(static_cast<u64>(v.imm_u64), h); + case Type::U32x2: + case Type::U32x3: + case Type::U32x4: + case Type::F16x2: + case Type::F16x3: + case Type::F16x4: + case Type::F32x2: + case Type::F32x3: + case Type::F32x4: + case Type::F64x2: + case Type::F64x3: + case Type::F64x4: + default: + break; + } + UNREACHABLE_MSG("Invalid type {}", v.type); +} +} // namespace std diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index a282b91686..7e46747b90 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -29,6 +29,7 @@ class Value { public: Value() noexcept = default; explicit Value(IR::Inst* value) noexcept; + explicit Value(const IR::Inst* value) noexcept; explicit Value(IR::ScalarReg reg) noexcept; explicit Value(IR::VectorReg reg) noexcept; explicit Value(IR::Attribute value) noexcept; @@ -82,6 +83,8 @@ class Value { f64 imm_f64; const char* string_literal; }; + + friend class std::hash<Shader::IR::Value>; }; static_assert(static_cast(IR::Type::Void) == 0, "memset relies on 
IR::Type being zero"); static_assert(std::is_trivially_copyable_v); @@ -364,3 +367,10 @@ inline const char* Value::StringLiteral() const { } } // namespace Shader::IR + +namespace std { +template <> +struct hash { + std::size_t operator()(const Shader::IR::Value& v) const; +}; +} // namespace std \ No newline at end of file diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index e13e5d0095..1de2fedb6c 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/config.h" +#include "common/io_file.h" +#include "common/path_util.h" #include "shader_recompiler/frontend/control_flow_graph.h" #include "shader_recompiler/frontend/decode.h" #include "shader_recompiler/frontend/structured_control_flow.h" @@ -51,6 +54,28 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info Common::ObjectPool gcn_block_pool{64}; Gcn::CFG cfg{gcn_block_pool, program.ins_list}; + bool dump_ir = false; + bool extra_id_removal = true; // TODO remove all this stuff + if (true /*info.pgm_hash == 0x6fd3463f*/) { + dump_ir = true; + } + auto dumpMatchingIR = [&](std::string phase) { + if (dump_ir) { + if (Config::dumpShaders()) { + std::string s = IR::DumpProgram(program); + using namespace Common::FS; + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); + } + const auto filename = fmt::format("{}_{:#018x}_{}.{}.ir.txt", info.stage, + info.pgm_hash, info.perm_idx, phase); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; + file.WriteString(s); + } + } + }; + // Structurize control flow graph and create program. 
program.syntax_list = Shader::Gcn::BuildASL(pools.inst_pool, pools.block_pool, cfg, program.info, runtime_info, profile); @@ -58,16 +83,28 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front()); // Run optimization passes + dumpMatchingIR("pre_ssa"); Shader::Optimization::SsaRewritePass(program.post_order_blocks); + dumpMatchingIR("pre_const_prop"); Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); if (program.info.stage != Stage::Compute) { Shader::Optimization::LowerSharedMemToRegisters(program); } Shader::Optimization::RingAccessElimination(program, runtime_info, program.info.stage); + dumpMatchingIR("pre_hoist_pre_id"); + // Shader::Optimization::IdentityRemovalPass(program.blocks); // temp + if (extra_id_removal) { + Shader::Optimization::IdentityRemovalPass(program.blocks); // temp + } + dumpMatchingIR("pre_flatten"); + Shader::Optimization::FlattenExtendedUserdataPass(program); + dumpMatchingIR("pre_resource_tracking"); Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::IdentityRemovalPass(program.blocks); + dumpMatchingIR("pre_dce"); Shader::Optimization::DeadCodeEliminationPass(program); Shader::Optimization::CollectShaderInfoPass(program); + dumpMatchingIR("final"); return program; } diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index 0a3a696bc2..c25c611e44 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -8,6 +8,7 @@ #include "common/types.h" #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/info.h" +#include "shader_recompiler/ir/passes/srt.h" namespace Shader { @@ -52,6 +53,9 @@ struct StageSpecialization { Backend::Bindings start_) : info{&info_}, runtime_info{runtime_info_}, start{start_} { u32 binding{}; + if (info->has_readconst) { + binding++; + } ForEachSharp(binding, buffers, 
info->buffers, [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { spec.stride = sharp.GetStride(); @@ -90,6 +94,12 @@ struct StageSpecialization { return false; } u32 binding{}; + if (info->has_readconst != other.info->has_readconst) { + return false; + } + if (info->has_readconst) { + binding++; + } for (u32 i = 0; i < buffers.size(); i++) { if (other.bitset[binding++] && buffers[i] != other.buffers[i]) { return false; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index c2993f3d1c..15aae6fdd4 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -4,6 +4,7 @@ #include #include "common/alignment.h" #include "common/scope_exit.h" +#include "common/types.h" #include "shader_recompiler/info.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/buffer_cache/buffer_cache.h" @@ -159,7 +160,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { continue; } - const auto& buffer = vs_info.ReadUd(input.sgpr_base, input.dword_offset); + const auto& buffer = vs_info.ReadUdReg(input.sgpr_base, input.dword_offset); if (buffer.GetSize() == 0) { continue; } @@ -290,6 +291,14 @@ void BufferCache::InlineDataToGds(u32 gds_offset, u32 value) { cmdbuf.updateBuffer(gds_buffer.Handle(), gds_offset, sizeof(u32), &value); } +std::pair BufferCache::ObtainHostUBO(std::span data) { + static constexpr u64 StreamThreshold = CACHING_PAGESIZE; + ASSERT(data.size_bytes() <= StreamThreshold); + const u64 offset = stream_buffer.Copy(reinterpret_cast(data.data()), data.size_bytes(), + instance.UniformMinAlignment()); + return {&stream_buffer, offset}; +} + std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer) { static constexpr u64 StreamThreshold = CACHING_PAGESIZE; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 76309363a4..40804e5599 100644 
--- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -83,6 +83,8 @@ class BufferCache { /// Writes a value to GDS buffer. void InlineDataToGds(u32 gds_offset, u32 value); + [[nodiscard]] std::pair ObtainHostUBO(std::span data); + /// Obtains a buffer for the specified region. [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written, bool is_texel_buffer = false); diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 37a44ddacf..31d472de4c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -26,6 +26,15 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler u32 binding{}; boost::container::small_vector bindings; + + if (info->has_readconst) { + bindings.push_back({ + .binding = binding++, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }); + } for (const auto& buffer : info->buffers) { const auto sharp = buffer.GetSharp(*info); bindings.push_back({ @@ -123,6 +132,20 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, image_infos.clear(); info->PushUd(binding, push_data); + if (info->has_readconst) { + const auto [vk_buffer, offset] = buffer_cache.ObtainHostUBO(info->flattened_ud_buf); + buffer_infos.emplace_back(vk_buffer->Handle(), offset, + info->flattened_ud_buf.size() * sizeof(u32)); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + ++binding.buffer; + } for (const auto& desc : info->buffers) { bool is_storage = true; if (desc.is_gds_buffer) { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp 
b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index cbc0fc5ecb..4167f94138 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -60,7 +60,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul } const auto buffer = - vs_info->ReadUd(input.sgpr_base, input.dword_offset); + vs_info->ReadUdReg(input.sgpr_base, input.dword_offset); if (buffer.GetSize() == 0) { continue; } @@ -327,6 +327,15 @@ void GraphicsPipeline::BuildDescSetLayout() { if (!stage) { continue; } + + if (stage->has_readconst) { + bindings.push_back({ + .binding = binding++, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = gp_stage_flags, + }); + } for (const auto& buffer : stage->buffers) { const auto sharp = buffer.GetSharp(*stage); bindings.push_back({ @@ -402,6 +411,21 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, push_data.step1 = regs.vgt_instance_step_rate_1; } stage->PushUd(binding, push_data); + + if (stage->has_readconst) { + const auto [vk_buffer, offset] = buffer_cache.ObtainHostUBO(stage->flattened_ud_buf); + buffer_infos.emplace_back(vk_buffer->Handle(), offset, + stage->flattened_ud_buf.size() * sizeof(u32)); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + ++binding.buffer; + } for (const auto& buffer : stage->buffers) { const auto vsharp = buffer.GetSharp(*stage); const bool is_storage = buffer.IsStorage(vsharp); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index a06d82eb3c..593d139492 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -4,6 +4,7 @@ #include 
#include "common/config.h" +#include "common/hash.h" #include "common/io_file.h" #include "common/path_util.h" #include "shader_recompiler/backend/spirv/emit_spirv.h" @@ -22,10 +23,6 @@ namespace Vulkan { using Shader::VsOutput; -[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) { - return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)); -} - constexpr static std::array DescriptorHeapSizes = { vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192}, vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024}, @@ -371,7 +368,7 @@ bool PipelineCache::RefreshGraphicsKey() { continue; } const auto& buffer = - vs_info->ReadUd(input.sgpr_base, input.dword_offset); + vs_info->ReadUdReg(input.sgpr_base, input.dword_offset); if (buffer.GetSize() == 0) { continue; } @@ -419,6 +416,7 @@ vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, perm_idx != 0 ? "(permutation)" : ""); DumpShader(code, info.pgm_hash, info.stage, perm_idx, "bin"); + info.perm_idx = perm_idx; const auto ir_program = Shader::TranslateProgram(code, pools, info, runtime_info, profile); const auto spv = Shader::Backend::SPIRV::EmitSPIRV(profile, runtime_info, ir_program, binding); DumpShader(spv, info.pgm_hash, info.stage, perm_idx, "spv"); @@ -444,7 +442,8 @@ std::tuple PipelineCache::GetProgram } Program* program = it_pgm->second; - const auto& info = program->info; + auto& info = program->info; + info.RefreshFlatBuf(); const auto spec = Shader::StageSpecialization(info, runtime_info, binding); size_t perm_idx = program->modules.size(); vk::ShaderModule module{}; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 293dfbe6a0..fe4d2788f2 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -12,6 +12,10 @@ #include "video_core/texture_cache/texture_cache.h" #include "vk_rasterizer.h" +#ifdef MemoryBarrier +#undef 
MemoryBarrier +#endif + namespace Vulkan { Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,