rpcsx-gpu: fix flip image acquire
fix cmpx
fix cs init ordering
implement s_mulk_i32, s_abs_i32, s_cmovk_i32, s_cmov_b32 and s_cmov_b64
fix s_mul_i32
fix s_cbranch_* for cs
DHrpcs3 committed Oct 4, 2024
1 parent 23226c9 commit 113abf2
Showing 13 changed files with 121 additions and 155 deletions.
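For context on the scalar-op items in the commit message above, here is a minimal C++ sketch (commentary, not part of the commit) of the behavior these instructions are generally documented to have: the conditional moves write their destination only when SCC is set and leave SCC alone, and the multiplies do not update SCC at all, which is what the s_mul_i32 change in rdna.glsl below restores. The names and the bare scc flag are illustrative stand-ins, not rpcsx types.

#include <cstdint>

// Illustrative scalar state; not the emulator's representation.
static bool scc = false;

// S_CMOV_B32 / S_CMOVK_I32: the destination is written only when SCC == 1;
// SCC itself is not modified. S_CMOV_B64 behaves the same with a 64-bit destination.
void sketch_s_cmov_b32(std::uint32_t &sdst, std::uint32_t ssrc) {
  if (scc)
    sdst = ssrc;
}
void sketch_s_cmovk_i32(std::uint32_t &sdst, std::int16_t simm16) {
  if (scc)
    sdst = static_cast<std::uint32_t>(std::int32_t{simm16}); // sign-extended immediate
}

// S_MUL_I32 / S_MULK_I32: low 32 bits of the product; SCC is left untouched.
std::int32_t sketch_s_mul_i32(std::int32_t a, std::int32_t b) { return a * b; }
void sketch_s_mulk_i32(std::int32_t &sdst, std::int16_t simm16) {
  sdst = sdst * std::int32_t{simm16};
}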
18 changes: 13 additions & 5 deletions rpcsx-gpu/Cache.cpp
@@ -1299,21 +1299,21 @@ Cache::ImageView Cache::Tag::getImageView(const ImageKey &key, Access access) {

void Cache::Tag::readMemory(void *target, std::uint64_t address,
std::uint64_t size) {
mParent->flush(*mScheduler, address, size);
// mParent->flush(*mScheduler, address, size);
auto memoryPtr = RemoteMemory{mParent->mVmIm}.getPointer(address);
std::memcpy(target, memoryPtr, size);
}

void Cache::Tag::writeMemory(const void *source, std::uint64_t address,
std::uint64_t size) {
mParent->flush(*mScheduler, address, size);
// mParent->invalidate(*mScheduler, address, size);
auto memoryPtr = RemoteMemory{mParent->mVmIm}.getPointer(address);
std::memcpy(memoryPtr, source, size);
}

int Cache::Tag::compareMemory(const void *source, std::uint64_t address,
std::uint64_t size) {
mParent->flush(*mScheduler, address, size);
// mParent->flush(*mScheduler, address, size);
auto memoryPtr = RemoteMemory{mParent->mVmIm}.getPointer(address);
return std::memcmp(memoryPtr, source, size);
}
@@ -1348,14 +1348,18 @@ void Cache::Tag::release() {
return;
}

std::vector<std::shared_ptr<Entry>> tmpResources;
while (!mStorage->mAcquiredResources.empty()) {
auto resource = std::move(mStorage->mAcquiredResources.back());
mStorage->mAcquiredResources.pop_back();
resource->flush(*this, *mScheduler, 0, ~static_cast<std::uint64_t>(0));
tmpResources.push_back(std::move(resource));
}

mScheduler->submit();
mScheduler->wait();
if (!tmpResources.empty()) {
mScheduler->submit();
mScheduler->wait();
}

mStorage->clear();
auto storageIndex = mStorage - mParent->mTagStorages;
@@ -1865,6 +1869,10 @@ Cache::Cache(Device *device, int vmId) : mDevice(device), mVmIm(vmId) {
}

Cache::~Cache() {
for (auto &samp : mSamplers) {
vkDestroySampler(vk::context->device, samp.second, vk::context->allocator);
}

vkDestroyDescriptorPool(vk::context->device, mDescriptorPool,
vk::context->allocator);

51 changes: 19 additions & 32 deletions rpcsx-gpu/Device.cpp
@@ -242,8 +242,7 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
}

bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
VkCommandBuffer commandBuffer, VkImage swapchainImage,
VkImageView swapchainImageView, VkFence fence) {
VkImage swapchainImage, VkImageView swapchainImageView) {
auto &pipe = graphicsPipes[0];
auto &scheduler = pipe.scheduler;
auto &process = processInfo[pid];
@@ -292,15 +291,11 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
}

// std::printf("displaying buffer %lx\n", buffer.address);
VkCommandBufferBeginInfo beginInfo{};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

vkBeginCommandBuffer(commandBuffer, &beginInfo);

auto cacheTag = getCacheTag(process.vmId, scheduler);
auto &sched = cacheTag.getScheduler();

transitionImageLayout(commandBuffer, swapchainImage,
transitionImageLayout(sched.getCommandBuffer(), swapchainImage,
VK_IMAGE_LAYOUT_UNDEFINED,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
{
@@ -310,11 +305,11 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
});

amdgpu::flip(
cacheTag, commandBuffer, vk::context->swapchainExtent, buffer.address,
cacheTag, vk::context->swapchainExtent, buffer.address,
swapchainImageView, {bufferAttr.width, bufferAttr.height}, flipType,
getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 10 : 8], dfmt, nfmt);

transitionImageLayout(commandBuffer, swapchainImage,
transitionImageLayout(sched.getCommandBuffer(), swapchainImage,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
{
@@ -323,58 +318,50 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
.layerCount = 1,
});

sched.submit();

auto submitCompleteTask = scheduler.createExternalSubmit();

{
vkEndCommandBuffer(commandBuffer);

VkSemaphoreSubmitInfo signalSemSubmitInfos[] = {
VkSemaphoreSubmitInfo waitSemSubmitInfos[] = {
{
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = vk::context->renderCompleteSemaphore,
.semaphore = vk::context->presentCompleteSemaphore,
.value = 1,
.stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
},
{
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = scheduler.getSemaphoreHandle(),
.value = submitCompleteTask,
.stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
.value = submitCompleteTask - 1,
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
},
};

VkSemaphoreSubmitInfo waitSemSubmitInfos[] = {
VkSemaphoreSubmitInfo signalSemSubmitInfos[] = {
{
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = vk::context->presentCompleteSemaphore,
.semaphore = vk::context->renderCompleteSemaphore,
.value = 1,
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
.stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
},
{
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = scheduler.getSemaphoreHandle(),
.value = submitCompleteTask - 1,
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
.value = submitCompleteTask,
.stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
},
};

VkCommandBufferSubmitInfo cmdBufferSubmitInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
.commandBuffer = commandBuffer,
};

VkSubmitInfo2 submitInfo{
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
.waitSemaphoreInfoCount = 1,
.waitSemaphoreInfoCount = 2,
.pWaitSemaphoreInfos = waitSemSubmitInfos,
.commandBufferInfoCount = 1,
.pCommandBufferInfos = &cmdBufferSubmitInfo,
.signalSemaphoreInfoCount = 2,
.pSignalSemaphoreInfos = signalSemSubmitInfos,
};

vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence);
vkQueueWaitIdle(vk::context->presentQueue);
vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, VK_NULL_HANDLE);
}

scheduler.then([=, this, cacheTag = std::move(cacheTag)] {
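A note on the present-queue submit above (commentary, not diff content): the values follow Vulkan's timeline-semaphore model. The submit now waits on vk::context->presentCompleteSemaphore and on the scheduler's timeline reaching submitCompleteTask - 1 (the GPU work recorded just before), and signals renderCompleteSemaphore plus timeline value submitCompleteTask so later work can chain on it; the old fence and vkQueueWaitIdle are gone. A standalone sketch of the timeline mechanism itself, assuming only an already-created VkDevice named device:

#include <vulkan/vulkan.h>
#include <cstdint>

// Create a timeline semaphore whose 64-bit counter starts at 0.
VkSemaphore createTimelineSemaphore(VkDevice device) {
  VkSemaphoreTypeCreateInfo typeInfo{
      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
      .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
      .initialValue = 0,
  };
  VkSemaphoreCreateInfo createInfo{
      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
      .pNext = &typeInfo,
  };
  VkSemaphore semaphore = VK_NULL_HANDLE;
  vkCreateSemaphore(device, &createInfo, nullptr, &semaphore);
  return semaphore;
}

// Block on the host until the counter reaches `value` (what a scheduler's
// wait() for a given submit's completion value boils down to).
void waitTimeline(VkDevice device, VkSemaphore semaphore, std::uint64_t value) {
  VkSemaphoreWaitInfo waitInfo{
      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
      .semaphoreCount = 1,
      .pSemaphores = &semaphore,
      .pValues = &value,
  };
  vkWaitSemaphores(device, &waitInfo, UINT64_MAX);
}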
3 changes: 1 addition & 2 deletions rpcsx-gpu/Device.hpp
@@ -86,8 +86,7 @@ struct Device {
std::uint64_t size);
bool processPipes();
bool flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
VkCommandBuffer commandBuffer, VkImage swapchainImage,
VkImageView swapchainImageView, VkFence fence);
VkImage swapchainImage, VkImageView swapchainImageView);
void mapMemory(std::int64_t pid, std::uint64_t address, std::uint64_t size,
int memoryType, int dmemIndex, int prot, std::int64_t offset);
void registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer);
23 changes: 14 additions & 9 deletions rpcsx-gpu/Renderer.cpp
@@ -285,7 +285,8 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
renderTargetInfo.extent.height = vkViewPortScissor.extent.height;
renderTargetInfo.extent.depth = 1;
renderTargetInfo.dfmt = cbColor.info.dfmt;
renderTargetInfo.nfmt = gnm::toNumericFormat(cbColor.info.nfmt, cbColor.info.dfmt);
renderTargetInfo.nfmt =
gnm::toNumericFormat(cbColor.info.nfmt, cbColor.info.dfmt);
renderTargetInfo.mipCount = 1;
renderTargetInfo.arrayLayerCount = 1;

@@ -423,6 +424,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,

cacheTag.buildDescriptors(descriptorSets[0]);

pipe.scheduler.submit();
pipe.scheduler.afterSubmit([cacheTag = std::move(cacheTag)] {});

auto commandBuffer = pipe.scheduler.getCommandBuffer();
@@ -479,11 +481,14 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
vkCmdSetStencilReference(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0);

VkCullModeFlags cullMode = VK_CULL_MODE_NONE;

if (pipe.uConfig.vgtPrimitiveType != gnm::PrimitiveType::RectList) {
if (pipe.context.paSuScModeCntl.cullBack) {
cullMode |= VK_CULL_MODE_BACK_BIT;
}
if (pipe.context.paSuScModeCntl.cullFront) {
cullMode |= VK_CULL_MODE_FRONT_BIT;
}
}

vkCmdSetCullMode(commandBuffer, cullMode);
@@ -512,6 +517,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,

vkCmdEndRendering(commandBuffer);
pipe.scheduler.submit();
pipe.scheduler.wait();
}

void amdgpu::dispatch(Cache &cache, Scheduler &sched,
@@ -530,14 +536,15 @@ void amdgpu::dispatch(Cache &cache, Scheduler &sched,
pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &shader.handle);
vkCmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ);
sched.afterSubmit([tag = std::move(tag)] {});
sched.submit();
sched.wait();
}
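One detail worth calling out in dispatch (commentary, not diff content): sched.afterSubmit([tag = std::move(tag)] {}) moves the cache tag into an empty callback, which suggests it exists purely to keep the tag alive until the scheduler has processed the submission. A generic sketch of that lifetime-extension idiom, with a made-up DeferredQueue standing in for rpcsx's Scheduler:

#include <functional>
#include <memory>
#include <vector>

struct DeferredQueue {
  std::vector<std::function<void()>> callbacks;
  void afterSubmit(std::function<void()> cb) { callbacks.push_back(std::move(cb)); }
  // Invoked once the corresponding submission has completed.
  void runDeferred() {
    for (auto &cb : callbacks)
      cb();
    callbacks.clear(); // captured resources are released here
  }
};

int main() {
  auto resource = std::make_shared<int>(42); // stands in for a cache tag
  DeferredQueue queue;
  // The empty body does nothing; the capture alone extends the lifetime.
  queue.afterSubmit([keepAlive = std::move(resource)] {});
  queue.runDeferred(); // the resource is destroyed only after the "submit" finished
}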

void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
VkExtent2D targetExtent, std::uint64_t address,
VkImageView target, VkExtent2D imageExtent, FlipType type,
TileMode tileMode, gnm::DataFormat dfmt,
gnm::NumericFormat nfmt) {
void amdgpu::flip(Cache::Tag &cacheTag, VkExtent2D targetExtent,
std::uint64_t address, VkImageView target,
VkExtent2D imageExtent, FlipType type, TileMode tileMode,
gnm::DataFormat dfmt, gnm::NumericFormat nfmt) {
ImageKey framebuffer{};
framebuffer.readAddress = address;
framebuffer.type = gnm::TextureType::Dim2D;
@@ -601,8 +608,7 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
.pColorAttachments = colorAttachments,
};

commandBuffer = cacheTag.getScheduler().getCommandBuffer();

auto commandBuffer = cacheTag.getScheduler().getCommandBuffer();
vkCmdBeginRendering(commandBuffer, &renderInfo);

cacheTag.getDevice()->flipPipeline.bind(cacheTag.getScheduler(), type,
Expand All @@ -613,5 +619,4 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,

vkCmdDraw(commandBuffer, 6, 1, 0, 0);
vkCmdEndRendering(commandBuffer);
cacheTag.getScheduler().submit();
}
7 changes: 3 additions & 4 deletions rpcsx-gpu/Renderer.hpp
@@ -15,8 +15,7 @@ void dispatch(Cache &cache, Scheduler &sched,
Registers::ComputeConfig &computeConfig,
std::uint32_t groupCountX, std::uint32_t groupCountY,
std::uint32_t groupCountZ);
void flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
VkExtent2D targetExtent, std::uint64_t address, VkImageView target,
VkExtent2D imageExtent, FlipType type, TileMode tileMode,
gnm::DataFormat dfmt, gnm::NumericFormat nfmt);
void flip(Cache::Tag &cacheTag, VkExtent2D targetExtent, std::uint64_t address,
VkImageView target, VkExtent2D imageExtent, FlipType type,
TileMode tileMode, gnm::DataFormat dfmt, gnm::NumericFormat nfmt);
} // namespace amdgpu
2 changes: 1 addition & 1 deletion rpcsx-gpu/lib/gcn-shader/include/shader/gcn.hpp
@@ -116,7 +116,7 @@ struct Environment {
std::uint8_t numThreadX;
std::uint8_t numThreadY;
std::uint8_t numThreadZ;
bool supportsBarycentric = true;
bool supportsBarycentric = false;
bool supportsInt8 = false;
bool supportsInt64Atomics = false;
bool supportsNonSemanticInfo = false;
39 changes: 32 additions & 7 deletions rpcsx-gpu/lib/gcn-shader/shaders/rdna.glsl
@@ -24,7 +24,7 @@

#define ClampInfToFltMax(x) (isinf(x) ? ((x) < 0 ? -FLT_MAX : FLT_MAX) : (x))
#define ConvertInfToZero(x) (isinf(x) ? 0.0 : (x))
#define Rsqrt(x) (1.0 / sqrt(x))
#define Rsqrt(x) (inversesqrt(x))
#define Rcp(x) (1.0 / x)

#define U32ARRAY_FETCH_BITS(ARRAY, START, BITCOUNT) ((ARRAY[(START) >> 5] >> ((START) & 31)) & ((1 << (BITCOUNT)) - 1))
@@ -577,10 +577,12 @@ void set_cond_thread_bit(inout uint64_t sdst, bool cond) {

void set_cond_thread_bit_exec(inout uint64_t sdst, bool cond) {
uint64_t bit = uint64_t(1) << thread_id;
if (cond && (exec & bit) != 0) {
if (cond) {
sdst |= bit;
exec |= bit;
} else {
sdst &= ~bit;
exec &= ~bit;
}
}
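For reference on the "fix cmpx" item (commentary, not diff content): the V_CMPX_* compares on GCN write each active lane's result to both the destination mask (typically VCC) and to that lane's EXEC bit, which is what the corrected helper above now does from the single-lane view of the emulation. A whole-wave C++ sketch of that semantics; how inactive lanes are treated is simplified here and is an assumption:

#include <cstdint>

// `results` holds one compare bit per lane. Active lanes copy their bit into
// both the destination mask and EXEC; inactive lanes are assumed untouched.
void sketch_v_cmpx(std::uint64_t &sdst, std::uint64_t &exec, std::uint64_t results) {
  std::uint64_t active = exec; // lanes that actually executed the compare
  sdst = (sdst & ~active) | (results & active);
  exec = (exec & ~active) | (results & active); // a lane stays active only if its compare passed
}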

@@ -995,6 +997,23 @@ void s_cmpk_le_u32(uint32_t a, uint32_t b) { scc = a <= b; }
void s_cmpk_lt_u32(uint32_t a, uint32_t b) { scc = a < b; }
void s_cmpk_lg_u32(uint32_t a, uint32_t b) { scc = a != b; }

void s_cmovk_i32(out uint32_t sdst, uint32_t value) {
if (scc) {
sdst = value;
}
}

void s_cmov_b32(out uint32_t sdst, uint32_t value) {
if (scc) {
sdst = value;
}
}

void s_cmov_b64(out uint64_t sdst, uint64_t value) {
if (scc) {
sdst = value;
}
}

uint32_t s_not_b32(uint32_t x) {
uint32_t result = ~x;
@@ -1236,7 +1255,13 @@ int32_t s_ashr_i32(int32_t x, uint32_t y) { int32_t result = x >> (y & 0x1f); sc
int64_t s_ashr_i64(int64_t x, uint32_t y) { int64_t result = x >> (y & 0x3f); scc = result != 0; return result; }
uint32_t s_bfm_b32(uint32_t x, uint32_t y) { uint32_t result = ((1 << (x & 0x1f)) - 1) << (y & 0x1f); scc = result != 0; return result; }
uint64_t s_bfm_b64(uint64_t x, uint64_t y) { uint64_t result = ((uint64_t(1) << (x & 0x1f)) - 1) << (y & 0x1f); scc = result != 0; return result; }
int32_t s_mul_i32(int32_t x, int32_t y) { int32_t result = x * y; scc = result != 0; return result; }
int32_t s_mul_i32(int32_t x, int32_t y) { return x * y; }
int32_t s_mulk_i32(int32_t x, int32_t y) { return x * y; }
int32_t s_abs_i32(int32_t x) {
int32_t result = abs(x);
scc = result == 0;
return result;
}
uint32_t s_bfe_u32(uint32_t x, uint32_t y) {
uint32_t offset = y & 0x1f;
uint32_t width = (y >> 16) & 0x7f;
@@ -2168,10 +2193,10 @@ void s_dcache_inv() {

bool s_cbranch_scc0() { return scc == false; }
bool s_cbranch_scc1() { return scc == true; }
bool s_cbranch_vccz() { return vcc == 0; }
bool s_cbranch_vccnz() { return vcc != 0; }
bool s_cbranch_execz() { return exec == 0; }
bool s_cbranch_execnz() { return exec != 0; }
bool s_cbranch_vccz() { return (vcc & (uint64_t(1) << thread_id)) == 0; }
bool s_cbranch_vccnz() { return (vcc & (uint64_t(1) << thread_id)) != 0; }
bool s_cbranch_execz() { return (exec & (uint64_t(1) << thread_id)) == 0; }
bool s_cbranch_execnz() { return (exec & (uint64_t(1) << thread_id)) != 0; }
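A note on the s_cbranch change (commentary, not diff content): in the ISA these tests are wave-wide; S_CBRANCH_VCCZ takes the branch when the entire 64-bit VCC is zero and S_CBRANCH_EXECZ when the entire EXEC mask is zero. The per-bit form above presumably reflects the compute-shader model, where each invocation stands for a single lane and only its own bit is meaningful. For comparison, the wave-wide semantics in C++:

#include <cstdint>

// Wave-wide branch conditions as documented (sketch over one wavefront's masks).
bool branch_on_vccz(std::uint64_t vcc)    { return vcc == 0; }  // no lane's VCC bit set
bool branch_on_vccnz(std::uint64_t vcc)   { return vcc != 0; }  // some lane's VCC bit set
bool branch_on_execz(std::uint64_t exec)  { return exec == 0; } // no lane active
bool branch_on_execnz(std::uint64_t exec) { return exec != 0; } // at least one lane active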


// DS
17 changes: 9 additions & 8 deletions rpcsx-gpu/lib/gcn-shader/src/GcnConverter.cpp
@@ -1577,25 +1577,26 @@ static void createInitialValues(GcnConverter &converter,
}
}

for (std::int32_t i = 0; i < 3; ++i) {
auto value = builder.createSpvCompositeExtract(loc, uintT,
localInvocationId, {{i}});
context.writeReg(loc, builder, gcn::RegId::Vgpr, i, value);
}

auto workgroupSize = builder.createSpvCompositeConstruct(
loc, uvec3T,
{{context.imm32(env.numThreadX), context.imm32(env.numThreadY),
context.imm32(env.numThreadZ)}});
auto workgroupSizeLocVar =
converter.createLocalVariable(builder, loc, workgroupSize);

builder.createValue(loc, ir::amdgpu::CS_SET_INITIAL_EXEC,
builder.createValue(loc, ir::amdgpu::CS_SET_THREAD_ID,
context.getTypeVoid(), localInvocationIdLocVar,
workgroupSizeLocVar);
builder.createValue(loc, ir::amdgpu::CS_SET_THREAD_ID,

builder.createValue(loc, ir::amdgpu::CS_SET_INITIAL_EXEC,
context.getTypeVoid(), localInvocationIdLocVar,
workgroupSizeLocVar);

for (std::int32_t i = 0; i < 3; ++i) {
auto value = builder.createSpvCompositeExtract(loc, uintT,
localInvocationId, {{i}});
context.writeReg(loc, builder, gcn::RegId::Vgpr, i, value);
}
}

context.writeReg(loc, builder, gcn::RegId::Vcc, 0, context.imm64(0));
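On the "fix cs init ordering" item (commentary, not diff content): the converter now emits CS_SET_THREAD_ID before CS_SET_INITIAL_EXEC and seeds VGPR0..2 from the local invocation id only after both, presumably because the initial EXEC mask, like the exec/vcc helpers in rdna.glsl, is derived from thread_id and therefore needs the lane index to exist first. A toy C++ illustration of that ordering dependency; the names and the mask derivation are assumptions, not the converter's real intrinsics:

#include <cstdint>

struct LaneState {
  std::uint32_t thread_id = 0; // this invocation's lane index in the wavefront
  std::uint64_t exec = 0;      // per-lane execution mask
};

// Must run first: derive the lane index from the flattened local invocation id.
void csSetThreadId(LaneState &s, std::uint32_t flatLocalInvocationId) {
  s.thread_id = flatLocalInvocationId % 64;
}

// Depends on thread_id: without it, this lane's bit cannot be placed in EXEC.
void csSetInitialExec(LaneState &s) {
  s.exec |= std::uint64_t(1) << s.thread_id;
}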