Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RandomX fixes and improvements #271

Merged
merged 1 commit into from
Aug 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/amd/GpuContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ struct GpuContext
Nonce(0)
#ifdef XMRIG_ALGO_RANDOMX
, gcnAsm(1)
, datasetHost(0)
, AsmProgram(nullptr)
, rx_variant(xmrig::VARIANT_AUTO)
, rx_dataset(nullptr)
, rx_scratchpads(nullptr)
, rx_hashes(nullptr)
, rx_entropy(nullptr)
Expand Down Expand Up @@ -131,11 +131,12 @@ struct GpuContext

#ifdef XMRIG_ALGO_RANDOMX
int gcnAsm;
int datasetHost;
cl_program AsmProgram;

uint8_t rx_dataset_seedhash[32];
xmrig::Variant rx_variant;
cl_mem rx_dataset;
static cl_mem rx_dataset[32];
cl_mem rx_scratchpads;
cl_mem rx_hashes;
cl_mem rx_entropy;
Expand Down
40 changes: 30 additions & 10 deletions src/amd/OclGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@

constexpr const char *kSetKernelArgErr = "Error %s when calling clSetKernelArg for kernel %d, argument %d.";

cl_mem GpuContext::rx_dataset[32] = {};


inline static const char *err_to_str(cl_int ret)
{
Expand Down Expand Up @@ -217,10 +219,20 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
#ifdef XMRIG_ALGO_RANDOMX
if (config->algorithm().algo() == xmrig::RANDOM_X) {
const size_t dataset_size = randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE;
ctx->rx_dataset = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret);
if (ret != CL_SUCCESS) {
LOG_ERR("Error %s when calling clCreateBuffer to create RandomX dataset.", err_to_str(ret));
return OCL_ERR_API;

if (!ctx->rx_dataset[ctx->deviceIdx]) {
if (!ctx->datasetHost) {
ctx->rx_dataset[ctx->deviceIdx] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret);
}
else {
randomx_dataset* dataset = Workers::getDataset(nullptr, xmrig::VARIANT_AUTO);
ctx->rx_dataset[ctx->deviceIdx] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, randomx_get_dataset_memory(dataset), &ret);
}

if (ret != CL_SUCCESS) {
LOG_ERR("Error %s when calling clCreateBuffer to create RandomX dataset.", err_to_str(ret));
return OCL_ERR_API;
}
}

ctx->rx_scratchpads = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, (xmrig::rx_select_memory(config->algorithm().variant()) + 64) * g_thd, nullptr, &ret);
Expand Down Expand Up @@ -555,7 +567,7 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
return OCL_ERR_API;
}

if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 3, sizeof(cl_mem), &ctx->rx_dataset)) != CL_SUCCESS) {
if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 3, sizeof(cl_mem), &ctx->rx_dataset[ctx->deviceIdx])) != CL_SUCCESS) {
LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 3);
return OCL_ERR_API;
}
Expand Down Expand Up @@ -617,7 +629,7 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch
// iteration is set in RXRunJob()

// randomx_run
if ((ret = OclLib::setKernelArg(ctx->rx_kernels[10], 0, sizeof(cl_mem), &ctx->rx_dataset)) != CL_SUCCESS) {
if ((ret = OclLib::setKernelArg(ctx->rx_kernels[10], 0, sizeof(cl_mem), &ctx->rx_dataset[ctx->deviceIdx])) != CL_SUCCESS) {
LOG_ERR(kSetKernelArgErr, err_to_str(ret), 10, 0);
return OCL_ERR_API;
}
Expand Down Expand Up @@ -1367,9 +1379,11 @@ size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t targ
if ((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0) || (ctx->rx_variant != variant)) {
memcpy(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash));
ctx->rx_variant = variant;
if ((ret = OclLib::enqueueWriteBuffer(ctx->CommandQueues, ctx->rx_dataset, CL_TRUE, 0, dataset_size, randomx_get_dataset_memory(dataset), 0, nullptr, nullptr)) != CL_SUCCESS) {
LOG_ERR("Error %s when calling clEnqueueWriteBuffer to fill RandomX dataset.", err_to_str(ret));
return OCL_ERR_API;
if (!ctx->datasetHost) {
if ((ret = OclLib::enqueueWriteBuffer(ctx->CommandQueues, ctx->rx_dataset[ctx->deviceIdx], CL_TRUE, 0, dataset_size, randomx_get_dataset_memory(dataset), 0, nullptr, nullptr)) != CL_SUCCESS) {
LOG_ERR("Error %s when calling clEnqueueWriteBuffer to fill RandomX dataset.", err_to_str(ret));
return OCL_ERR_API;
}
}
}

Expand Down Expand Up @@ -1600,7 +1614,13 @@ void ReleaseOpenCl(GpuContext* ctx)

#ifdef XMRIG_ALGO_RANDOMX
if (ctx->AsmProgram) OclLib::releaseProgram(ctx->AsmProgram);
if (ctx->rx_dataset) OclLib::releaseMemObject(ctx->rx_dataset);

if (ctx->rx_dataset[ctx->deviceIdx]) {
cl_mem ptr = ctx->rx_dataset[ctx->deviceIdx];
ctx->rx_dataset[ctx->deviceIdx] = nullptr;
OclLib::releaseMemObject(ptr);
}

if (ctx->rx_scratchpads) OclLib::releaseMemObject(ctx->rx_scratchpads);
if (ctx->rx_hashes) OclLib::releaseMemObject(ctx->rx_hashes);
if (ctx->rx_entropy) OclLib::releaseMemObject(ctx->rx_entropy);
Expand Down
22 changes: 11 additions & 11 deletions src/amd/opencl/RandomX/randomx_run_gfx803.asm
Original file line number Diff line number Diff line change
Expand Up @@ -358,14 +358,15 @@ main_loop:

v_add_u32 v22, vcc, v6, v36
v_addc_u32 v25, vcc, v20, 0, vcc
v_add_u32 v21, vcc, v22, v1
v_addc_u32 v22, vcc, v25, 0, vcc
flat_load_dwordx2 v[21:22], v[21:22]
v_or_b32 v30, v32, v13
v_and_b32 v31, v33, v19
v_or_b32 v31, v31, v14
v_or_b32 v28, v28, v15
v_and_b32 v29, v29, v19
v_or_b32 v29, v29, v16
v_add_u32 v21, vcc, v22, v1
v_addc_u32 v22, vcc, v25, 0, vcc
ds_write2_b64 v5, v[30:31], v[28:29] offset1:1
s_waitcnt lgkmcnt(0)

Expand Down Expand Up @@ -402,6 +403,13 @@ main_loop:
# call JIT code
s_swappc_b64 s[12:13], s[4:5]

# Write out group F,E registers
# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22

# store VM integer registers
v_writelane_b32 v28, s16, 0
v_writelane_b32 v29, s17, 0
Expand All @@ -420,21 +428,13 @@ main_loop:
v_writelane_b32 v28, s30, 7
v_writelane_b32 v29, s31, 7

# Write out group F,E registers
# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22

# Restore execution mask
s_mov_b64 exec, s[36:37]

# Write out VM integer registers
ds_write_b64 v17, v[28:29]

flat_load_dwordx2 v[21:22], v[21:22]
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_waitcnt lgkmcnt(0)
v_xor_b32 v21, v28, v21
v_xor_b32 v22, v29, v22
ds_read_b32 v28, v7
Expand Down
30 changes: 15 additions & 15 deletions src/amd/opencl/RandomX/randomx_run_gfx803.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,20 +134,20 @@ static unsigned char randomx_run_gfx803_bin[]={
,0xd8,0x00,0x00,0x00,0x18,0x00,0x00,0xec,0xd8,0x0c,0x00,0x00,0x1a,0x7f,0x00,0x8c,0xbf,0x1b,0x33,0x32,0x2a,0x1a,0x31,0x30,0x2a,0x19,0x15,0x14,0x2a,0x18,0x2f,0x2e
,0x2a,0x56,0x14,0x14,0x26,0x56,0x2e,0x2e,0x26,0x0a,0x03,0x14,0x32,0x17,0x03,0x2e,0x32,0x02,0x15,0x34,0x32,0x1b,0x6a,0x1c,0xd1,0x03,0x01,0xa9,0x01,0x02,0x2f,0x2e
,0x32,0x18,0x6a,0x1c,0xd1,0x03,0x01,0xa9,0x01,0x00,0x00,0x54,0xdc,0x1a,0x00,0x00,0x1c,0x00,0x00,0x54,0xdc,0x17,0x00,0x00,0x1e,0x71,0x0f,0x8c,0xbf,0x1c,0x09,0x40
,0x7e,0x1d,0x09,0x38,0x7e,0x70,0x0f,0x8c,0xbf,0x15,0x3d,0x44,0x2a,0x16,0x3f,0x46,0x2a,0x06,0x49,0x2c,0x32,0x19,0x6a,0x1c,0xd1,0x14,0x01,0xa9,0x01,0x20,0x1b,0x3c
,0x28,0x21,0x27,0x3e,0x26,0x1f,0x1d,0x3e,0x28,0x1c,0x1f,0x38,0x28,0x1d,0x27,0x3a,0x26,0x1d,0x21,0x3a,0x28,0x16,0x03,0x2a,0x32,0x16,0x6a,0x1c,0xd1,0x19,0x01,0xa9
,0x01,0x00,0x01,0x9c,0xd8,0x05,0x1e,0x1c,0x00,0x7f,0x00,0x8c,0xbf,0x83,0x01,0xfe,0xbe,0x08,0x0a,0xee,0xd8,0x29,0x00,0x00,0x3c,0x0c,0x0e,0xee,0xd8,0x29,0x00,0x00
,0x40,0x10,0x12,0xee,0xd8,0x29,0x00,0x00,0x44,0x14,0x16,0xee,0xd8,0x29,0x00,0x00,0x48,0x10,0x00,0x89,0xd2,0x22,0x01,0x01,0x00,0x11,0x00,0x89,0xd2,0x23,0x01,0x01
,0x00,0x12,0x00,0x89,0xd2,0x22,0x03,0x01,0x00,0x13,0x00,0x89,0xd2,0x23,0x03,0x01,0x00,0x14,0x00,0x89,0xd2,0x22,0x05,0x01,0x00,0x15,0x00,0x89,0xd2,0x23,0x05,0x01
,0x00,0x16,0x00,0x89,0xd2,0x22,0x07,0x01,0x00,0x17,0x00,0x89,0xd2,0x23,0x07,0x01,0x00,0x18,0x00,0x89,0xd2,0x22,0x09,0x01,0x00,0x19,0x00,0x89,0xd2,0x23,0x09,0x01
,0x00,0x1a,0x00,0x89,0xd2,0x22,0x0b,0x01,0x00,0x1b,0x00,0x89,0xd2,0x23,0x0b,0x01,0x00,0x1c,0x00,0x89,0xd2,0x22,0x0d,0x01,0x00,0x1d,0x00,0x89,0xd2,0x23,0x0d,0x01
,0x00,0x1e,0x00,0x89,0xd2,0x22,0x0f,0x01,0x00,0x1f,0x00,0x89,0xd2,0x23,0x0f,0x01,0x00,0x7f,0x00,0x8c,0xbf,0x04,0x1e,0x8c,0xbe,0x1c,0x00,0x8a,0xd2,0x10,0x00,0x01
,0x00,0x1d,0x00,0x8a,0xd2,0x11,0x00,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x12,0x02,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x13,0x02,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x14,0x04,0x01
,0x00,0x1d,0x00,0x8a,0xd2,0x15,0x04,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x16,0x06,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x17,0x06,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x18,0x08,0x01
,0x00,0x1d,0x00,0x8a,0xd2,0x19,0x08,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x1a,0x0a,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x1b,0x0a,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x1c,0x0c,0x01
,0x00,0x1d,0x00,0x8a,0xd2,0x1d,0x0c,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x1e,0x0e,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x1f,0x0e,0x01,0x00,0x08,0x0a,0x9c,0xd8,0x29,0x3c,0x3e
,0x00,0x0c,0x0e,0x9c,0xd8,0x29,0x40,0x42,0x00,0x10,0x12,0x9c,0xd8,0x29,0x44,0x46,0x00,0x14,0x16,0x9c,0xd8,0x29,0x48,0x4a,0x00,0x24,0x01,0xfe,0xbe,0x00,0x00,0x9a
,0xd8,0x11,0x1c,0x00,0x00,0x00,0x00,0x54,0xdc,0x15,0x00,0x00,0x15,0x70,0x00,0x8c,0xbf,0x1c,0x2b,0x2a,0x2a,0x1d,0x2d,0x2c,0x2a,0x00,0x00,0x6c,0xd8,0x07,0x00,0x00
,0x7e,0x1d,0x09,0x38,0x7e,0x70,0x0f,0x8c,0xbf,0x15,0x3d,0x44,0x2a,0x16,0x3f,0x46,0x2a,0x06,0x49,0x2c,0x32,0x19,0x6a,0x1c,0xd1,0x14,0x01,0xa9,0x01,0x16,0x03,0x2a
,0x32,0x16,0x6a,0x1c,0xd1,0x19,0x01,0xa9,0x01,0x00,0x00,0x54,0xdc,0x15,0x00,0x00,0x15,0x20,0x1b,0x3c,0x28,0x21,0x27,0x3e,0x26,0x1f,0x1d,0x3e,0x28,0x1c,0x1f,0x38
,0x28,0x1d,0x27,0x3a,0x26,0x1d,0x21,0x3a,0x28,0x00,0x01,0x9c,0xd8,0x05,0x1e,0x1c,0x00,0x7f,0x00,0x8c,0xbf,0x83,0x01,0xfe,0xbe,0x08,0x0a,0xee,0xd8,0x29,0x00,0x00
,0x3c,0x0c,0x0e,0xee,0xd8,0x29,0x00,0x00,0x40,0x10,0x12,0xee,0xd8,0x29,0x00,0x00,0x44,0x14,0x16,0xee,0xd8,0x29,0x00,0x00,0x48,0x10,0x00,0x89,0xd2,0x22,0x01,0x01
,0x00,0x11,0x00,0x89,0xd2,0x23,0x01,0x01,0x00,0x12,0x00,0x89,0xd2,0x22,0x03,0x01,0x00,0x13,0x00,0x89,0xd2,0x23,0x03,0x01,0x00,0x14,0x00,0x89,0xd2,0x22,0x05,0x01
,0x00,0x15,0x00,0x89,0xd2,0x23,0x05,0x01,0x00,0x16,0x00,0x89,0xd2,0x22,0x07,0x01,0x00,0x17,0x00,0x89,0xd2,0x23,0x07,0x01,0x00,0x18,0x00,0x89,0xd2,0x22,0x09,0x01
,0x00,0x19,0x00,0x89,0xd2,0x23,0x09,0x01,0x00,0x1a,0x00,0x89,0xd2,0x22,0x0b,0x01,0x00,0x1b,0x00,0x89,0xd2,0x23,0x0b,0x01,0x00,0x1c,0x00,0x89,0xd2,0x22,0x0d,0x01
,0x00,0x1d,0x00,0x89,0xd2,0x23,0x0d,0x01,0x00,0x1e,0x00,0x89,0xd2,0x22,0x0f,0x01,0x00,0x1f,0x00,0x89,0xd2,0x23,0x0f,0x01,0x00,0x7f,0x00,0x8c,0xbf,0x04,0x1e,0x8c
,0xbe,0x08,0x0a,0x9c,0xd8,0x29,0x3c,0x3e,0x00,0x0c,0x0e,0x9c,0xd8,0x29,0x40,0x42,0x00,0x10,0x12,0x9c,0xd8,0x29,0x44,0x46,0x00,0x14,0x16,0x9c,0xd8,0x29,0x48,0x4a
,0x00,0x1c,0x00,0x8a,0xd2,0x10,0x00,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x11,0x00,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x12,0x02,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x13,0x02,0x01
,0x00,0x1c,0x00,0x8a,0xd2,0x14,0x04,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x15,0x04,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x16,0x06,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x17,0x06,0x01
,0x00,0x1c,0x00,0x8a,0xd2,0x18,0x08,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x19,0x08,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x1a,0x0a,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x1b,0x0a,0x01
,0x00,0x1c,0x00,0x8a,0xd2,0x1c,0x0c,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x1d,0x0c,0x01,0x00,0x1c,0x00,0x8a,0xd2,0x1e,0x0e,0x01,0x00,0x1d,0x00,0x8a,0xd2,0x1f,0x0e,0x01
,0x00,0x24,0x01,0xfe,0xbe,0x00,0x00,0x9a,0xd8,0x11,0x1c,0x00,0x00,0x7f,0x00,0x8c,0xbf,0x1c,0x2b,0x2a,0x2a,0x1d,0x2d,0x2c,0x2a,0x00,0x00,0x6c,0xd8,0x07,0x00,0x00
,0x1c,0x00,0x00,0x6c,0xd8,0x08,0x00,0x00,0x1d,0x00,0x00,0x9a,0xd8,0x11,0x15,0x00,0x00,0x7f,0x01,0x8c,0xbf,0x08,0x10,0xee,0xd8,0x11,0x00,0x00,0x1e,0x1c,0x4b,0x14
,0x2a,0x7f,0x00,0x8c,0xbf,0x20,0x3d,0x3c,0x2a,0x21,0x3f,0x3e,0x2a,0x0a,0x3b,0x14,0x2a,0x00,0x00,0x74,0xdc,0x1a,0x15,0x00,0x00,0xff,0x14,0x14,0x26,0xc0,0xff,0xff
,0x7f,0x00,0x00,0x74,0xdc,0x17,0x1e,0x00,0x00,0x02,0x80,0x06,0xbf,0x06,0x00,0x85,0xbf,0x02,0x81,0x82,0x81,0x24,0x03,0x4a,0x7e,0x80,0x02,0x2e,0x7e,0x0a,0x03,0x48
Expand Down Expand Up @@ -215,4 +215,4 @@ static unsigned char randomx_run_gfx803_bin[]={
,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x62
,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x0f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
static const int randomx_run_gfx803_bin_size=6568;
const int randomx_run_gfx803_bin_size=6568;
22 changes: 11 additions & 11 deletions src/amd/opencl/RandomX/randomx_run_gfx900.asm
Original file line number Diff line number Diff line change
Expand Up @@ -345,12 +345,13 @@ main_loop:

v_add_co_u32 v22, vcc, v6, v36
v_addc_co_u32 v25, vcc, v20, 0, vcc
v_add_co_u32 v21, vcc, v22, v1
v_addc_co_u32 v22, vcc, v25, 0, vcc
global_load_dwordx2 v[21:22], v[21:22], off
v_or_b32 v30, v32, v13
v_and_or_b32 v31, v33, v19, v14
v_or_b32 v28, v28, v15
v_and_or_b32 v29, v29, v19, v16
v_add_co_u32 v21, vcc, v22, v1
v_addc_co_u32 v22, vcc, v25, 0, vcc
ds_write2_b64 v5, v[30:31], v[28:29] offset1:1
s_waitcnt lgkmcnt(0)

Expand Down Expand Up @@ -387,6 +388,13 @@ main_loop:
# call JIT code
s_swappc_b64 s[12:13], s[4:5]

# Write out group F,E registers
# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22

# store VM integer registers
v_writelane_b32 v28, s16, 0
v_writelane_b32 v29, s17, 0
Expand All @@ -405,21 +413,13 @@ main_loop:
v_writelane_b32 v28, s30, 7
v_writelane_b32 v29, s31, 7

# Write out group F,E registers
# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22

# Restore execution mask
s_mov_b64 exec, s[36:37]

# Write out VM integer registers
ds_write_b64 v17, v[28:29]

global_load_dwordx2 v[21:22], v[21:22], off
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_waitcnt lgkmcnt(0)
v_xor_b32 v21, v28, v21
v_xor_b32 v22, v29, v22
ds_read_b32 v28, v7
Expand Down
Loading