Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[METAL] Fix memory leaks in Metal runtime #7714

Merged
merged 2 commits into from
Mar 23, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/runtime/contrib/random/mt_random_engine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,10 @@ class RandomEngine {
} else {
runtime::NDArray local = runtime::NDArray::Empty(
std::vector<int64_t>{data->shape, data->shape + data->ndim}, data->dtype, {kDLCPU, 0});
FillData(&local.ToDLPack()->dl_tensor, size);
runtime::NDArray::CopyFromTo(&local.ToDLPack()->dl_tensor, data);
DLManagedTensor* dmt = local.ToDLPack();
echuraev marked this conversation as resolved.
Show resolved Hide resolved
FillData(&dmt->dl_tensor, size);
runtime::NDArray::CopyFromTo(&dmt->dl_tensor, data);
dmt->deleter(dmt);
}
}

Expand Down
258 changes: 137 additions & 121 deletions src/runtime/metal/metal_device_api.mm
Original file line number Diff line number Diff line change
Expand Up @@ -30,50 +30,54 @@
namespace metal {

MetalWorkspace* MetalWorkspace::Global() {
// NOTE: explicitly use new to avoid exit-time destruction of global state
// Global state will be recycled by OS as the process exits.
static MetalWorkspace* inst = new MetalWorkspace();
return inst;
@autoreleasepool {
// NOTE: explicitly use new to avoid exit-time destruction of global state
// Global state will be recycled by OS as the process exits.
static MetalWorkspace* inst = new MetalWorkspace();
return inst;
}
}

void MetalWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) {
this->Init();
size_t index = static_cast<size_t>(ctx.device_id);
if (kind == kExist) {
*rv = int(index < devices.size());
return;
}
ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
switch (kind) {
case kMaxThreadsPerBlock: {
*rv = static_cast<int>([devices[ctx.device_id] maxThreadsPerThreadgroup].width);
break;
@autoreleasepool {
this->Init();
size_t index = static_cast<size_t>(ctx.device_id);
if (kind == kExist) {
*rv = int(index < devices.size());
return;
}
case kWarpSize: {
// Set warp size to be 1 for safety reasons.
*rv = 1;
break;
ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
switch (kind) {
case kMaxThreadsPerBlock: {
*rv = static_cast<int>([devices[ctx.device_id] maxThreadsPerThreadgroup].width);
break;
}
case kWarpSize: {
// Set warp size to be 1 for safety reasons.
*rv = 1;
break;
}
case kMaxSharedMemoryPerBlock:
return;
case kComputeVersion:
return;
case kDeviceName:
return;
case kMaxClockRate:
return;
case kMultiProcessorCount:
return;
case kMaxThreadDimensions:
return;
case kExist:
return;
case kMaxRegistersPerBlock:
return;
case kGcnArch:
return;
case kApiVersion:
return;
}
case kMaxSharedMemoryPerBlock:
return;
case kComputeVersion:
return;
case kDeviceName:
return;
case kMaxClockRate:
return;
case kMultiProcessorCount:
return;
case kMaxThreadDimensions:
return;
case kExist:
return;
case kMaxRegistersPerBlock:
return;
case kGcnArch:
return;
case kApiVersion:
return;
}
}

Expand Down Expand Up @@ -106,7 +110,11 @@ int GetWarpSize(id<MTLDevice> dev) {
ICHECK(f != nil);
id<MTLComputePipelineState> state = [dev newComputePipelineStateWithFunction:f error:&error_msg];
ICHECK(state != nil) << [[error_msg localizedDescription] UTF8String];
return static_cast<int>(state.threadExecutionWidth);
int size = static_cast<int>(state.threadExecutionWidth);
[state release];
[f release];
[lib release];
return size;
}

MetalWorkspace::~MetalWorkspace() {
Expand All @@ -127,14 +135,14 @@ int GetWarpSize(id<MTLDevice> dev) {
#if TARGET_OS_IPHONE
// on iPhone
id<MTLDevice> d = MTLCreateSystemDefaultDevice();
devices.push_back([d retain]);
queues.push_back([[d newCommandQueue] retain]);
devices.push_back(d);
queues.push_back([d newCommandQueue]);
#else
NSArray<id<MTLDevice> >* devs = MTLCopyAllDevices();
for (size_t i = 0; i < devs.count; ++i) {
id<MTLDevice> d = [devs objectAtIndex:i];
devices.push_back([d retain]);
queues.push_back([[d newCommandQueue] retain]);
devices.push_back(d);
queues.push_back([d newCommandQueue]);
LOG(INFO) << "Intializing Metal device " << i << ", name=" << [d.name UTF8String];
warp_size.push_back(GetWarpSize(d));
}
Expand All @@ -147,102 +155,110 @@ int GetWarpSize(id<MTLDevice> dev) {

void* MetalWorkspace::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint) {
this->Init();
id<MTLDevice> dev = GetDevice(ctx);
// GPU memory only
MTLResourceOptions storage_mode = MTLResourceStorageModePrivate;
/*
#if TARGET_OS_IPHONE
storage_mode = MTLResourceStorageModeShared;
#else
storage_mode = MTLResourceStorageModeManaged;
#endif
*/
id<MTLBuffer> buf = [dev newBufferWithLength:nbytes options:storage_mode];
ICHECK(buf != nil);
return (void*)(CFBridgingRetain(buf));
@autoreleasepool {
this->Init();
id<MTLDevice> dev = GetDevice(ctx);
// GPU memory only
MTLResourceOptions storage_mode = MTLResourceStorageModePrivate;
/*
#if TARGET_OS_IPHONE
storage_mode = MTLResourceStorageModeShared;
#else
storage_mode = MTLResourceStorageModeManaged;
#endif
*/
id<MTLBuffer> buf = [dev newBufferWithLength:nbytes options:storage_mode];
ICHECK(buf != nil);
return (void*)(buf);
tqchen marked this conversation as resolved.
Show resolved Hide resolved
}
}

void MetalWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
// MTLBuffer PurgeableState should be set to empty before manual
// release in order to prevent memory leak
[(id<MTLBuffer>)ptr setPurgeableState:MTLPurgeableStateEmpty];
// release the ptr.
CFRelease(ptr);
@autoreleasepool {
// MTLBuffer PurgeableState should be set to empty before manual
// release in order to prevent memory leak
[(id<MTLBuffer>)ptr setPurgeableState:MTLPurgeableStateEmpty];
// release the ptr.
CFRelease(ptr);
}
}

void MetalWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to,
size_t to_offset, size_t size, TVMContext ctx_from,
TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) {
this->Init();
ICHECK(stream == nullptr);
TVMContext ctx = ctx_from;
if (ctx_from.device_type == kDLCPU) ctx = ctx_to;
id<MTLCommandQueue> queue = GetCommandQueue(ctx);
id<MTLCommandBuffer> cb = [queue commandBuffer];
int from_dev_type = static_cast<int>(ctx_from.device_type);
int to_dev_type = static_cast<int>(ctx_to.device_type);
@autoreleasepool {
this->Init();
ICHECK(stream == nullptr);
TVMContext ctx = ctx_from;
if (ctx_from.device_type == kDLCPU) ctx = ctx_to;
id<MTLCommandQueue> queue = GetCommandQueue(ctx);
id<MTLCommandBuffer> cb = [queue commandBuffer];
int from_dev_type = static_cast<int>(ctx_from.device_type);
int to_dev_type = static_cast<int>(ctx_to.device_type);

if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) {
ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy.";
id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
[encoder copyFromBuffer:(__bridge id<MTLBuffer>)(from)
sourceOffset:from_offset
toBuffer:(__bridge id<MTLBuffer>)(to)destinationOffset:to_offset
size:size];
[encoder endEncoding];
[cb commit];
} else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) {
// copy to a local buffer before get into global buffer.
id<MTLBuffer> from_buf = (__bridge id<MTLBuffer>)(from);
if (from_buf.storageMode != MTLStorageModeShared) {
id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size);
if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) {
ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy.";
id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
[encoder copyFromBuffer:from_buf
[encoder copyFromBuffer:(id<MTLBuffer>)(from)
sourceOffset:from_offset
toBuffer:temp
destinationOffset:0
size:size];
[encoder endEncoding];
[cb commit];
[cb waitUntilCompleted];
memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>([temp contents]), size);
} else {
memcpy(static_cast<char*>(to) + to_offset,
static_cast<char*>([from_buf contents]) + from_offset, size);
}
} else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) {
id<MTLBuffer> to_buf = (__bridge id<MTLBuffer>)(to);
if (to_buf.storageMode != MTLStorageModeShared) {
id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size);
memcpy([temp contents], static_cast<const char*>(from) + from_offset, size);
id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
[encoder copyFromBuffer:temp
sourceOffset:0
toBuffer:to_buf
destinationOffset:to_offset
toBuffer:(id<MTLBuffer>)(to)destinationOffset:to_offset
size:size];
[encoder endEncoding];
[cb commit];
[cb waitUntilCompleted];
} else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) {
// copy to a local buffer before get into global buffer.
id<MTLBuffer> from_buf = (id<MTLBuffer>)(from);
if (from_buf.storageMode != MTLStorageModeShared) {
id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size);
id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
[encoder copyFromBuffer:from_buf
sourceOffset:from_offset
toBuffer:temp
destinationOffset:0
size:size];
[encoder endEncoding];
[cb commit];
[cb waitUntilCompleted];
memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>([temp contents]), size);
} else {
memcpy(static_cast<char*>(to) + to_offset,
static_cast<char*>([from_buf contents]) + from_offset, size);
}
} else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) {
id<MTLBuffer> to_buf = (id<MTLBuffer>)(to);
if (to_buf.storageMode != MTLStorageModeShared) {
id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size);
memcpy([temp contents], static_cast<const char*>(from) + from_offset, size);
id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
[encoder copyFromBuffer:temp
sourceOffset:0
toBuffer:to_buf
destinationOffset:to_offset
size:size];
[encoder endEncoding];
[cb commit];
[cb waitUntilCompleted];
} else {
memcpy(static_cast<char*>([to_buf contents]) + to_offset,
static_cast<const char*>(from) + from_offset, size);
}
} else {
memcpy(static_cast<char*>([to_buf contents]) + to_offset,
static_cast<const char*>(from) + from_offset, size);
LOG(FATAL) << "Expect copy from/to Metal or between Metal"
<< ", from=" << from_dev_type << ", to=" << to_dev_type;
}
} else {
LOG(FATAL) << "Expect copy from/to Metal or between Metal"
<< ", from=" << from_dev_type << ", to=" << to_dev_type;
}
}

void MetalWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
ICHECK(stream == nullptr);
// commit an empty command buffer and wait until it completes.
id<MTLCommandQueue> queue = GetCommandQueue(ctx);
id<MTLCommandBuffer> cb = [queue commandBuffer];
[cb commit];
[cb waitUntilCompleted];
@autoreleasepool {
ICHECK(stream == nullptr);
// commit an empty command buffer and wait until it completes.
id<MTLCommandQueue> queue = GetCommandQueue(ctx);
id<MTLCommandBuffer> cb = [queue commandBuffer];
[cb commit];
[cb waitUntilCompleted];
}
}

void* MetalWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) {
Expand All @@ -269,10 +285,10 @@ int GetWarpSize(id<MTLDevice> dev) {
if (temp_buffer_[ctx.device_id] == nil || temp_buffer_[ctx.device_id].length < size) {
id<MTLDevice> dev = MetalWorkspace::Global()->GetDevice(ctx);
if (temp_buffer_[ctx.device_id] != nil) {
[temp_buffer_[ctx.device_id] setPurgeableState:MTLPurgeableStateEmpty];
[temp_buffer_[ctx.device_id] release];
}
temp_buffer_[ctx.device_id] = [[dev newBufferWithLength:size
options:MTLStorageModeShared] retain];
temp_buffer_[ctx.device_id] = [dev newBufferWithLength:size options:MTLStorageModeShared];
}
return temp_buffer_[ctx.device_id];
}
Expand Down
Loading