diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 699f6bbcf376..81f46b2dd5d5 100644 --- a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -126,8 +126,9 @@ class RandomEngine { } else { runtime::NDArray local = runtime::NDArray::Empty( std::vector{data->shape, data->shape + data->ndim}, data->dtype, {kDLCPU, 0}); - FillData(&local.ToDLPack()->dl_tensor, size); - runtime::NDArray::CopyFromTo(&local.ToDLPack()->dl_tensor, data); + DLTensor* tensor = const_cast(local.operator->()); + FillData(tensor, size); + runtime::NDArray::CopyFromTo(tensor, data); } } diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 0169a4c2ec28..3d7abd134035 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -30,50 +30,54 @@ namespace metal { MetalWorkspace* MetalWorkspace::Global() { - // NOTE: explicitly use new to avoid exit-time destruction of global state - // Global state will be recycled by OS as the process exits. - static MetalWorkspace* inst = new MetalWorkspace(); - return inst; + @autoreleasepool { + // NOTE: explicitly use new to avoid exit-time destruction of global state + // Global state will be recycled by OS as the process exits. + static MetalWorkspace* inst = new MetalWorkspace(); + return inst; + } } void MetalWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { - this->Init(); - size_t index = static_cast(ctx.device_id); - if (kind == kExist) { - *rv = int(index < devices.size()); - return; - } - ICHECK_LT(index, devices.size()) << "Invalid device id " << index; - switch (kind) { - case kMaxThreadsPerBlock: { - *rv = static_cast([devices[ctx.device_id] maxThreadsPerThreadgroup].width); - break; + @autoreleasepool { + this->Init(); + size_t index = static_cast(ctx.device_id); + if (kind == kExist) { + *rv = int(index < devices.size()); + return; } - case kWarpSize: { - // Set warp size to be 1 for safty reason. - *rv = 1; - break; + ICHECK_LT(index, devices.size()) << "Invalid device id " << index; + switch (kind) { + case kMaxThreadsPerBlock: { + *rv = static_cast([devices[ctx.device_id] maxThreadsPerThreadgroup].width); + break; + } + case kWarpSize: { + // Set warp size to be 1 for safty reason. + *rv = 1; + break; + } + case kMaxSharedMemoryPerBlock: + return; + case kComputeVersion: + return; + case kDeviceName: + return; + case kMaxClockRate: + return; + case kMultiProcessorCount: + return; + case kMaxThreadDimensions: + return; + case kExist: + return; + case kMaxRegistersPerBlock: + return; + case kGcnArch: + return; + case kApiVersion: + return; } - case kMaxSharedMemoryPerBlock: - return; - case kComputeVersion: - return; - case kDeviceName: - return; - case kMaxClockRate: - return; - case kMultiProcessorCount: - return; - case kMaxThreadDimensions: - return; - case kExist: - return; - case kMaxRegistersPerBlock: - return; - case kGcnArch: - return; - case kApiVersion: - return; } } @@ -106,7 +110,11 @@ int GetWarpSize(id dev) { ICHECK(f != nil); id state = [dev newComputePipelineStateWithFunction:f error:&error_msg]; ICHECK(state != nil) << [[error_msg localizedDescription] UTF8String]; - return static_cast(state.threadExecutionWidth); + int size = static_cast(state.threadExecutionWidth); + [state release]; + [f release]; + [lib release]; + return size; } MetalWorkspace::~MetalWorkspace() { @@ -127,14 +135,14 @@ int GetWarpSize(id dev) { #if TARGET_OS_IPHONE // on iPhone id d = MTLCreateSystemDefaultDevice(); - devices.push_back([d retain]); - queues.push_back([[d newCommandQueue] retain]); + devices.push_back(d); + queues.push_back([d newCommandQueue]); #else NSArray >* devs = MTLCopyAllDevices(); for (size_t i = 0; i < devs.count; ++i) { id d = [devs objectAtIndex:i]; - devices.push_back([d retain]); - queues.push_back([[d newCommandQueue] retain]); + devices.push_back(d); + queues.push_back([d newCommandQueue]); LOG(INFO) << "Intializing Metal device " << i << ", name=" << [d.name UTF8String]; warp_size.push_back(GetWarpSize(d)); } @@ -147,102 +155,110 @@ int GetWarpSize(id dev) { void* MetalWorkspace::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) { - this->Init(); - id dev = GetDevice(ctx); - // GPU memory only - MTLResourceOptions storage_mode = MTLResourceStorageModePrivate; - /* - #if TARGET_OS_IPHONE - storage_mode = MTLResourceStorageModeShared; - #else - storage_mode = MTLResourceStorageModeManaged; - #endif - */ - id buf = [dev newBufferWithLength:nbytes options:storage_mode]; - ICHECK(buf != nil); - return (void*)(CFBridgingRetain(buf)); + @autoreleasepool { + this->Init(); + id dev = GetDevice(ctx); + // GPU memory only + MTLResourceOptions storage_mode = MTLResourceStorageModePrivate; + /* + #if TARGET_OS_IPHONE + storage_mode = MTLResourceStorageModeShared; + #else + storage_mode = MTLResourceStorageModeManaged; + #endif + */ + id buf = [dev newBufferWithLength:nbytes options:storage_mode]; + ICHECK(buf != nil); + return (void*)(buf); + } } void MetalWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { - // MTLBuffer PurgeableState should be set to empty before manual - // release in order to prevent memory leak - [(id)ptr setPurgeableState:MTLPurgeableStateEmpty]; - // release the ptr. - CFRelease(ptr); + @autoreleasepool { + // MTLBuffer PurgeableState should be set to empty before manual + // release in order to prevent memory leak + [(id)ptr setPurgeableState:MTLPurgeableStateEmpty]; + // release the ptr. + CFRelease(ptr); + } } void MetalWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) { - this->Init(); - ICHECK(stream == nullptr); - TVMContext ctx = ctx_from; - if (ctx_from.device_type == kDLCPU) ctx = ctx_to; - id queue = GetCommandQueue(ctx); - id cb = [queue commandBuffer]; - int from_dev_type = static_cast(ctx_from.device_type); - int to_dev_type = static_cast(ctx_to.device_type); + @autoreleasepool { + this->Init(); + ICHECK(stream == nullptr); + TVMContext ctx = ctx_from; + if (ctx_from.device_type == kDLCPU) ctx = ctx_to; + id queue = GetCommandQueue(ctx); + id cb = [queue commandBuffer]; + int from_dev_type = static_cast(ctx_from.device_type); + int to_dev_type = static_cast(ctx_to.device_type); - if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) { - ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; - id encoder = [cb blitCommandEncoder]; - [encoder copyFromBuffer:(__bridge id)(from) - sourceOffset:from_offset - toBuffer:(__bridge id)(to)destinationOffset:to_offset - size:size]; - [encoder endEncoding]; - [cb commit]; - } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) { - // copy to a local buffer before get into global buffer. - id from_buf = (__bridge id)(from); - if (from_buf.storageMode != MTLStorageModeShared) { - id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size); + if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) { + ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; id encoder = [cb blitCommandEncoder]; - [encoder copyFromBuffer:from_buf + [encoder copyFromBuffer:(id)(from) sourceOffset:from_offset - toBuffer:temp - destinationOffset:0 - size:size]; - [encoder endEncoding]; - [cb commit]; - [cb waitUntilCompleted]; - memcpy(static_cast(to) + to_offset, static_cast([temp contents]), size); - } else { - memcpy(static_cast(to) + to_offset, - static_cast([from_buf contents]) + from_offset, size); - } - } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) { - id to_buf = (__bridge id)(to); - if (to_buf.storageMode != MTLStorageModeShared) { - id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size); - memcpy([temp contents], static_cast(from) + from_offset, size); - id encoder = [cb blitCommandEncoder]; - [encoder copyFromBuffer:temp - sourceOffset:0 - toBuffer:to_buf - destinationOffset:to_offset + toBuffer:(id)(to)destinationOffset:to_offset size:size]; [encoder endEncoding]; [cb commit]; - [cb waitUntilCompleted]; + } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) { + // copy to a local buffer before get into global buffer. + id from_buf = (id)(from); + if (from_buf.storageMode != MTLStorageModeShared) { + id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size); + id encoder = [cb blitCommandEncoder]; + [encoder copyFromBuffer:from_buf + sourceOffset:from_offset + toBuffer:temp + destinationOffset:0 + size:size]; + [encoder endEncoding]; + [cb commit]; + [cb waitUntilCompleted]; + memcpy(static_cast(to) + to_offset, static_cast([temp contents]), size); + } else { + memcpy(static_cast(to) + to_offset, + static_cast([from_buf contents]) + from_offset, size); + } + } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) { + id to_buf = (id)(to); + if (to_buf.storageMode != MTLStorageModeShared) { + id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size); + memcpy([temp contents], static_cast(from) + from_offset, size); + id encoder = [cb blitCommandEncoder]; + [encoder copyFromBuffer:temp + sourceOffset:0 + toBuffer:to_buf + destinationOffset:to_offset + size:size]; + [encoder endEncoding]; + [cb commit]; + [cb waitUntilCompleted]; + } else { + memcpy(static_cast([to_buf contents]) + to_offset, + static_cast(from) + from_offset, size); + } } else { - memcpy(static_cast([to_buf contents]) + to_offset, - static_cast(from) + from_offset, size); + LOG(FATAL) << "Expect copy from/to Metal or between Metal" + << ", from=" << from_dev_type << ", to=" << to_dev_type; } - } else { - LOG(FATAL) << "Expect copy from/to Metal or between Metal" - << ", from=" << from_dev_type << ", to=" << to_dev_type; } } void MetalWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { - ICHECK(stream == nullptr); - // commit an empty command buffer and wait until it completes. - id queue = GetCommandQueue(ctx); - id cb = [queue commandBuffer]; - [cb commit]; - [cb waitUntilCompleted]; + @autoreleasepool { + ICHECK(stream == nullptr); + // commit an empty command buffer and wait until it completes. + id queue = GetCommandQueue(ctx); + id cb = [queue commandBuffer]; + [cb commit]; + [cb waitUntilCompleted]; + } } void* MetalWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { @@ -269,10 +285,10 @@ int GetWarpSize(id dev) { if (temp_buffer_[ctx.device_id] == nil || temp_buffer_[ctx.device_id].length < size) { id dev = MetalWorkspace::Global()->GetDevice(ctx); if (temp_buffer_[ctx.device_id] != nil) { + [temp_buffer_[ctx.device_id] setPurgeableState:MTLPurgeableStateEmpty]; [temp_buffer_[ctx.device_id] release]; } - temp_buffer_[ctx.device_id] = [[dev newBufferWithLength:size - options:MTLStorageModeShared] retain]; + temp_buffer_[ctx.device_id] = [dev newBufferWithLength:size options:MTLStorageModeShared]; } return temp_buffer_[ctx.device_id]; } diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index 8f1fde86f074..c7e2d8b7b4bc 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -113,7 +113,6 @@ void SaveToBinary(dmlc::Stream* stream) final { LOG(FATAL) << "Fail to compile metal lib:" << [[err_msg localizedDescription] UTF8String]; } } - [e.lib retain]; } id f = [e.lib newFunctionWithName:[NSString stringWithUTF8String:func_name.c_str()]]; @@ -123,11 +122,13 @@ void SaveToBinary(dmlc::Stream* stream) final { ICHECK(state != nil) << "cannot get state:" << " for function " << func_name << [[err_msg localizedDescription] UTF8String]; + [f release]; // The state.threadExecutionWidth can change dynamically according // to the resource constraint in kernel, so it is not strictly hold // Turn of warp aware optimziation for now. // ICHECK_EQ(state.threadExecutionWidth, w->warp_size[device_id]); - e.smap[func_name] = [state retain]; + if (e.smap[func_name] != nil) [e.smap[func_name] release]; + e.smap[func_name] = state; return state; } @@ -181,31 +182,36 @@ void Init(MetalModuleNode* m, ObjectPtr sptr, const std::string& func_na } // invoke the function with void arguments void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const { - metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); - int device_id = t->context.device_id; - if (scache_[device_id] == nil) { - scache_[device_id] = m_->GetPipelineState(device_id, func_name_); - } - ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); - id queue = w_->GetCommandQueue(t->context); - id cb = [queue commandBuffer]; - id encoder = [cb computeCommandEncoder]; - [encoder setComputePipelineState:scache_[device_id]]; - for (size_t i = 0; i < num_buffer_args_; ++i) { - void* buf = args[static_cast(i)]; - [encoder setBuffer:(__bridge id)(buf) offset:0 atIndex:i]; - } - if (num_pack_args_ != 0) { - [encoder setBytes:pack_args - length:num_pack_args_ * sizeof(ArgUnion64) - atIndex:num_buffer_args_]; + @autoreleasepool { + metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); + int device_id = t->context.device_id; + if (scache_[device_id] == nil) { + scache_[device_id] = m_->GetPipelineState(device_id, func_name_); + } + ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); + int blockSize = wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2); + auto maxTotalThreadsPerThreadgroup = scache_[device_id].maxTotalThreadsPerThreadgroup; + CHECK_LE(blockSize, maxTotalThreadsPerThreadgroup); + id queue = w_->GetCommandQueue(t->context); + id cb = [queue commandBuffer]; + id encoder = [cb computeCommandEncoder]; + [encoder setComputePipelineState:scache_[device_id]]; + for (size_t i = 0; i < num_buffer_args_; ++i) { + void* buf = args[static_cast(i)]; + [encoder setBuffer:(id)(buf) offset:0 atIndex:i]; + } + if (num_pack_args_ != 0) { + [encoder setBytes:pack_args + length:num_pack_args_ * sizeof(ArgUnion64) + atIndex:num_buffer_args_]; + } + // launch + MTLSize dimGrid = MTLSizeMake(wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); + MTLSize dimBlock = MTLSizeMake(wl.block_dim(0), wl.block_dim(1), wl.block_dim(2)); + [encoder dispatchThreadgroups:dimGrid threadsPerThreadgroup:dimBlock]; + [encoder endEncoding]; + [cb commit]; } - // launch - MTLSize dimGrid = MTLSizeMake(wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); - MTLSize dimBlock = MTLSizeMake(wl.block_dim(0), wl.block_dim(1), wl.block_dim(2)); - [encoder dispatchThreadgroups:dimGrid threadsPerThreadgroup:dimBlock]; - [encoder endEncoding]; - [cb commit]; } private: @@ -230,23 +236,27 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons PackedFunc MetalModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - ICHECK_EQ(sptr_to_self.get(), this); - ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; - auto it = fmap_.find(name); - if (it == fmap_.end()) return PackedFunc(); - const FunctionInfo& info = it->second; - MetalWrappedFunc f; - size_t num_buffer_args = NumBufferArgs(info.arg_types); - f.Init(this, sptr_to_self, name, num_buffer_args, info.arg_types.size() - num_buffer_args, - info.thread_axis_tags); - return PackFuncNonBufferArg(f, info.arg_types); + @autoreleasepool { + ICHECK_EQ(sptr_to_self.get(), this); + ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + auto it = fmap_.find(name); + if (it == fmap_.end()) return PackedFunc(); + const FunctionInfo& info = it->second; + MetalWrappedFunc f; + size_t num_buffer_args = NumBufferArgs(info.arg_types); + f.Init(this, sptr_to_self, name, num_buffer_args, info.arg_types.size() - num_buffer_args, + info.thread_axis_tags); + return PackFuncNonBufferArg(f, info.arg_types); + } } Module MetalModuleCreate(std::string data, std::string fmt, std::unordered_map fmap, std::string source) { - metal::MetalWorkspace::Global()->Init(); - auto n = make_object(data, fmt, fmap, source); - return Module(n); + @autoreleasepool { + metal::MetalWorkspace::Global()->Init(); + auto n = make_object(data, fmt, fmap, source); + return Module(n); + } } // Load module from module.