diff --git a/DxDispatch/CMakeLists.txt b/DxDispatch/CMakeLists.txt
index 500cdb7e..f3fcd3dd 100644
--- a/DxDispatch/CMakeLists.txt
+++ b/DxDispatch/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.18)
-project(dxdispatch VERSION 0.15.1 LANGUAGES CXX)
+project(dxdispatch VERSION 0.15.3 LANGUAGES CXX)
 
 # ==============================================================================
 # External Libraries/Helpers
diff --git a/DxDispatch/README.md b/DxDispatch/README.md
index 60b6a41c..7e311f13 100644
--- a/DxDispatch/README.md
+++ b/DxDispatch/README.md
@@ -94,7 +94,7 @@ The default redistributable versions of components (e.g. nuget, archives):
 - **Direct3D 12 (nuget)**: [Microsoft.Direct3D.D3D12 (1.610.2)](https://www.nuget.org/packages/Microsoft.Direct3D.D3D12/1.610.2) - 2023/04/20
 - **DX Compiler (archive)**: [December 2022 (v1.7.2212.1)](https://github.com/microsoft/DirectXShaderCompiler/releases/tag/v1.7.2212.1) - 2023/03/02
 - **PIX Event Runtime (nuget)**: [WinPixEventRuntime (1.0.230302001)](https://www.nuget.org/packages/WinPixEventRuntime/1.0.230302001) - 2023/03/02
-- **ONNX Runtime (nuget)**: [Microsoft.ML.OnnxRuntime.DirectML (1.14.1)](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.DirectML/1.14.1) - 2023/02/27
+- **ONNX Runtime (nuget)**: [Microsoft.ML.OnnxRuntime.DirectML (1.15.0)](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.DirectML/1.15.0) - 2023/05/24
 
 Configuration is done using CMake cache variables. For example, Direct3D can be switched to a system dependency by adding `-DDXD_DIRECT3D_TYPE=winsdk` to the command line when first configuring the project. Use `cmake-gui` or `ccmake` to view the available variables.
 
diff --git a/DxDispatch/cgmanifest.json b/DxDispatch/cgmanifest.json
index 156e6c55..751d2835 100644
--- a/DxDispatch/cgmanifest.json
+++ b/DxDispatch/cgmanifest.json
@@ -15,7 +15,7 @@
         "type": "nuget",
         "nuget": {
           "name": "Microsoft.Direct3D.D3D12",
-          "version": "1.608.2"
+          "version": "1.610.2"
         }
       }
     },
@@ -24,7 +24,7 @@
         "type": "nuget",
         "nuget": {
           "name": "Microsoft.AI.DirectML",
-          "version": "1.10.1"
+          "version": "1.12.0"
         }
       }
     },
@@ -33,8 +33,8 @@
         "type": "other",
         "other": {
           "name": "DirectX Shader Compiler",
-          "version": "2022_12_16",
-          "downloadUrl": "https://github.com/microsoft/DirectXShaderCompiler/releases/download/v1.7.2212/dxc_2022_12_16.zip"
+          "version": "2023_03_01",
+          "downloadUrl": "https://github.com/microsoft/DirectXShaderCompiler/releases/download/v1.7.2212.1/dxc_2023_03_01.zip"
         }
       }
     },
@@ -89,7 +89,7 @@
         "type": "nuget",
         "nuget": {
           "name": "WinPixEventRuntime",
-          "version": "1.0.220124001"
+          "version": "1.0.230302001"
         }
       }
     },
@@ -116,7 +116,7 @@
         "type": "nuget",
         "nuget": {
           "name": "Microsoft.ML.OnnxRuntime.DirectML",
-          "version": "1.14.1"
+          "version": "1.15.0"
         }
       }
     }
diff --git a/DxDispatch/cmake/onnxruntime.cmake b/DxDispatch/cmake/onnxruntime.cmake
index f23257cc..933df0ff 100644
--- a/DxDispatch/cmake/onnxruntime.cmake
+++ b/DxDispatch/cmake/onnxruntime.cmake
@@ -52,13 +52,13 @@ function(init_onnxruntime_cache_variables prefix)
 
     # <PREFIX>_ONNXRUNTIME_NUGET_VERSION
     set(${prefix}_ONNXRUNTIME_NUGET_VERSION
-        1.14.1
+        1.15.0
         CACHE STRING "Version of the ONNX Runtime NuGet package (TYPE == nuget)."
     )
 
     # <PREFIX>_ONNXRUNTIME_NUGET_HASH
    set(${prefix}_ONNXRUNTIME_NUGET_HASH
-        c8ae7623385b19cd5de968d0df5383e13b97d1b3a6771c9177eac15b56013a5a
+        C168D1C9C73E14041DF904E4B38F01A7F955AEF94AAFDEB4ED996F0656054062
         CACHE STRING "SHA256 hash of the ONNX Runtime NuGet package (TYPE == nuget)."
     )
 
diff --git a/DxDispatch/src/dxdispatch/Executor.cpp b/DxDispatch/src/dxdispatch/Executor.cpp
index 8e93fa30..255eb209 100644
--- a/DxDispatch/src/dxdispatch/Executor.cpp
+++ b/DxDispatch/src/dxdispatch/Executor.cpp
@@ -357,6 +357,7 @@ std::ostream& operator<<(std::ostream& os, const BufferDataView<T>& view)
 {
     uint32_t elementCount = view.desc.initialValues.size() / Device::GetSizeInBytes(view.desc.initialValuesDataType);
     auto values = reinterpret_cast<const T*>(view.byteValues.data());
+    printf("elementCount=%u\n", elementCount); // Debug aid: show how many elements will be printed.
     for (uint32_t elementIndex = 0; elementIndex < elementCount; elementIndex++)
     {
         os << values[elementIndex];
@@ -364,6 +365,7 @@ std::ostream& operator<<(std::ostream& os, const BufferDataView<T>& view)
         {
             os << ", ";
         }
+
     }
     return os;
 }
@@ -399,7 +401,12 @@ void Executor::operator()(const Model::PrintCommand& command)
         auto outputValues = m_device->Download(resource.Get());
         auto& resourceDesc = m_model.GetResource(command.resourceName);
         auto& bufferDesc = std::get<Model::BufferDesc>(resourceDesc.value);
-        LogInfo(fmt::format("Resource '{}': {}", command.resourceName, ToString(outputValues, bufferDesc)));
+        // Print only the tensors of interest ("output" and "stackedKeyValue").
+        if (command.resourceName == "output" || command.resourceName == "stackedKeyValue")
+        {
+            LogInfo(fmt::format("Resource '{}': {}", command.resourceName, ToString(outputValues, bufferDesc)));
+        }
+
     }
     catch (const std::exception& e)
     {
@@ -441,7 +452,7 @@ void Executor::operator()(const Model::WriteFileCommand& command)
         }
 
         file.write(reinterpret_cast<const char*>(fileData.data()), fileData.size());
-        LogInfo(fmt::format("Resource '{}' written to '{}'", command.resourceName, command.targetPath));
+        //LogInfo(fmt::format("Resource '{}' written to '{}'", command.resourceName, command.targetPath));
     }
     catch (const std::exception& e)
     {
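For context, the `print` commands handled above originate in the model JSON. A minimal sketch of a command list that would exercise the two hard-coded resource names — assuming the usual DxDispatch command schema, where a `print` command names a declared resource via a `resource` field:

```json
"commands": [
    { "type": "print", "resource": "output" },
    { "type": "print", "resource": "stackedKeyValue" }
]
```

With this change, `print` commands for any other resource name are silently skipped.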
"Input" : "Output"), binding.name)); + //LogInfo(fmt::format(" Resource = {}", binding.resourceType)); + //LogInfo(fmt::format(" Data Type = {}", GetOnnxTensorTypeString(binding.dataType))); std::string shapeString = "["; for (size_t i = 0; i < binding.shape.size(); i++) { diff --git a/DxDispatch/src/model/JsonParsers.cpp b/DxDispatch/src/model/JsonParsers.cpp index a62923be..9a326bdc 100644 --- a/DxDispatch/src/model/JsonParsers.cpp +++ b/DxDispatch/src/model/JsonParsers.cpp @@ -3,6 +3,8 @@ #include "StdSupport.h" #include "NpyReaderWriter.h" +#include + #ifndef WIN32 #define _stricmp strcasecmp #endif @@ -1049,6 +1051,51 @@ std::vector GenerateInitialValuesFromSequence(DML_TENSOR_DATA_TYPE da } } +std::vector GenerateInitialValuesFromRandom(DML_TENSOR_DATA_TYPE dataType, const rapidjson::Value& object) +{ + auto valueCount = ParseUInt32Field(object, "valueCount"); + auto seed = ParseUInt32Field(object, "seed"); + auto valueMin = ParseFloat32Field(object, "min"); + auto valueMax = ParseFloat32Field(object, "max"); + + // randomize data + std::mt19937 random_generator(seed); // static, create it once! + std::uniform_real_distribution uniform_distribution(valueMin, valueMax); + + auto AsBytes = [&](auto& parser, auto defaultValue)->std::vector + { + + std::vector allBytes; + allBytes.reserve(sizeof(defaultValue) * valueCount); + for (size_t i = 0; i < valueCount; i++) + { + const auto f32 = uniform_distribution(random_generator); + const auto value = static_cast(f32); + for (auto byte : gsl::as_bytes(gsl::make_span(&value, 1))) + { + allBytes.push_back(byte); + } + } + return allBytes; + }; + + switch (dataType) + { + case DML_TENSOR_DATA_TYPE_FLOAT16: return AsBytes(ParseFloat16Field, half_float::half(0)); + case DML_TENSOR_DATA_TYPE_FLOAT32: return AsBytes(ParseFloat32Field, 0.0f); + case DML_TENSOR_DATA_TYPE_FLOAT64: return AsBytes(ParseFloat64Field, 0.0); + case DML_TENSOR_DATA_TYPE_UINT8: return AsBytes(ParseUInt8Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_UINT16: return AsBytes(ParseUInt16Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_UINT32: return AsBytes(ParseUInt32Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_UINT64: return AsBytes(ParseUInt64Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_INT8: return AsBytes(ParseInt8Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_INT16: return AsBytes(ParseInt16Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_INT32: return AsBytes(ParseInt32Field, static_cast(0)); + case DML_TENSOR_DATA_TYPE_INT64: return AsBytes(ParseInt64Field, static_cast(0)); + default: throw std::invalid_argument(fmt::format("Invalid tensor data type.")); + } +} + std::filesystem::path ResolveInputFilePath(const std::filesystem::path& parentPath, std::string_view sourcePath) { auto filePathRelativeToParent = std::filesystem::absolute(parentPath / sourcePath); @@ -1162,6 +1209,11 @@ Model::BufferDesc ParseModelBufferDesc(const std::filesystem::path& parentPath, ensureInitialValuesDataType(); buffer.initialValues = GenerateInitialValuesFromSequence(buffer.initialValuesDataType, initialValuesField->value); } + else if (initialValuesField->value.HasMember("seed")) + { + ensureInitialValuesDataType(); + buffer.initialValues = GenerateInitialValuesFromRandom(buffer.initialValuesDataType, initialValuesField->value); + } // e.g. 
"initialValues": { "sourcePath": "inputFile.npy" } else if (initialValuesField->value.HasMember("sourcePath")) { diff --git a/DxDispatch/src/model/JsonParsersGenerated.cpp b/DxDispatch/src/model/JsonParsersGenerated.cpp index 1d0c78c7..e77bff5c 100644 --- a/DxDispatch/src/model/JsonParsersGenerated.cpp +++ b/DxDispatch/src/model/JsonParsersGenerated.cpp @@ -219,6 +219,7 @@ DML_OPERATOR_TYPE ParseDmlOperatorType(const rapidjson::Value& value) if (!strcmp(valueString, "DML_OPERATOR_RESAMPLE2") || !strcmp(valueString, "RESAMPLE2")) { return DML_OPERATOR_RESAMPLE2; } if (!strcmp(valueString, "DML_OPERATOR_RESAMPLE_GRAD1") || !strcmp(valueString, "RESAMPLE_GRAD1")) { return DML_OPERATOR_RESAMPLE_GRAD1; } if (!strcmp(valueString, "DML_OPERATOR_DIAGONAL_MATRIX1") || !strcmp(valueString, "DIAGONAL_MATRIX1")) { return DML_OPERATOR_DIAGONAL_MATRIX1; } + if (!strcmp(valueString, "DML_OPERATOR_MULTIHEAD_ATTENTION") || !strcmp(valueString, "MULTIHEAD_ATTENTION")) { return DML_OPERATOR_MULTIHEAD_ATTENTION; } throw std::invalid_argument(fmt::format("'{}' is not a recognized value for DML_OPERATOR_TYPE.", valueString)); } @@ -429,6 +430,10 @@ DML_FEATURE_LEVEL ParseDmlFeatureLevel(const rapidjson::Value& value) if (!strcmp(valueString, "DML_FEATURE_LEVEL_4_0") || !strcmp(valueString, "4_0")) { return DML_FEATURE_LEVEL_4_0; } if (!strcmp(valueString, "DML_FEATURE_LEVEL_4_1") || !strcmp(valueString, "4_1")) { return DML_FEATURE_LEVEL_4_1; } if (!strcmp(valueString, "DML_FEATURE_LEVEL_5_0") || !strcmp(valueString, "5_0")) { return DML_FEATURE_LEVEL_5_0; } + if (!strcmp(valueString, "DML_FEATURE_LEVEL_5_1") || !strcmp(valueString, "5_1")) { return DML_FEATURE_LEVEL_5_1; } + if (!strcmp(valueString, "DML_FEATURE_LEVEL_5_2") || !strcmp(valueString, "5_2")) { return DML_FEATURE_LEVEL_5_2; } + if (!strcmp(valueString, "DML_FEATURE_LEVEL_6_0") || !strcmp(valueString, "6_0")) { return DML_FEATURE_LEVEL_6_0; } + if (!strcmp(valueString, "DML_FEATURE_LEVEL_6_1") || !strcmp(valueString, "6_1")) { return DML_FEATURE_LEVEL_6_1; } throw std::invalid_argument(fmt::format("'{}' is not a recognized value for DML_FEATURE_LEVEL.", valueString)); } @@ -535,6 +540,28 @@ DML_RANDOM_GENERATOR_TYPE ParseDmlRandomGeneratorTypeField(const rapidjson::Valu }); } +DML_MULTIHEAD_ATTENTION_MASK_TYPE ParseDmlMultiheadAttentionMaskType(const rapidjson::Value& value) +{ + if (value.GetType() != rapidjson::Type::kStringType) + { + throw std::invalid_argument("DML_MULTIHEAD_ATTENTION_MASK_TYPE must be a string."); + } + auto valueString = value.GetString(); + if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE") || !strcmp(valueString, "NONE")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE; } + if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH") || !strcmp(valueString, "KEY_SEQUENCE_LENGTH")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH; } + if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START") || !strcmp(valueString, "KEY_SEQUENCE_END_START")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START; } + if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END") || !strcmp(valueString, "KEY_QUERY_SEQUENCE_LENGTH_START_END")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END; } + if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN") || !strcmp(valueString, "BOOLEAN")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN; } + throw 
@@ -535,6 +540,28 @@ DML_RANDOM_GENERATOR_TYPE ParseDmlRandomGeneratorTypeField(const rapidjson::Value& object, std::string_view fieldName, bool required, DML_RANDOM_GENERATOR_TYPE defaultValue)
     });
 }
 
+DML_MULTIHEAD_ATTENTION_MASK_TYPE ParseDmlMultiheadAttentionMaskType(const rapidjson::Value& value)
+{
+    if (value.GetType() != rapidjson::Type::kStringType)
+    {
+        throw std::invalid_argument("DML_MULTIHEAD_ATTENTION_MASK_TYPE must be a string.");
+    }
+    auto valueString = value.GetString();
+    if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE") || !strcmp(valueString, "NONE")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE; }
+    if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH") || !strcmp(valueString, "KEY_SEQUENCE_LENGTH")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH; }
+    if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START") || !strcmp(valueString, "KEY_SEQUENCE_END_START")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START; }
+    if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END") || !strcmp(valueString, "KEY_QUERY_SEQUENCE_LENGTH_START_END")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END; }
+    if (!strcmp(valueString, "DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN") || !strcmp(valueString, "BOOLEAN")) { return DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN; }
+    throw std::invalid_argument(fmt::format("'{}' is not a recognized value for DML_MULTIHEAD_ATTENTION_MASK_TYPE.", valueString));
+}
+
+DML_MULTIHEAD_ATTENTION_MASK_TYPE ParseDmlMultiheadAttentionMaskTypeField(const rapidjson::Value& object, std::string_view fieldName, bool required, DML_MULTIHEAD_ATTENTION_MASK_TYPE defaultValue)
+{
+    return ParseFieldHelper(object, fieldName, required, defaultValue, [](auto& value){
+        return ParseDmlMultiheadAttentionMaskType(value);
+    });
+}
+
 // ====================================================================================================
 // DIRECTML FLAGS
 // ====================================================================================================
@@ -3981,6 +4008,54 @@ Model::DmlDispatchableDesc::BindPoints GetBindPoints(const DML_DIAGONAL_MATRIX1_OPERATOR_DESC& desc)
     return bindPoints;
 }
 
+DML_OPERATOR_DESC* ParseDmlMultiheadAttentionOperatorDesc(const rapidjson::Value& value, bool fused, BucketAllocator& allocator)
+{
+    if (!value.IsObject()) { throw std::invalid_argument("Expected a valid JSON object."); }
+    auto desc = allocator.Allocate<DML_MULTIHEAD_ATTENTION_OPERATOR_DESC>();
+    desc->QueryTensor = fused ? nullptr : ParseDmlTensorDescField(value, "QueryTensor", allocator, false);
+    desc->KeyTensor = fused ? nullptr : ParseDmlTensorDescField(value, "KeyTensor", allocator, false);
+    desc->ValueTensor = fused ? nullptr : ParseDmlTensorDescField(value, "ValueTensor", allocator, false);
+    desc->StackedQueryKeyTensor = fused ? nullptr : ParseDmlTensorDescField(value, "StackedQueryKeyTensor", allocator, false);
+    desc->StackedKeyValueTensor = fused ? nullptr : ParseDmlTensorDescField(value, "StackedKeyValueTensor", allocator, false);
+    desc->StackedQueryKeyValueTensor = fused ? nullptr : ParseDmlTensorDescField(value, "StackedQueryKeyValueTensor", allocator, false);
+    desc->BiasTensor = fused ? nullptr : ParseDmlTensorDescField(value, "BiasTensor", allocator, false);
+    desc->MaskTensor = fused ? nullptr : ParseDmlTensorDescField(value, "MaskTensor", allocator, false);
+    desc->RelativePositionBiasTensor = fused ? nullptr : ParseDmlTensorDescField(value, "RelativePositionBiasTensor", allocator, false);
+    desc->PastKeyTensor = fused ? nullptr : ParseDmlTensorDescField(value, "PastKeyTensor", allocator, false);
+    desc->PastValueTensor = fused ? nullptr : ParseDmlTensorDescField(value, "PastValueTensor", allocator, false);
+    desc->OutputTensor = fused ? nullptr : ParseDmlTensorDescField(value, "OutputTensor", allocator, true);
+    desc->OutputPresentKeyTensor = fused ? nullptr : ParseDmlTensorDescField(value, "OutputPresentKeyTensor", allocator, false);
+    desc->OutputPresentValueTensor = fused ? nullptr : ParseDmlTensorDescField(value, "OutputPresentValueTensor", allocator, false);
+    desc->Scale = ParseFloat32Field(value, "Scale", true);
+    desc->MaskFilterValue = ParseFloat32Field(value, "MaskFilterValue", true);
+    desc->HeadCount = ParseUInt32Field(value, "HeadCount", true);
+    desc->MaskType = ParseDmlMultiheadAttentionMaskTypeField(value, "MaskType", true, {});
+    auto opDesc = allocator.Allocate<DML_OPERATOR_DESC>();
+    opDesc->Type = DML_OPERATOR_MULTIHEAD_ATTENTION;
+    opDesc->Desc = desc;
+    return opDesc;
+}
+
+Model::DmlDispatchableDesc::BindPoints GetBindPoints(const DML_MULTIHEAD_ATTENTION_OPERATOR_DESC& desc)
+{
+    Model::DmlDispatchableDesc::BindPoints bindPoints = {};
+    bindPoints.inputs.push_back({"QueryTensor", 1, false});
+    bindPoints.inputs.push_back({"KeyTensor", 1, false});
+    bindPoints.inputs.push_back({"ValueTensor", 1, false});
+    bindPoints.inputs.push_back({"StackedQueryKeyTensor", 1, false});
+    bindPoints.inputs.push_back({"StackedKeyValueTensor", 1, false});
+    bindPoints.inputs.push_back({"StackedQueryKeyValueTensor", 1, false});
+    bindPoints.inputs.push_back({"BiasTensor", 1, false});
+    bindPoints.inputs.push_back({"MaskTensor", 1, false});
+    bindPoints.inputs.push_back({"RelativePositionBiasTensor", 1, false});
+    bindPoints.inputs.push_back({"PastKeyTensor", 1, false});
+    bindPoints.inputs.push_back({"PastValueTensor", 1, false});
+    bindPoints.outputs.push_back({"OutputTensor", 1, true});
+    bindPoints.outputs.push_back({"OutputPresentKeyTensor", 1, false});
+    bindPoints.outputs.push_back({"OutputPresentValueTensor", 1, false});
+    return bindPoints;
+}
+
 DML_OPERATOR_DESC* ParseDmlActivationEluOperatorDesc(const rapidjson::Value& value, bool fused, BucketAllocator& allocator)
 {
     if (!value.IsObject()) { throw std::invalid_argument("Expected a valid JSON object."); }
@@ -4651,6 +4726,7 @@ DML_OPERATOR_DESC* ParseDmlOperatorDesc(const rapidjson::Value& value, bool fused, BucketAllocator& allocator)
     if (!strcmp(type, "DML_OPERATOR_RESAMPLE2") || !strcmp(type, "RESAMPLE2")) return ParseDmlResample2OperatorDesc(descValue, fused, allocator);
     if (!strcmp(type, "DML_OPERATOR_RESAMPLE_GRAD1") || !strcmp(type, "RESAMPLE_GRAD1")) return ParseDmlResampleGrad1OperatorDesc(descValue, fused, allocator);
     if (!strcmp(type, "DML_OPERATOR_DIAGONAL_MATRIX1") || !strcmp(type, "DIAGONAL_MATRIX1")) return ParseDmlDiagonalMatrix1OperatorDesc(descValue, fused, allocator);
+    if (!strcmp(type, "DML_OPERATOR_MULTIHEAD_ATTENTION") || !strcmp(type, "MULTIHEAD_ATTENTION")) return ParseDmlMultiheadAttentionOperatorDesc(descValue, fused, allocator);
     if (!strcmp(type, "DML_OPERATOR_ACTIVATION_ELU") || !strcmp(type, "ACTIVATION_ELU")) return ParseDmlActivationEluOperatorDesc(descValue, fused, allocator);
     if (!strcmp(type, "DML_OPERATOR_ACTIVATION_CELU") || !strcmp(type, "ACTIVATION_CELU")) return ParseDmlActivationCeluOperatorDesc(descValue, fused, allocator);
     if (!strcmp(type, "DML_OPERATOR_ACTIVATION_HARDMAX") || !strcmp(type, "ACTIVATION_HARDMAX")) return ParseDmlActivationHardmaxOperatorDesc(descValue, fused, allocator);
@@ -4821,6 +4897,7 @@ Model::DmlDispatchableDesc::BindPoints GetBindPoints(const DML_OPERATOR_DESC& desc)
     case DML_OPERATOR_RESAMPLE2: return GetBindPoints(*reinterpret_cast<const DML_RESAMPLE2_OPERATOR_DESC*>(desc.Desc));
     case DML_OPERATOR_RESAMPLE_GRAD1: return GetBindPoints(*reinterpret_cast<const DML_RESAMPLE_GRAD1_OPERATOR_DESC*>(desc.Desc));
     case DML_OPERATOR_DIAGONAL_MATRIX1: return GetBindPoints(*reinterpret_cast<const DML_DIAGONAL_MATRIX1_OPERATOR_DESC*>(desc.Desc));
+    case DML_OPERATOR_MULTIHEAD_ATTENTION: return GetBindPoints(*reinterpret_cast<const DML_MULTIHEAD_ATTENTION_OPERATOR_DESC*>(desc.Desc));
    case DML_OPERATOR_ACTIVATION_ELU: return GetBindPoints(*reinterpret_cast<const DML_ACTIVATION_ELU_OPERATOR_DESC*>(desc.Desc));
    case DML_OPERATOR_ACTIVATION_CELU: return GetBindPoints(*reinterpret_cast<const DML_ACTIVATION_CELU_OPERATOR_DESC*>(desc.Desc));
    case DML_OPERATOR_ACTIVATION_HARDMAX: return GetBindPoints(*reinterpret_cast<const DML_ACTIVATION_HARDMAX_OPERATOR_DESC*>(desc.Desc));
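With parsing and bind points in place, a model can declare the new operator directly in JSON. A minimal sketch assuming the usual DmlDispatchable schema (a `type` plus a `desc` object, with tensors written as `DataType`/`Sizes` pairs); the field names come from the parser above, while the sizes, scale, and head count are purely illustrative:

```json
"dispatchables": {
    "mha": {
        "type": "DML_OPERATOR_MULTIHEAD_ATTENTION",
        "desc": {
            "QueryTensor": { "DataType": "FLOAT32", "Sizes": [1, 8, 256] },
            "KeyTensor": { "DataType": "FLOAT32", "Sizes": [1, 8, 256] },
            "ValueTensor": { "DataType": "FLOAT32", "Sizes": [1, 8, 256] },
            "OutputTensor": { "DataType": "FLOAT32", "Sizes": [1, 8, 256] },
            "Scale": 0.125,
            "MaskFilterValue": -10000.0,
            "HeadCount": 4,
            "MaskType": "NONE"
        }
    }
}
```

Note that only `OutputTensor`, `Scale`, `MaskFilterValue`, `HeadCount`, and `MaskType` are parsed as required; the mask, bias, stacked, and past key/value tensors can simply be omitted. `MaskType` accepts either the short form above or the full `DML_MULTIHEAD_ATTENTION_MASK_TYPE_*` spelling.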
diff --git a/DxDispatch/tools/GenerateParsingHelpers.ps1 b/DxDispatch/tools/GenerateParsingHelpers.ps1
index 6b334a2b..d03c4553 100644
--- a/DxDispatch/tools/GenerateParsingHelpers.ps1
+++ b/DxDispatch/tools/GenerateParsingHelpers.ps1
@@ -1,7 +1,7 @@
 param
 (
     [string]$SchemaFilePath = "$PSScriptRoot\DmlSchema.json",
-    [string]$MaxFeatureLevel = "5.1"
+    [string]$MaxFeatureLevel = "6.1"
 )
 
 function ConvertSnakeToCamelCase($SnakeCaseName)
@@ -154,12 +154,11 @@ function WriteOperatorFunction($Operator)
     {
         $Cpp += "    desc->$($Field.Name) = AsPointer(ParseUInt32ArrayField(value, `"$($Field.Name)`", allocator, $Required));"
     }
-    elseif ($Field.Type -eq "operatorDesc")
+    elseif ($Field.Type -eq "fusedActivationOperatorDesc")
     {
-        $Fused = if ($Field.Name -eq 'FusedActivation') { 'true' } else { 'false' }
-        $Cpp += "    desc->$($Field.Name) = ParseDmlOperatorDescField(value, `"$($Field.Name)`", $Fused, allocator, $Required);"
+        $Cpp += "    desc->$($Field.Name) = ParseDmlOperatorDescField(value, `"$($Field.Name)`", true, allocator, $Required);"
     }
-    elseif ($Field.Type -eq "operatorDescArray")
+    elseif ($Field.Type -eq "fusedActivationOperatorDescArray")
     {
         $Cpp += "    desc->$($Field.Name) = AsPointer(ParseDmlOperatorDescArrayField(value, `"$($Field.Name)`", true, allocator, $Required));"
     }
@@ -237,7 +236,10 @@ $Cpp += "// $('='*100)"
 $Cpp += ""
 foreach ($Enum in $Schema.ApiEnums)
 {
-    $Cpp += WriteEnumParser $Enum
+    if (!$Enum.private)
+    {
+        $Cpp += WriteEnumParser $Enum
+    }
 }
 
 $Cpp += "// $('='*100)"
diff --git a/docs/CompVis_Stable_Diffusion_Instructions.md b/docs/CompVis_Stable_Diffusion_Instructions.md
deleted file mode 100644
index 4c21e698..00000000
--- a/docs/CompVis_Stable_Diffusion_Instructions.md
+++ /dev/null
@@ -1,102 +0,0 @@
-# Running CompVis Stable Diffusion on a Single GPU with ONNX Runtime and DirectML
-
-These instructions download and set up the CompVis Stable Diffusion v1.4 model through the Hugging Face diffusers and transformers libraries. They pull in the Python packages that allow the model to run on most discrete consumer graphics GPUs with ONNX Runtime atop the DirectML execution provider. These instructions build on the prior work of [Neil McAlister](https://www.travelneil.com/stable-diffusion-windows-amd.html), using the more up-to-date script version from the Hugging Face Diffusers repo and its dependency packages, as well as additional conversion steps for better execution performance.
-
-## Overview
-
-The figure below provides an overview of the components involved in the model conversion process:
-
-![Stable Diffusion Conversion](sd_conversion.png)
-
-- [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) : the original stable diffusion model source
-- [Hugging Face](https://huggingface.co/) : provides APIs, packages, tools, and pretrained models for a wide variety of scenarios, including stable diffusion
-  - (Python package) [diffusers](https://github.com/huggingface/diffusers) : offers APIs/scripts for diffusion models
-  - (Python package) [transformers](https://github.com/huggingface/transformers) : offers APIs/scripts for transformer models
-  - (Python script) [convert_stable_diffusion_checkpoint_to_onnx.py](https://github.com/HuggingFace/diffusers/blob/main/scripts/convert_stable_diffusion_checkpoint_to_onnx.py) : converts the PyTorch implementation of stable diffusion to ONNX models
-- (Python packages) PyTorch ([torch](https://pypi.org/project/torch/), [torchvision](https://pypi.org/project/torchvision/), and [torchaudio](https://pypi.org/project/torchaudio/)) : the ML framework used for stable diffusion models
-- (Python package) [onnx](https://pypi.org/project/onnx/) : for creating ONNX representations of ML models
-- (Python package) [onnxruntime-directml](https://pypi.org/project/onnxruntime-directml/) : for running ONNX models with DirectML support
-
-## Installing Dependency Packages
-
-We need a few Python packages, namely the Hugging Face script libraries for transformers and diffusers along with ONNX Runtime for DirectML.
-
-```
-pip install diffusers transformers onnxruntime-directml onnx accelerate
-```
-
-You will also need PyTorch installed to run the Hugging Face model conversion script (`convert_stable_diffusion_checkpoint_to_onnx.py`). By default, the conversion script outputs FP32 ONNX models. FP16 models consume less memory and may be faster depending on your GPU. However, **as of this writing, the Hugging Face stable diffusion to ONNX conversion script only supports FP16 if you have PyTorch with CUDA**. This will require up to 3 GB of additional disk space.
-
-**If you have a CUDA-capable graphics card**:
-```
-pip install torch==1.13.1+cu116 torchaudio==0.13.1+cu116 torchvision==0.14.1 --index-url https://download.pytorch.org/whl/cu116
-```
-
-**If you do not have a CUDA-capable graphics card**:
-```
-pip install torch==1.13.1 torchaudio==0.13.1 torchvision==0.14.1
-```
-
-**⚠️ WARNING ⚠️** : Conversion to ONNX with PyTorch 2.0 does not currently work. If you encounter the following error, please make sure that you are using PyTorch 1.13.1 or older. See https://github.com/pytorch/pytorch/issues/96944.
-
-    aten::scaled_dot_product_attention' to ONNX opset version 14 is not supported
-
-### Hardware Requirements
-Since the entire model must fit within GPU memory while executing, the GPU should have at least 8GB of VRAM available to run this model. Here are a few examples:
-- NVIDIA GeForce RTX 2070 or later
-- AMD Radeon RX 6500 XT (8GB) or later
-- Intel Arc A750 Graphics or later
-
-## Downloading the Model
-We first need to download the model from [Hugging Face](https://huggingface.co/), for which you need an account. So if you haven't created one, now is the time. Once you've set up a Hugging Face account, generate an [access token](https://huggingface.co/docs/hub/security-tokens) (just follow the instructions on the website).
-
-Once you have the account and an access token, authenticate yourself in a terminal or PowerShell console by running the following command.
-
-```
-huggingface-cli.exe login
-```
-
-It'll ask for your access token, which you can find in your account profile under `Settings -> Access Tokens`. Copy it from there and carefully paste it at the prompt. Note that nothing appears on the prompt when you paste; the token is there, so just hit Enter. Authenticating lets the conversion step below download the model from Hugging Face.
-
-## Converting to ONNX
-
-The model is implemented in PyTorch, so it converts naturally to ONNX. Since we'll be using DirectML through ONNX Runtime, this step is needed. The script `convert_stable_diffusion_checkpoint_to_onnx.py` that you will use here is just a local copy of the same file from the [Hugging Face diffusers GitHub repo](https://github.com/HuggingFace/diffusers/blob/main/scripts/convert_stable_diffusion_checkpoint_to_onnx.py). In case you don't want to clone that entire repo, just copy the file over.
-
-```
-python convert_stable_diffusion_checkpoint_to_onnx.py --model_path="CompVis/stable-diffusion-v1-4" --output_path="./stable_diffusion_onnx" --fp16
-```
-
-This runs the conversion and puts the resulting ONNX files under the `stable_diffusion_onnx` folder. For better performance, we recommend converting the model to the half-precision floating-point data type using the `--fp16` option (as mentioned earlier, you must have PyTorch with CUDA support to use `--fp16`).
-
-## Running the ONNX Model
-
-You'll need a script that looks like the following. On an NVIDIA GeForce RTX 2070, a single image currently takes about 20 seconds to generate from a prompt; on a CPU it takes between 5 and 10 minutes.
-
-```python
-# (test/run.py)
-from diffusers import OnnxStableDiffusionPipeline
-pipe = OnnxStableDiffusionPipeline.from_pretrained("./stable_diffusion_onnx", provider="DmlExecutionProvider")
-prompt = "a photo of an astronaut riding a horse on mars."
-image = pipe(prompt).images[0]
-image.save("./result.png")
-```
-
-### A Debugging Note
-When running this script inside VSCode, the relative path specified here is relative to the base location of your project folder, not the location of your script file. To fix that up, configure the `cwd` (i.e., "current working directory") option in your launch.json file as follows:
-
-```json
-  // .vscode/launch.json
-  "configurations": [
-    {
-      "name": "Python: Current File",
-      "type": "python",
-      "request": "launch",
-      "program": "${file}",
-      "cwd": "${workspaceFolder}/test/",
-      "console": "integratedTerminal",
-      "justMyCode": true
-    }
-  ]
-```
-
-If you have an NVIDIA graphics card and want to try running the ONNX model on CUDA, just replace the `onnxruntime-directml` package with the `onnxruntime-gpu` package. Do not keep them both. Then replace the `"DmlExecutionProvider"` name in the running script `run.py` with `"CUDAExecutionProvider"`. You may need to install the NVIDIA CUDA libraries separately.
diff --git a/docs/sd_conversion.png b/docs/sd_conversion.png
deleted file mode 100644
index 1cbbfcda..00000000
Binary files a/docs/sd_conversion.png and /dev/null differ