diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 3153511330451..eecbc204f8b76 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -5,21 +5,37 @@
 
 * com.microsoft
   * <a href="#com.microsoft.AttnLSTM">com.microsoft.AttnLSTM</a>
+  * <a href="#com.microsoft.CDist">com.microsoft.CDist</a>
   * <a href="#com.microsoft.ConvTransposeWithDynamicPads">com.microsoft.ConvTransposeWithDynamicPads</a>
   * <a href="#com.microsoft.CropAndResize">com.microsoft.CropAndResize</a>
+  * <a href="#com.microsoft.DequantizeLinear">com.microsoft.DequantizeLinear</a>
   * <a href="#com.microsoft.ExpandDims">com.microsoft.ExpandDims</a>
   * <a href="#com.microsoft.FusedConv">com.microsoft.FusedConv</a>
   * <a href="#com.microsoft.FusedGemm">com.microsoft.FusedGemm</a>
   * <a href="#com.microsoft.GatherND">com.microsoft.GatherND</a>
+  * <a href="#com.microsoft.MatMulInteger16">com.microsoft.MatMulInteger16</a>
   * <a href="#com.microsoft.MaxpoolWithMask">com.microsoft.MaxpoolWithMask</a>
   * <a href="#com.microsoft.MurmurHash3">com.microsoft.MurmurHash3</a>
   * <a href="#com.microsoft.Pad">com.microsoft.Pad</a>
+  * <a href="#com.microsoft.QuantizeLinear">com.microsoft.QuantizeLinear</a>
   * <a href="#com.microsoft.Range">com.microsoft.Range</a>
   * <a href="#com.microsoft.ReduceSumInteger">com.microsoft.ReduceSumInteger</a>
   * <a href="#com.microsoft.SampleOp">com.microsoft.SampleOp</a>
   * <a href="#com.microsoft.Tokenizer">com.microsoft.Tokenizer</a>
   * <a href="#com.microsoft.Unique">com.microsoft.Unique</a>
   * <a href="#com.microsoft.WordConvEmbedding">com.microsoft.WordConvEmbedding</a>
+  * <sub>experimental</sub> <a href="#com.microsoft.Attention">com.microsoft.Attention</a>
+  * <sub>experimental</sub> <a href="#com.microsoft.EmbedLayerNormalization">com.microsoft.EmbedLayerNormalization</a>
+  * <sub>experimental</sub> <a href="#com.microsoft.Gelu">com.microsoft.Gelu</a>
+  * <sub>experimental</sub> <a href="#com.microsoft.SkipLayerNormalization">com.microsoft.SkipLayerNormalization</a>
+* com.microsoft.nchwc
+  * <a href="#com.microsoft.nchwc.AveragePool">com.microsoft.nchwc.AveragePool</a>
+  * <a href="#com.microsoft.nchwc.Conv">com.microsoft.nchwc.Conv</a>
+  * <a href="#com.microsoft.nchwc.GlobalAveragePool">com.microsoft.nchwc.GlobalAveragePool</a>
+  * <a href="#com.microsoft.nchwc.GlobalMaxPool">com.microsoft.nchwc.GlobalMaxPool</a>
+  * <a href="#com.microsoft.nchwc.MaxPool">com.microsoft.nchwc.MaxPool</a>
+  * <a href="#com.microsoft.nchwc.ReorderInput">com.microsoft.nchwc.ReorderInput</a>
+  * <a href="#com.microsoft.nchwc.ReorderOutput">com.microsoft.nchwc.ReorderOutput</a>
 
 ## com.microsoft
 ### <a name="com.microsoft.AttnLSTM"></a><a name="com.microsoft.attnlstm">**com.microsoft.AttnLSTM**</a>
@@ -231,6 +247,43 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <a name="com.microsoft.CDist"></a><a name="com.microsoft.cdist">**com.microsoft.CDist**</a>
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>metric</tt> : string</dt>
+<dd>The distance metric to use. If a string, the distance function can be "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean", "hamming", "jaccard", "jensenshannon", "kulsinski", "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "wminkowski", "yule".</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>A</tt> : T</dt>
+<dd>2D matrix with shape (M,N)</dd>
+<dt><tt>B</tt> : T</dt>
+<dd>2D matrix with shape (K,N)</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>C</tt> : T</dt>
+<dd>A 2D Matrix that represents the distance between each pair of the two collections of inputs.</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float), tensor(double)</dt>
+<dd>Constrains input to only numeric types.</dd>
+</dl>
+
+
 ### <a name="com.microsoft.ConvTransposeWithDynamicPads"></a><a name="com.microsoft.convtransposewithdynamicpads">**com.microsoft.ConvTransposeWithDynamicPads**</a>
 
 #### Version
@@ -334,6 +387,51 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <a name="com.microsoft.DequantizeLinear"></a><a name="com.microsoft.dequantizelinear">**com.microsoft.DequantizeLinear**</a>
+
+  The linear dequantization operator. It consumes quantized data, a scale, and a zero point, and computes the full-precision data.
+  The dequantization formula is y = (x - x_zero_point) * x_scale.
+  Scale and zero point must have the same shape. They must be either scalars (per tensor) or 1-D tensors (per 'axis').
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>axis</tt> : int</dt>
+<dd>The axis along which same quantization parameters are applied. It's optional. If it's not specified, it means per-tensor quantization and input 'x_scale' and 'x_zero_point' must be scalars. If it's specified, it means per 'axis' quantization and input 'x_scale' and 'x_zero_point' must be 1-D tensors.</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>x</tt> : T2</dt>
+<dd>N-D quantized Input tensor to be de-quantized.</dd>
+<dt><tt>x_scale</tt> : T1</dt>
+<dd>Scale for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-axis quantization. If it's a 1-D tensor, its number of elements should be equal to the dimension value of 'axis' dimension of input 'x'.</dd>
+<dt><tt>x_zero_point</tt> : T2</dt>
+<dd>Zero point for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-axis quantization. If it's a 1-D tensor, its number of elements should be equal to the dimension value of 'axis' dimension of input 'x'.</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>y</tt> : T1</dt>
+<dd>N-D full precision output tensor. It has the same shape as input 'x'.</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T1</tt> : tensor(float)</dt>
+<dd>Constrain 'y', 'x_scale' to float tensors.</dd>
+<dt><tt>T2</tt> : tensor(int8), tensor(uint8)</dt>
+<dd>Constrain 'x_zero_point' and 'x' to 8-bit integer tensors.</dd>
+</dl>
+
+
 ### <a name="com.microsoft.ExpandDims"></a><a name="com.microsoft.expanddims">**com.microsoft.ExpandDims**</a>
 
   ExpandDims echo operator.
@@ -380,7 +478,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dl>
 <dt><tt>activation</tt> : string</dt>
 <dd></dd>
-<dt><tt>alpha</tt> : float</dt>
+<dt><tt>activation_params</tt> : list of floats</dt>
 <dd></dd>
 <dt><tt>auto_pad</tt> : string</dt>
 <dd></dd>
@@ -525,6 +623,43 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <a name="com.microsoft.MatMulInteger16"></a><a name="com.microsoft.matmulinteger16">**com.microsoft.MatMulInteger16**</a>
+
+  Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+   The multiplication MUST never overflow. The accumulation may overflow if and only if it is performed in 32 bits.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Inputs
+
+<dl>
+<dt><tt>A</tt> : T1</dt>
+<dd>N-dimensional matrix A</dd>
+<dt><tt>B</tt> : T2</dt>
+<dd>N-dimensional matrix B</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T3</dt>
+<dd>Matrix multiply results from A * B</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T1</tt> : tensor(int16), tensor(uint16)</dt>
+<dd>Constrain input A data types as 16-bit integer tensor</dd>
+<dt><tt>T2</tt> : tensor(int16), tensor(uint16)</dt>
+<dd>Constrain input B data types as 16-bit integer tensor</dd>
+<dt><tt>T3</tt> : tensor(int32), tensor(uint32)</dt>
+<dd>Constrain output Y data types as 32-bit integer tensor. T3 must be tensor(uint32) when both T1 and T2 are tensor(uint16), or must be tensor(int32) when either T1 or T2 is tensor(int16).</dd>
+</dl>
+
+
 ### <a name="com.microsoft.MaxpoolWithMask"></a><a name="com.microsoft.maxpoolwithmask">**com.microsoft.MaxpoolWithMask**</a>
 
   For internal use.
@@ -670,6 +805,51 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <a name="com.microsoft.QuantizeLinear"></a><a name="com.microsoft.quantizelinear">**com.microsoft.QuantizeLinear**</a>
+
+  The linear quantization operator. It consumes full-precision data, a scale, and a zero point, and computes the quantized data.
+  The quantization formula is y = (x / y_scale) + y_zero_point. The quotient (x / y_scale) is rounded to the nearest integer value (in floating-point format),
+   rounding halfway cases away from zero. Scale and zero point must have the same shape. They must be either scalars (per tensor) or 1-D tensors (per 'axis').
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>axis</tt> : int</dt>
+<dd>The axis along which same quantization parameters are applied. It's optional. If it's not specified, it means per-tensor quantization and input 'y_scale' and 'y_zero_point' must be scalars. If it's specified, it means per 'axis' quantization and input 'y_scale' and 'y_zero_point' must be 1-D tensors.</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>x</tt> : T1</dt>
+<dd>N-D full precision Input tensor to be quantized.</dd>
+<dt><tt>y_scale</tt> : T1</dt>
+<dd>Scale for doing quantization to get 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-axis quantization. If it's a 1-D tensor, its number of elements should be equal to the dimension value of 'axis' dimension of input 'x'.</dd>
+<dt><tt>y_zero_point</tt> : T2</dt>
+<dd>Zero point for doing quantization to get 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-axis quantization. If it's a 1-D tensor, its number of elements should be equal to the dimension value of 'axis' dimension of input 'x'.</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>y</tt> : T2</dt>
+<dd>N-D quantized output tensor. It has the same shape as input 'x'.</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T1</tt> : tensor(float)</dt>
+<dd>Constrain 'x', 'y_scale' to float tensors.</dd>
+<dt><tt>T2</tt> : tensor(int8), tensor(uint8)</dt>
+<dd>Constrain 'y_zero_point' and 'y' to 8-bit integer tensors.</dd>
+</dl>
+
+
 ### <a name="com.microsoft.Range"></a><a name="com.microsoft.range">**com.microsoft.Range**</a>
 
   Creates a sequence of numbers that begins at `start` and extends by increments of `delta`
@@ -855,11 +1035,11 @@ This version of the operator has been available since version 1 of the 'com.micr
 
 ### <a name="com.microsoft.Unique"></a><a name="com.microsoft.unique">**com.microsoft.Unique**</a>
 
-  Finds all the unique values (deduped list) present in the given input tensor. 
-                This operator returns 3 outputs. 
-                The first output tensor 'uniques' contains all of the unique elements of the input, 
+  Finds all the unique values (deduped list) present in the given input tensor.
+                This operator returns 3 outputs.
+                The first output tensor 'uniques' contains all of the unique elements of the input,
                 sorted in the same order that they occur in the input.
-                The second output tensor 'idx' is the same size as the input and it contains the index 
+                The second output tensor 'idx' is the same size as the input and it contains the index
                 of each value of the input in 'uniques'.
                 The third output tensor 'counts' contains the count of each element of 'uniques' in the input.
                 Example:
@@ -948,3 +1128,441 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <sub>experimental</sub> <a name="com.microsoft.Attention"></a><a name="com.microsoft.attention">**com.microsoft.Attention**</a>
+
+  Multi-Head Self Attention
+
+#### Version
+
+No versioning maintained for experimental ops.
+#### Attributes
+
+<dl>
+<dt><tt>num_heads</tt> : int (required)</dt>
+<dd>Number of attention heads</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>input</tt> : T</dt>
+<dd>3D input tensor with shape (batch_size, sequence_length, hidden_size), hidden_size = num_heads * head_size</dd>
+<dt><tt>weight</tt> : T</dt>
+<dd>2D input tensor with shape (hidden_size, 3 * hidden_size)</dd>
+<dt><tt>bias</tt> : T</dt>
+<dd>1D input tensor with shape (3 * hidden_size)</dd>
+<dt><tt>mask_index</tt> : M</dt>
+<dd>Attention mask index with shape (batch_size)</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>output</tt> : T</dt>
+<dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dd>Constrain input and output types to float tensors.</dd>
+<dt><tt>M</tt> : tensor(int32)</dt>
+<dd>Constrain mask index to integer types</dd>
+</dl>
+
+
+### <sub>experimental</sub> <a name="com.microsoft.EmbedLayerNormalization"></a><a name="com.microsoft.embedlayernormalization">**com.microsoft.EmbedLayerNormalization**</a>
+
+  Embedding Layer Normalization
+
+#### Version
+
+No versioning maintained for experimental ops.
+#### Inputs
+
+<dl>
+<dt><tt>input_ids</tt> : T1</dt>
+<dd>2D word IDs with shape (batch_size, sequence_length)</dd>
+<dt><tt>segment_ids</tt> : T1</dt>
+<dd>2D segment IDs with shape (batch_size, sequence_length)</dd>
+<dt><tt>mask</tt> : T1</dt>
+<dd>2D attention mask with shape (batch_size, sequence_length)</dd>
+<dt><tt>word_embedding</tt> : T</dt>
+<dd>2D with shape (, hidden_size)</dd>
+<dt><tt>position_embedding</tt> : T</dt>
+<dd>2D with shape (, hidden_size)</dd>
+<dt><tt>segment_embedding</tt> : T</dt>
+<dd>2D with shape (, hidden_size)</dd>
+<dt><tt>gamma</tt> : T</dt>
+<dd>1D gamma tensor for layer normalization with shape (hidden_size)</dd>
+<dt><tt>beta</tt> : T</dt>
+<dd>1D beta tensor for layer normalization with shape (hidden_size)</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>output</tt> : T</dt>
+<dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
+<dt><tt>mask_index</tt> : T1</dt>
+<dd>1D mask_index tensor with shape (batch_size)</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T1</tt> : tensor(int32)</dt>
+<dd>Constrain input and output integer tensor types</dd>
+<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dd>Constrain input and output float tensor types.</dd>
+</dl>
+
+
+### <sub>experimental</sub> <a name="com.microsoft.Gelu"></a><a name="com.microsoft.gelu">**com.microsoft.Gelu**</a>
+
+  Gelu
+
+#### Version
+
+No versioning maintained for experimental ops.
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd>The input data as Tensor.</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd>The output.</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float16), tensor(float), tensor(double)</dt>
+<dd>Constrain input and output types to float tensors.</dd>
+</dl>
+
+
+### <sub>experimental</sub> <a name="com.microsoft.SkipLayerNormalization"></a><a name="com.microsoft.skiplayernormalization">**com.microsoft.SkipLayerNormalization**</a>
+
+  Skip and Layer Normalization Fusion
+
+#### Version
+
+No versioning maintained for experimental ops.
+#### Inputs
+
+<dl>
+<dt><tt>input</tt> : T</dt>
+<dd>3D input tensor with shape (batch_size, sequence_length, hidden_size)</dd>
+<dt><tt>skip</tt> : T</dt>
+<dd>3D skip tensor with shape (batch_size, sequence_length, hidden_size)</dd>
+<dt><tt>gamma</tt> : T</dt>
+<dd>1D input tensor with shape (hidden_size)</dd>
+<dt><tt>beta</tt> : T</dt>
+<dd>1D skip tensor with shape (hidden_size)</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>output</tt> : T</dt>
+<dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dd>Constrain input and output types to float or half tensors.</dd>
+</dl>
+
+
+## com.microsoft.nchwc
+### <a name="com.microsoft.nchwc.AveragePool"></a><a name="com.microsoft.nchwc.averagepool">**com.microsoft.nchwc.AveragePool**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>auto_pad</tt> : string</dt>
+<dd></dd>
+<dt><tt>ceil_mode</tt> : int</dt>
+<dd></dd>
+<dt><tt>count_include_pad</tt> : int</dt>
+<dd></dd>
+<dt><tt>dilations</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>kernel_shape</tt> : list of ints (required)</dt>
+<dd></dd>
+<dt><tt>pads</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>strides</tt> : list of ints</dt>
+<dd></dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float)</dt>
+<dd>Constrain input and output types to float tensors</dd>
+</dl>
+
+
+### <a name="com.microsoft.nchwc.Conv"></a><a name="com.microsoft.nchwc.conv">**com.microsoft.nchwc.Conv**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>activation</tt> : string</dt>
+<dd></dd>
+<dt><tt>activation_params</tt> : list of floats</dt>
+<dd></dd>
+<dt><tt>auto_pad</tt> : string</dt>
+<dd></dd>
+<dt><tt>dilations</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>group</tt> : int</dt>
+<dd></dd>
+<dt><tt>kernel_shape</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>pads</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>strides</tt> : list of ints</dt>
+<dd></dd>
+</dl>
+
+#### Inputs (2 - 4)
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+<dt><tt>W</tt> : T</dt>
+<dd></dd>
+<dt><tt>B</tt> (optional) : T</dt>
+<dd></dd>
+<dt><tt>Sum</tt> (optional) : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float)</dt>
+<dd>Constrain input and output types to float tensors</dd>
+</dl>
+
+
+### <a name="com.microsoft.nchwc.GlobalAveragePool"></a><a name="com.microsoft.nchwc.globalaveragepool">**com.microsoft.nchwc.GlobalAveragePool**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float)</dt>
+<dd>Constrain input and output types to float tensors</dd>
+</dl>
+
+
+### <a name="com.microsoft.nchwc.GlobalMaxPool"></a><a name="com.microsoft.nchwc.globalmaxpool">**com.microsoft.nchwc.GlobalMaxPool**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float)</dt>
+<dd>Constrain input and output types to float tensors</dd>
+</dl>
+
+
+### <a name="com.microsoft.nchwc.MaxPool"></a><a name="com.microsoft.nchwc.maxpool">**com.microsoft.nchwc.MaxPool**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>auto_pad</tt> : string</dt>
+<dd></dd>
+<dt><tt>ceil_mode</tt> : int</dt>
+<dd></dd>
+<dt><tt>dilations</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>kernel_shape</tt> : list of ints (required)</dt>
+<dd></dd>
+<dt><tt>pads</tt> : list of ints</dt>
+<dd></dd>
+<dt><tt>storage_order</tt> : int</dt>
+<dd></dd>
+<dt><tt>strides</tt> : list of ints</dt>
+<dd></dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float)</dt>
+<dd>Constrain input and output types to float tensors</dd>
+</dl>
+
+
+### <a name="com.microsoft.nchwc.ReorderInput"></a><a name="com.microsoft.nchwc.reorderinput">**com.microsoft.nchwc.ReorderInput**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float), tensor(int8), tensor(uint8)</dt>
+<dd>Constrain input and output types to float/quantized tensors</dd>
+</dl>
+
+
+### <a name="com.microsoft.nchwc.ReorderOutput"></a><a name="com.microsoft.nchwc.reorderoutput">**com.microsoft.nchwc.ReorderOutput**</a>
+
+  For internal use.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft.nchwc' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>channels</tt> : int</dt>
+<dd></dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd></dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float), tensor(int8), tensor(uint8)</dt>
+<dd>Constrain input and output types to float/quantized tensors</dd>
+</dl>
+
+
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 2cad94aae0481..696e77ac23f9b 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -10,165 +10,211 @@
 | Op Name | Parameters | OpSet Version | Types Supported |
 |---------|------------|---------------|-----------------|
 **Operator Domain:** *ai.onnx.ml*
-|Abs|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(int64), tensor(double)|
+|Abs|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(int64), tensor(double), tensor(uint32), tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), unknown|
 |Acos|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
 |Acosh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
-|Add|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Add|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 |Affine|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 |And|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
 | | ||**T1** = tensor(bool)|
-|ArgMax|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(int32), tensor(float)|
-|ArgMin|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(int32), tensor(float)|
-|ArrayFeatureExtractor|(*in* X:**T**, *in* Y:**tensor(int64)**, *out* Z:**T**)|1+|**T** = tensor(string), tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|ArgMax|(*in* data:**T**, *out* reduced:**tensor(int64)**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ArgMin|(*in* data:**T**, *out* reduced:**tensor(int64)**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ArrayFeatureExtractor|(*in* X:**T**, *in* Y:**tensor(int64)**, *out* Z:**T**)|1+|**T** = tensor(string), tensor(int64), tensor(float), tensor(int32), tensor(double)|
 |Asin|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
 |Asinh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
 |Atan|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
 |Atanh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
-|AveragePool|(*in* X:**T**, *out* Y:**T**)|10+|**T** = tensor(float)|
+|AveragePool|(*in* X:**T**, *out* Y:**T**)|11+|**T** = tensor(float)|
+| | |[10, 10]|**T** = tensor(float)|
 | | |[7, 9]|**T** = tensor(float)|
-|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|[7, 9]|**B** = tensor(float)|
+|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|9+|**B** = tensor(float)|
+| | ||**X** = tensor(float)|
+| | ||**mean** = tensor(float)|
+| | ||**scale** = tensor(float)|
+| | ||**var** = tensor(float)|
+| | |[7, 8]|**B** = tensor(float)|
 | | ||**X** = tensor(float)|
 | | ||**mean** = tensor(float)|
 | | ||**scale** = tensor(float)|
 | | ||**var** = tensor(float)|
 |Binarizer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|BitShift|(*in* X:**T**, *in* Y:**T**, *out* Z:**T**)|11+|**T** = tensor(uint8), tensor(uint32), tensor(uint64)|
 |Cast|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(string)|
-| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | |[6, 9]|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(MLFloat16), tensor(string), tensor(float), tensor(int64), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[6, 9]|**T1** = tensor(MLFloat16), tensor(int64), tensor(float), tensor(uint32), tensor(double), tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), unknown, tensor(bool)|
+| | ||**T2** = tensor(MLFloat16), tensor(string), tensor(float), tensor(int64), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 |CastMap|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = unknown|
-| | ||**T2** = tensor(string), tensor(float), tensor(int64)|
+| | ||**T2** = tensor(int64), tensor(string), tensor(float)|
 |CategoryMapper|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = tensor(string), tensor(int64)|
 | | ||**T2** = tensor(string), tensor(int64)|
 |Ceil|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Clip|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
-|Compress|(*in* input:**T**, *in* condition:**T1**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Clip|(*in* input:**T**, *in* min:**T**, *in* max:**T**, *out* output:**T**) or (*in* input:**T**, *out* output:**T**)|11+|**T** = tensor(float)|
+| | |[6, 10]|**T** = tensor(float)|
+|Compress|(*in* input:**T**, *in* condition:**T1**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 | | ||**T1** = tensor(bool)|
-|Concat|(*in* inputs:**T**, *out* concat_result:**T**)|4+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[9, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|Concat|(*in* inputs:**T**, *out* concat_result:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[4, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|ConcatFromSequence|(*in* input_sequence:**S**, *out* concat_result:**T**)|11+|**S** = unknown|
 |ConstantOfShape|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int64)|
-| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+| | ||**T2** = tensor(MLFloat16), tensor(float), tensor(int64), tensor(double), tensor(uint32), tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), unknown, tensor(bool)|
+|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|11+|**T** = tensor(float)|
+| | |[1, 10]|**T** = tensor(float)|
 |ConvInteger|(*in* x:**T1**, *in* w:**T2**, *in* x_zero_point:**T1**, *in* w_zero_point:**T2**, *out* y:**T3**)|10+|**T1** = tensor(uint8)|
 | | ||**T2** = tensor(uint8)|
 | | ||**T3** = tensor(int32)|
-|ConvTranspose|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|ConvTranspose|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|11+|**T** = tensor(float)|
+| | |[1, 10]|**T** = tensor(float)|
 |Cos|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
 |Cosh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
 |Crop|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|DepthToSpace|(*in* input:**T**, *out* output:**T**)|[1, 4]|**T** = tensor(float)|
+|CumSum|(*in* x:**T**, *in* axis:**T2**, *out* y:**T**)|11+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+|DepthToSpace|(*in* input:**T**, *out* output:**T**)|11+|**T** = tensor(float)|
+| | |[1, 10]|**T** = tensor(float)|
 |DequantizeLinear|(*in* x:**T**, *in* x_scale:**tensor(float)**, *in* x_zero_point:**T**, *out* y:**tensor(float)**)|10+|**x** = tensor(uint8), unknown|
 | | ||**x_scale** = tensor(float)|
 | | ||**x_zero_point** = tensor(uint8), unknown|
 | | ||**y** = tensor(float)|
+|Det|(*in* X:**T**, *out* Y:**T**)|11+|**T** = tensor(float)|
 |DictVectorizer|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = unknown|
-| | ||**T2** = tensor(string), tensor(float), tensor(int64), tensor(double)|
-|Div|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
-|Dropout|(*in* data:**T**, *out* output:**T**, *out* mask:**T**) or (*in* data:**T**, *out* output:**T**, *out* mask:**T1**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T2** = tensor(float), tensor(double), tensor(string), tensor(int64)|
+|Div|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+|Dropout|(*in* data:**T**, *out* output:**T**, *out* mask:**T1**) or (*in* data:**T**, *out* output:**T**, *out* mask:**T**)|10+|**T** = tensor(MLFloat16), tensor(double), tensor(float)|
 | | ||**T1** = tensor(bool)|
-| | |[7, 9]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[7, 9]|**T** = tensor(MLFloat16), tensor(double), tensor(float)|
 | | ||**T1** = tensor(bool)|
-|DynamicSlice|(*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
+|DynamicQuantizeLinear|(*in* x:**T1**, *out* y:**T2**, *out* y_scale:**tensor(float)**, *out* y_zero_point:**T2**)|11+|**T2** = tensor(uint8)|
+|DynamicSlice|(*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *out* output:**T**)|1+|**T** = tensor(MLFloat16), tensor(string), tensor(float), tensor(int64), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
 |Elu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Equal|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|11+|**T** = tensor(float)|
+|Equal|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|11+|**T** = tensor(int64), tensor(float), tensor(bool), tensor(int32)|
 | | ||**T1** = tensor(bool)|
-| | |7+|**T** = tensor(int32), tensor(bool), tensor(int64)|
+| | |[7, 10]|**T** = tensor(int64), tensor(bool), tensor(int32)|
 | | ||**T1** = tensor(bool)|
 |Erf|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
-|Exp|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(double)|
-|Expand|(*in* input:**T**, *in* shape:**tensor(int64)**, *out* output:**T**)|8+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|EyeLike|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(uint64), tensor(int32), tensor(float), tensor(int64), tensor(double)|
-| | ||**T2** = tensor(uint64), tensor(int32), tensor(float), tensor(int64), tensor(double)|
-|FeatureVectorizer|(*in* X:**T1**, *out* Y:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
-|Flatten|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | |[1, 8]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Exp|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(double), tensor(float)|
+|Expand|(*in* input:**T**, *in* shape:**tensor(int64)**, *out* output:**T**)|8+|**T** = tensor(MLFloat16), tensor(float), tensor(int64), tensor(double), tensor(uint32), tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), unknown, tensor(bool)|
+|EyeLike|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int64), tensor(float), tensor(int32), tensor(uint64), tensor(double)|
+| | ||**T2** = tensor(int64), tensor(float), tensor(int32), tensor(uint64), tensor(double)|
+|FeatureVectorizer|(*in* X:**T1**, *out* Y:**tensor(float)**)|1+|**T1** = tensor(int64), tensor(float), tensor(double), tensor(int32)|
+|Flatten|(*in* input:**T**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[1, 8]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[9, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 |Floor|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|GRU|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float), tensor(double)|
+|GRU|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(double), tensor(float)|
 | | ||**T1** = tensor(int32)|
-|Gather|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
-|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|[7, 9]|**T** = tensor(float)|
+|Gather|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+| | |[1, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+|GatherElements|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+|GatherND|(*in* data:**T**, *in* indices:**tensor(int64)**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64)|
+|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|11+|**T** = tensor(float)|
+| | |[7, 8]|**T** = tensor(float)|
+| | |[9, 10]|**T** = tensor(float)|
 |GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 |GlobalLpPool|(*in* X:**T**, *out* Y:**T**)|2+|**T** = tensor(float)|
 |GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|Greater|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int32), tensor(int64)|
+|Greater|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int64), tensor(int32)|
 | | ||**T1** = tensor(bool)|
 | | |[7, 9]|**T** = tensor(float)|
 | | ||**T1** = tensor(bool)|
 |HardSigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Hardmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|Identity|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|If|(*in* cond:**B**, *out* outputs:**V**)|1+|**B** = tensor(bool)|
-| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Hardmax|(*in* input:**T**, *out* output:**T**)|11+|**T** = tensor(float)|
+| | |[1, 10]|**T** = tensor(float)|
+|Identity|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|If|(*in* cond:**B**, *out* outputs:**V**)|11+|**B** = tensor(bool)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[1, 10]|**B** = tensor(bool)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 |ImageScaler|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|Imputer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(int64)|
+|Imputer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(int64), tensor(float)|
 |InstanceNormalization|(*in* input:**T**, *in* scale:**T**, *in* B:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
-|IsInf|(*in* X:**T1**, *out* Y:**T2**)|10+|**T1** = tensor(float), tensor(double)|
+|IsInf|(*in* X:**T1**, *out* Y:**T2**)|10+|**T1** = tensor(double), tensor(float)|
 | | ||**T2** = tensor(bool)|
-|IsNaN|(*in* X:**T1**, *out* Y:**T2**)|9+|**T1** = tensor(float), tensor(MLFloat16)|
+|IsNaN|(*in* X:**T1**, *out* Y:**T2**)|9+|**T1** = tensor(MLFloat16), tensor(float)|
 | | ||**T2** = tensor(bool)|
 |LRN|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|LSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|7+|**T** = tensor(float), tensor(double)|
+|LSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|7+|**T** = tensor(double), tensor(float)|
 | | ||**T1** = tensor(int32)|
-|LabelEncoder|(*in* X:**T1**, *out* Y:**T2**)|2+|**T1** = tensor(string), tensor(float), tensor(int64)|
-| | ||**T2** = tensor(string), tensor(float), tensor(int64)|
+|LabelEncoder|(*in* X:**T1**, *out* Y:**T2**)|2+|**T1** = tensor(int64), tensor(string), tensor(float)|
+| | ||**T2** = tensor(int64), tensor(string), tensor(float)|
 | | |[1, 1]|**T1** = tensor(string), tensor(int64)|
 | | ||**T2** = tensor(string), tensor(int64)|
+|LayerNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *out* Y:**T**, *out* mean:**U**, *out* inv_std_var:**U**)|1+|**T** = tensor(double), tensor(float)|
 |LeakyRelu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Less|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int32), tensor(int64)|
+|Less|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int64), tensor(int32)|
 | | ||**T1** = tensor(bool)|
-| | |[7, 9]|**T** = tensor(float)|
+| | |[7, 9]|**T** = tensor(double), tensor(float)|
 | | ||**T1** = tensor(bool)|
-|LinearClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|LinearClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 | | ||**T2** = tensor(string), tensor(int64)|
 |LinearRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float)|
 |Log|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
-|LogSoftmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|Loop|(*in* M:**I**, *in* cond:**B**, *in* v_initial:**V**, *out* v_final_and_scan_outputs:**V**)|1+|**B** = tensor(bool)|
+|LogSoftmax|(*in* input:**T**, *out* output:**T**)|11+|**T** = tensor(float)|
+| | |[1, 10]|**T** = tensor(float)|
+|Loop|(*in* M:**I**, *in* cond:**B**, *in* v_initial:**V**, *out* v_final_and_scan_outputs:**V**)|11+|**B** = tensor(bool)|
+| | ||**I** = tensor(int64)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[1, 10]|**B** = tensor(bool)|
 | | ||**I** = tensor(int64)|
-| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 |LpNormalization|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|LpPool|(*in* X:**T**, *out* Y:**T**)|2+|**T** = tensor(float)|
-|MatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|[1, 9]|**T** = tensor(float), tensor(double)|
-| | |[9, 9]|**T** = tensor(uint64), tensor(int32), tensor(int64), tensor(uint32)|
+|LpPool|(*in* X:**T**, *out* Y:**T**)|11+|**T** = tensor(float)|
+| | |[2, 10]|**T** = tensor(float)|
+|MatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|9+|**T** = tensor(int64), tensor(float), tensor(uint32), tensor(int32), tensor(uint64), tensor(double)|
+| | |[1, 8]|**T** = tensor(double), tensor(float)|
 |MatMulInteger|(*in* A:**T1**, *in* B:**T2**, *in* a_zero_point:**T1**, *in* b_zero_point:**T2**, *out* Y:**T3**)|10+|**T1** = tensor(uint8)|
-| | ||**T2** = tensor(uint8)|
+| | ||**T2** = tensor(uint8), unknown|
 | | ||**T3** = tensor(int32)|
-|Max|(*in* data_0:**T**, *out* max:**T**)|8+|**T** = tensor(float), tensor(double)|
+|Max|(*in* data_0:**T**, *out* max:**T**)|8+|**T** = tensor(double), tensor(float)|
 | | |[6, 7]|**T** = tensor(float)|
-|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|10+|**I** = tensor(int64)|
+|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|11+|**I** = tensor(int64)|
 | | ||**T** = tensor(float)|
 | | |[1, 7]|**T** = tensor(float)|
+| | |[10, 10]|**I** = tensor(int64)|
+| | ||**T** = tensor(float)|
 | | |[8, 9]|**I** = tensor(int64)|
 | | ||**T** = tensor(float)|
 |MaxRoiPool|(*in* X:**T**, *in* rois:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|MaxUnpool|(*in* X:**T1**, *in* I:**T2**, *in* output_shape:**T2**, *out* output:**T1**)|9+|**T1** = tensor(float)|
+|MaxUnpool|(*in* X:**T1**, *in* I:**T2**, *in* output_shape:**T2**, *out* output:**T1**)|11+|**T1** = tensor(float)|
+| | ||**T2** = tensor(int64)|
+| | |[9, 10]|**T1** = tensor(float)|
 | | ||**T2** = tensor(int64)|
 |Mean|(*in* data_0:**T**, *out* mean:**T**)|8+|**T** = tensor(float)|
 | | |[6, 7]|**T** = tensor(float)|
-|MeanVarianceNormalization|(*in* X:**T**, *out* Y:**T**) or (*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|MeanVarianceNormalization|(*in* input:**T**, *out* output:**T**) or (*in* X:**T**, *out* Y:**T**)|9+|**T** = tensor(float)|
 | | |[1, 8]|**T** = tensor(float)|
 |Min|(*in* data_0:**T**, *out* min:**T**)|8+|**T** = tensor(float)|
 | | |[6, 7]|**T** = tensor(float)|
-|Mod|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|10+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Mul|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Mod|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|10+|**T** = tensor(MLFloat16), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double)|
+|Mul|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 |Multinomial|(*in* input:**T1**, *out* output:**T2**)|7+|**T1** = tensor(float)|
-| | ||**T2** = tensor(int32), tensor(int64)|
-|Neg|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(float), unknown|
-|NonZero|(*in* X:**T**, *out* Y:**tensor(int64)**)|9+|**T** = tensor(int32), tensor(float), tensor(bool), tensor(int64)|
-|Normalizer|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(int64), tensor(int32)|
+|Neg|(*in* X:**T**, *out* Y:**T**)|6+|**T** = unknown, tensor(int32), tensor(double), tensor(float)|
+|NonZero|(*in* X:**T**, *out* Y:**tensor(int64)**)|9+|**T** = tensor(int64), tensor(float), tensor(int32), tensor(uint8), tensor(bool)|
+|Normalizer|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 |Not|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(bool)|
 | | ||**T1** = tensor(bool)|
-|OneHot|(*in* indices:**T1**, *in* depth:**T2**, *in* values:**T3**, *out* output:**T3**)|9+|**T1** = tensor(int32), tensor(float), tensor(int64)|
+|OneHot|(*in* indices:**T1**, *in* depth:**T2**, *in* values:**T3**, *out* output:**T3**)|11+|**T1** = tensor(int32), tensor(float), tensor(int64)|
+| | ||**T2** = tensor(int32), tensor(float), tensor(int64)|
+| | ||**T3** = tensor(int32), tensor(float), tensor(string), tensor(int64)|
+| | |[9, 10]|**T1** = tensor(int32), tensor(float), tensor(int64)|
 | | ||**T2** = tensor(int32), tensor(float), tensor(int64)|
-| | ||**T3** = tensor(string), tensor(int32), tensor(float), tensor(int64)|
-|OneHotEncoder|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(string), tensor(float), tensor(int64), tensor(double)|
+| | ||**T3** = tensor(int32), tensor(float), tensor(string), tensor(int64)|
+|OneHotEncoder|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float), tensor(double), tensor(string), tensor(int64)|
 |Or|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
 | | ||**T1** = tensor(bool)|
 |PRelu|(*in* X:**T**, *in* slope:**T**, *out* Y:**T**)|[7, 9]|**T** = tensor(float)|
-|Pad|(*in* data:**T**, *out* output:**T**)|2+|**T** = tensor(float)|
+|Pad|(*in* data:**T**, *in* pads:**tensor(int64)**, *in* constant_value:**T**, *out* output:**T**) or (*in* data:**T**, *out* output:**T**)|11+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+| | |[2, 10]|**T** = tensor(float)|
 |ParametricSoftplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|Pow|(*in* X:**T**, *in* Y:**T**, *out* Z:**T**)|7+|**T** = tensor(float), tensor(double)|
+|Pow|(*in* X:**T**, *in* Y:**T**, *out* Z:**T**)|7+|**T** = tensor(double), tensor(float)|
 |QLinearConv|(*in* x:**T1**, *in* x_scale:**tensor(float)**, *in* x_zero_point:**T1**, *in* w:**T2**, *in* w_scale:**tensor(float)**, *in* w_zero_point:**T2**, *in* y_scale:**tensor(float)**, *in* y_zero_point:**T3**, *in* B:**T4**, *out* y:**T3**)|10+|**T1** = tensor(uint8)|
 | | ||**T2** = tensor(uint8)|
 | | ||**T3** = tensor(uint8)|
@@ -181,108 +227,166 @@
 | | ||**y_zero_point** = tensor(uint8), unknown|
 |RNN|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float)|
 | | ||**T1** = tensor(int32)|
-|RandomNormal|(*out* output:**T**)|1+|**T** = tensor(float), tensor(double)|
-|RandomNormalLike|(*in* input:**T1**, *out* output:**T2**)|1+|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T2** = tensor(float), tensor(double)|
-|RandomUniform|(*out* output:**T**)|1+|**T** = tensor(float), tensor(double)|
-|RandomUniformLike|(*in* input:**T1**, *out* output:**T2**)|1+|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T2** = tensor(float), tensor(double)|
+|RandomNormal|(*out* output:**T**)|1+|**T** = tensor(double), tensor(float)|
+|RandomNormalLike|(*in* input:**T1**, *out* output:**T2**)|1+|**T1** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**T2** = tensor(double), tensor(float)|
+|RandomUniform|(*out* output:**T**)|1+|**T** = tensor(double), tensor(float)|
+|RandomUniformLike|(*in* input:**T1**, *out* output:**T2**)|1+|**T1** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**T2** = tensor(double), tensor(float)|
+|Range|(*in* start:**T**, *in* limit:**T**, *in* delta:**T**, *out* output:**T**)|11+|**T** = tensor(int64), tensor(float), tensor(int32), tensor(int16), tensor(double)|
 |Reciprocal|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|ReduceL1|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceL2|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceLogSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceLogSumExp|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceMax|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceMean|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceMin|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceProd|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
-|ReduceSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float), tensor(double)|
-|ReduceSumSquare|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float), tensor(double)|
+|ReduceL1|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ReduceL2|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ReduceLogSum|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ReduceLogSumExp|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ReduceMax|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(int64), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(int64), tensor(float)|
+|ReduceMean|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ReduceMin|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(int64), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(int64), tensor(float)|
+|ReduceProd|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(float)|
+|ReduceSum|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+|ReduceSumSquare|(*in* data:**T**, *out* reduced:**T**)|11+|**T** = tensor(int32), tensor(double), tensor(float)|
+| | |[1, 10]|**T** = tensor(int32), tensor(double), tensor(float)|
 |Relu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Reshape|(*in* data:**T**, *in* shape:**tensor(int64)**, *out* reshaped:**T**) or (*in* data:**T**, *out* reshaped:**T**)|5+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Reshape|(*in* data:**T**, *in* shape:**tensor(int64)**, *out* reshaped:**T**) or (*in* data:**T**, *out* reshaped:**T**)|5+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 | | ||**shape** = tensor(int64)|
-|Reshape_1||[1, 4]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Resize|(*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|10+|**T** = tensor(int32), tensor(float), tensor(uint8)|
-|ReverseSequence|(*in* input:**T**, *in* sequence_lens:**tensor(int64)**, *out* Y:**T**)|10+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|RoiAlign|(*in* X:**T1**, *in* rois:**T1**, *in* batch_indices:**T2**, *out* Y:**T1**)|10+|**T** = tensor(float), tensor(double)|
+|Reshape_1||[1, 4]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|Resize|(*in* X:**T1**, *in* roi:**T2**, *in* scales:**tensor(float)**, *in* sizes:**tensor(int64)**, *out* Y:**T1**) or (*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|11+|**T** = tensor(int32), tensor(uint8), tensor(float)|
+| | |[10, 10]|**T** = tensor(int32), tensor(uint8), tensor(float)|
+|ReverseSequence|(*in* input:**T**, *in* sequence_lens:**tensor(int64)**, *out* Y:**T**)|10+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|RoiAlign|(*in* X:**T1**, *in* rois:**T1**, *in* batch_indices:**T2**, *out* Y:**T1**)|10+|**T** = tensor(double), tensor(float)|
 | | ||**T2** = tensor(int64)|
-|SVMClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Round|(*in* X:**T**, *out* Y:**T**)|11+|**T** = tensor(MLFloat16), tensor(double), tensor(float)|
+|SVMClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 | | ||**T2** = tensor(string), tensor(int64)|
 |SVMRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float)|
 |Scale|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
 |ScaledTanh|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|Scaler|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
-|Scan|(*in* sequence_lens:**I**, *in* initial_state_and_scan_inputs:**V**, *out* final_state_and_scan_outputs:**V**) or (*in* initial_state_and_scan_inputs:**V**, *out* final_state_and_scan_outputs:**V**)|9+|**I** = tensor(int64)|
-| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Scaler|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+|Scan|(*in* sequence_lens:**I**, *in* initial_state_and_scan_inputs:**V**, *out* final_state_and_scan_outputs:**V**) or (*in* initial_state_and_scan_inputs:**V**, *out* final_state_and_scan_outputs:**V**)|11+|**I** = tensor(int64)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 | | |[8, 8]|**I** = tensor(int64)|
-| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Scatter|(*in* data:**T**, *in* indices:**Tind**, *in* updates:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[9, 10]|**I** = tensor(int64)|
+| | ||**V** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|Scatter|(*in* data:**T**, *in* indices:**Tind**, *in* updates:**T**, *out* output:**T**)|[9, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+|ScatterElements|(*in* data:**T**, *in* indices:**Tind**, *in* updates:**T**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+|ScatterND|(*in* data:**T**, *in* indices:**tensor(int64)**, *in* updates:**T**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64)|
 |Selu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Shape|(*in* data:**T**, *out* shape:**T1**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|SequenceAt|(*in* input_sequence:**S**, *in* position:**I**, *out* tensor:**T**)|11+|**I** = tensor(int64), tensor(int32)|
+| | ||**S** = unknown|
+| | ||**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|SequenceConstruct|(*in* inputs:**T**, *out* output_sequence:**S**)|11+|**S** = unknown|
+| | ||**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|SequenceEmpty|(*out* output:**S**)|11+|**S** = unknown|
+|SequenceErase|(*in* input_sequence:**S**, *in* position:**I**, *out* output_sequence:**S**)|11+|**I** = tensor(int64), tensor(int32)|
+| | ||**S** = unknown|
+|SequenceInsert|(*in* input_sequence:**S**, *in* tensor:**T**, *in* position:**I**, *out* output_sequence:**S**)|11+|**I** = tensor(int64), tensor(int32)|
+| | ||**S** = unknown|
+|SequenceLength|(*in* input_sequence:**S**, *out* length:**I**)|11+|**I** = tensor(int64)|
+| | ||**S** = unknown|
+|Shape|(*in* data:**T**, *out* shape:**T1**)|1+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 | | ||**T1** = tensor(int64)|
-|Shrink|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Shrink|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double)|
 |Sigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Sign|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Sin|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float), tensor(double)|
+|Sign|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double)|
+|Sin|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(double), tensor(float)|
 |Sinh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
-|Size|(*in* data:**T**, *out* size:**T1**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(int64), tensor(double)|
+|Size|(*in* data:**T**, *out* size:**T1**)|1+|**T** = tensor(string), tensor(float), tensor(int64), tensor(double), tensor(uint32), tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), unknown, tensor(bool)|
 | | ||**T1** = tensor(int64)|
-|Slice|(*in* data:**T**, *out* output:**T**) or (*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *in* steps:**Tind**, *out* output:**T**)|10+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
-| | |[1, 9]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Softmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Slice|(*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *in* steps:**Tind**, *out* output:**T**) or (*in* data:**T**, *out* output:**T**)|11+|**T** = tensor(MLFloat16), tensor(string), tensor(float), tensor(int64), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+| | |[1, 9]|**T** = tensor(MLFloat16), tensor(string), tensor(float), tensor(int64), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[10, 10]|**T** = tensor(MLFloat16), tensor(string), tensor(float), tensor(int64), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+|Softmax|(*in* input:**T**, *out* output:**T**)|11+|**T** = tensor(float)|
+| | |[1, 10]|**T** = tensor(float)|
 |Softplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 |Softsign|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
 |SpaceToDepth|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|Split|(*in* input:**T**, *out* outputs:**T**) or (*in* input:**T**, *in* split:**T**, *out* outputs...:**T**)|2+|**T** = tensor(string), tensor(int32), tensor(float)|
-|Sqrt|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(double)|
-|Squeeze|(*in* data:**T**, *out* squeezed:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Split|(*in* input:**T**, *in* split:**T**, *out* outputs...:**T**) or (*in* input:**T**, *out* outputs:**T**)|11+|**T** = tensor(int32), tensor(string), tensor(float)|
+| | |[2, 10]|**T** = tensor(int32), tensor(string), tensor(float)|
+|SplitToSequence|(*in* input:**T**, *in* split:**I**, *out* output_sequence:**S**)|11+|**I** = tensor(int64), tensor(int32)|
+| | ||**S** = unknown|
+| | ||**T** = tensor(int32), tensor(double), tensor(string), tensor(float)|
+|Sqrt|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(double), tensor(float)|
+|Squeeze|(*in* data:**T**, *out* squeezed:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[1, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 |StringNormalizer|(*in* X:**tensor(string)**, *out* Y:**tensor(string)**)|10+|**T** = tensor(string)|
-|Sub|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Sub|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 |Sum|(*in* data_0:**T**, *out* sum:**T**)|8+|**T** = tensor(float)|
 | | |[6, 7]|**T** = tensor(float)|
 |Tan|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
 |Tanh|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
-|TfIdfVectorizer|(*in* X:**T**, *out* Y:**T1**)|9+|**T** = tensor(string), tensor(int32), tensor(int64)|
+|TfIdfVectorizer|(*in* X:**T**, *out* Y:**T1**)|9+|**T** = tensor(int64), tensor(string), tensor(int32)|
 | | ||**T1** = tensor(float)|
-|ThresholdedRelu|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-| | |10+|**T** = tensor(float)|
-|Tile|(*in* input:**T**, *in* tiles:**T**, *in* axis:**T**, *out* output:**T**) or (*in* input:**T**, *in* repeats:**T1**, *out* output:**T**)|6+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(int64), tensor(double)|
+|ThresholdedRelu|(*in* X:**T**, *out* Y:**T**)|10+|**T** = tensor(float)|
+| | |[1, 9]|**T** = tensor(float)|
+|Tile|(*in* input:**T**, *in* tiles:**T**, *in* axis:**T**, *out* output:**T**) or (*in* input:**T**, *in* repeats:**T1**, *out* output:**T**)|6+|**T** = tensor(float), tensor(int64), tensor(double), tensor(uint32), tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), unknown, tensor(bool)|
 | | ||**T1** = tensor(int64)|
-|TopK|(*in* X:**T**, *in* K:**tensor(int64)**, *out* Values:**T**, *out* Indices:**I**) or (*in* X:**T**, *out* Values:**T**, *out* Indices:**I**)|10+|**I** = tensor(int64)|
+|TopK|(*in* X:**T**, *in* K:**tensor(int64)**, *out* Values:**T**, *out* Indices:**I**) or (*in* X:**T**, *out* Values:**T**, *out* Indices:**I**)|11+|**I** = tensor(int64)|
 | | ||**T** = tensor(float)|
 | | |[1, 9]|**I** = tensor(int64)|
 | | ||**T** = tensor(float)|
-|Transpose|(*in* data:**T**, *out* transposed:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|TreeEnsembleClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+| | |[10, 10]|**I** = tensor(int64)|
+| | ||**T** = tensor(float)|
+|Transpose|(*in* data:**T**, *out* transposed:**T**)|1+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|TreeEnsembleClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
 | | ||**T2** = tensor(string), tensor(int64)|
-|TreeEnsembleRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float)|
-|Unsqueeze|(*in* data:**T**, *out* expanded:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Upsample|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|[7, 9]|**T** = tensor(int32), tensor(float), tensor(uint8)|
-|Where|(*in* condition:**B**, *in* X:**T**, *in* Y:**T**, *out* output:**T**)|9+|**T** = tensor(string), tensor(int32), tensor(float)|
+|TreeEnsembleRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(int64), tensor(double), tensor(float)|
+|Unique|(*in* X:**T**, *out* Y:**T**, *out* indices:**tensor(int64)**, *out* inverse_indices:**tensor(int64)**, *out* counts:**tensor(int64)**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|Unsqueeze|(*in* data:**T**, *out* expanded:**T**)|11+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | |[1, 10]|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+|Upsample|(*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**)|[7, 9]|**T** = tensor(int32), tensor(uint8), tensor(float)|
+|Where|(*in* condition:**B**, *in* X:**T**, *in* Y:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int64), tensor(string), tensor(float)|
 |Xor|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
 | | ||**T1** = tensor(bool)|
 |ZipMap|(*in* X:**tensor(float)**, *out* Z:**T**)|1+|**T** = unknown|
 | |
 | |
 **Operator Domain:** *com.microsoft*
-|AttnLSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *in* QW:**T**, *in* MW:**T**, *in* V:**T**, *in* M:**T**, *in* memory_seq_lens:**T1**, *in* AW:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|1+|**T** = tensor(float), tensor(double)|
+|AttnLSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *in* QW:**T**, *in* MW:**T**, *in* V:**T**, *in* M:**T**, *in* memory_seq_lens:**T1**, *in* AW:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|1+|**T** = tensor(double), tensor(float)|
 | | ||**T1** = tensor(int32)|
+|CDist|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|1+|**T** = tensor(double), tensor(float)|
 |ConvTransposeWithDynamicPads|(*in* X:**T**, *in* W:**T**, *in* Pads:**tensor(int64)**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 |CropAndResize|(*in* X:**T1**, *in* rois:**T1**, *in* batch_indices:**T2**, *in* crop_size:**T2**, *out* Y:**T1**)|1+|**T** = tensor(float)|
 | | ||**T2** = tensor(int32)|
-|ExpandDims|(*in* X:**T**, *in* axis:**tensor(int32)**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|DequantizeLinear|(*in* x:**T2**, *in* x_scale:**T1**, *in* x_zero_point:**T2**, *out* y:**T1**)|1+|**axis** = unknown|
+| | ||**x** = tensor(uint8), unknown|
+| | ||**x_scale** = tensor(float)|
+| | ||**x_zero_point** = tensor(uint8), unknown|
+| | ||**y** = tensor(float)|
+|ExpandDims|(*in* X:**T**, *in* axis:**tensor(int32)**, *out* Y:**T**)|1+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
 | | ||**axis** = tensor(int32)|
 |FusedConv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 |FusedGemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|GatherND|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
+|GatherND|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(MLFloat16), tensor(bfloat16), tensor(string), tensor(int64), tensor(float), tensor(uint32), unknown, tensor(int32), tensor(uint8), tensor(int16), tensor(uint64), tensor(uint16), tensor(double), tensor(bool)|
+| | ||**Tind** = tensor(int64), tensor(int32)|
+|Gelu|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|MatMulInteger16|(*in* A:**T1**, *in* B:**T2**, *out* Y:**T3**)|1+|**T1** = tensor(int16)|
+| | ||**T2** = tensor(int16)|
+| | ||**T3** = tensor(int32)|
 |MaxpoolWithMask|(*in* X:**T**, *in* M:**tensor(int32)**, *out* Y:**T**)|1+|**X** = tensor(float)|
-|MurmurHash3|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = tensor(string), tensor(int32), tensor(uint32)|
-| | ||**T2** = tensor(int32), tensor(uint32)|
+|MurmurHash3|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = tensor(uint32), tensor(string), tensor(int32)|
+| | ||**T2** = tensor(uint32), tensor(int32)|
 |Pad|(*in* data:**T**, *in* pads:**tensor(int64)**, *in* value:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
-|Range|(*in* start:**T**, *in* limit:**T**, *in* delta:**T**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(int16), tensor(double)|
+|QuantizeLinear|(*in* x:**T1**, *in* y_scale:**T1**, *in* y_zero_point:**T2**, *out* y:**T2**)|1+|**axis** = unknown|
+| | ||**x** = tensor(float)|
+| | ||**y** = tensor(uint8)|
+| | ||**y_scale** = tensor(float)|
+| | ||**y_zero_point** = tensor(uint8)|
+|Range|(*in* start:**T**, *in* limit:**T**, *in* delta:**T**, *out* Y:**T**)|1+|**T** = tensor(int64), tensor(float), tensor(int32), tensor(int16), tensor(double)|
 |SampleOp|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 |Tokenizer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(string)|
 |Unique|(*in* x:**T**, *out* y:**T**, *out* idx:**tensor(int64)**, *out* counts:**tensor(int64)**)|1+|**T** = tensor(float)|
@@ -300,171 +404,3 @@
 |ReorderOutput|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
 | |
 | |
-
-
-## Operators implemented by CUDAExecutionProvider
-
-| Op Name | Parameters | OpSet Version | Types Supported |
-|---------|------------|---------------|-----------------|
-**Operator Domain:** *ai.onnx.ml*
-|Abs|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Add|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Affine|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|And|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
-| | ||**T1** = tensor(bool)|
-|ArgMax|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ArgMin|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|AveragePool|(*in* X:**T**, *out* Y:**T**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[7, 9]|**I** = tensor(int64)|
-| | ||**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|9+|**B** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**X** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**mean** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**scale** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**var** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[7, 8]|**B** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**X** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**mean** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**scale** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**var** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Cast|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | |[6, 8]|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Ceil|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Compress|(*in* input:**T**, *in* condition:**T1**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T1** = tensor(bool)|
-|Concat|(*in* inputs:**T**, *out* concat_result:**T**)|4+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|ConstantOfShape|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int64)|
-| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ConvTranspose|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Crop|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Div|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Dropout|(*in* data:**T**, *out* output:**T**, *out* mask:**T**) or (*in* data:**T**, *out* output:**T**, *out* mask:**T1**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**T1** = tensor(bool)|
-| | |[7, 9]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|DynamicSlice|(*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
-|Elu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Equal|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(int32), tensor(bool), tensor(int64)|
-|Erf|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Exp|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Expand|(*in* input:**T**, *in* shape:**tensor(int64)**, *out* output:**T**)|8+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Flatten|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | |[1, 8]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Floor|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|GRU|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**T1** = tensor(int32)|
-|Gather|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
-|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|9+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[7, 8]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Greater|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T1** = tensor(bool)|
-| | |[7, 8]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|HardSigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Identity|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|ImageScaler|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|InstanceNormalization|(*in* input:**T**, *in* scale:**T**, *in* B:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|LRN|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|LSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**T1** = tensor(int32)|
-|LeakyRelu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Log|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|MatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|9+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[1, 8]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Max|(*in* data_0:**T**, *out* max:**T**)|8+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[6, 7]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[1, 7]|**I** = tensor(int64)|
-| | ||**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[8, 9]|**I** = tensor(int64)|
-| | ||**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|MemcpyFromHost|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|MemcpyToHost|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Min|(*in* data_0:**T**, *out* min:**T**)|8+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |[6, 7]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Mul|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Neg|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(int16), unknown, tensor(float), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Or|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
-| | ||**T1** = tensor(bool)|
-|PRelu|(*in* X:**T**, *in* slope:**T**, *out* Y:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Pad|(*in* data:**T**, *out* output:**T**)|2+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ParametricSoftplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Pow|(*in* X:**T**, *in* Y:**T**, *out* Z:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|RNN|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**T1** = tensor(int32)|
-|Reciprocal|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceL1|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceL2|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceLogSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceLogSumExp|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceMax|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceMean|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceMin|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceProd|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ReduceSumSquare|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Relu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Reshape|(*in* data:**T**, *in* shape:**tensor(int64)**, *out* reshaped:**T**) or (*in* data:**T**, *out* reshaped:**T**)|5+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**shape** = tensor(int64)|
-|Reshape_1||[1, 4]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Resize|(*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|10+|**T** = tensor(int32), tensor(float), tensor(MLFloat16), tensor(uint8), tensor(double)|
-|ScaledTanh|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Selu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Shape|(*in* data:**T**, *out* shape:**T1**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**T1** = tensor(int64)|
-|Shrink|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Sigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Slice|(*in* data:**T**, *out* output:**T**) or (*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *in* steps:**Tind**, *out* output:**T**)|10+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
-| | |[1, 9]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | ||**Tind** = tensor(int32), tensor(int64)|
-|Softmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Softplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Softsign|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Split|(*in* input:**T**, *out* outputs:**T**) or (*in* input:**T**, *in* split:**T**, *out* outputs...:**T**)|2+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Sqrt|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Squeeze|(*in* data:**T**, *out* squeezed:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Sub|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Sum|(*in* data_0:**T**, *out* sum:**T**)|8+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-| | |[6, 7]|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Tanh|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|ThresholdedRelu|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | |10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Tile|(*in* input:**T**, *in* tiles:**T**, *in* axis:**T**, *out* output:**T**) or (*in* input:**T**, *in* repeats:**T1**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-| | ||**T1** = tensor(int64)|
-|Transpose|(*in* data:**T**, *out* transposed:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
-|Unsqueeze|(*in* data:**T**, *out* expanded:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
-|Upsample|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|[7, 9]|**T** = tensor(int32), tensor(float), tensor(MLFloat16), tensor(uint8), tensor(double)|
-|Xor|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
-| | ||**T1** = tensor(bool)|
-| |
-| |
-**Operator Domain:** *com.microsoft*
-|ConvTransposeWithDynamicPads|(*in* X:**T**, *in* W:**T**, *in* Pads:**tensor(int64)**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-| |
-| |
-
-
-## Operators implemented by MKLDNNExecutionProvider
-
-| Op Name | Parameters | OpSet Version | Types Supported |
-|---------|------------|---------------|-----------------|
-**Operator Domain:** *ai.onnx.ml*
-|AveragePool|(*in* X:**T**, *out* Y:**T**)|[7, 8]|**T** = tensor(float)|
-|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|7+|**T** = tensor(float)|
-|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)|
-|GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|[1, 8]|**T** = tensor(float)|
-|GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|[1, 8]|**T** = tensor(float)|
-|LRN|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
-|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|[1, 7]|**T** = tensor(float)|
-| | |[8, 8]|**T** = tensor(float)|
-|Relu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
-|Sum|(*in* data_0:**T**, *out* sum:**T**)|6+|**T** = tensor(float)|
-| |
-| |
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index b484911f6d3a3..80901336e80ea 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -1153,7 +1153,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, double, TreeEnsembleClassifier);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int64_t, TreeEnsembleClassifier);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int32_t, TreeEnsembleClassifier);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, TreeEnsembleRegressor);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, float, TreeEnsembleRegressor);  // typed on input T; each type needs a BuildKernelCreateInfo entry below and a kernel definition in ml/treeregressor.cc
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, double, TreeEnsembleRegressor);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int64_t, TreeEnsembleRegressor);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int32_t, TreeEnsembleRegressor);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, ZipMap);
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 2, float_string, LabelEncoder);
@@ -1236,8 +1239,14 @@ Status RegisterOnnxMLOperatorKernels(KernelRegistry& kernel_registry) {
                                                                   TreeEnsembleClassifier)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int32_t,
                                                                   TreeEnsembleClassifier)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1,
-                                                            TreeEnsembleRegressor)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, float,
+                                                                  TreeEnsembleRegressor)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, double,
+                                                                  TreeEnsembleRegressor)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int64_t,
+                                                                  TreeEnsembleRegressor)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, int32_t,
+                                                                  TreeEnsembleRegressor)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 1, ZipMap)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMLDomain, 2, float_string,
diff --git a/onnxruntime/core/providers/cpu/ml/treeregressor.cc b/onnxruntime/core/providers/cpu/ml/treeregressor.cc
index 5af9fffa88713..ed015e0efafbb 100644
--- a/onnxruntime/core/providers/cpu/ml/treeregressor.cc
+++ b/onnxruntime/core/providers/cpu/ml/treeregressor.cc
@@ -6,11 +6,19 @@
 namespace onnxruntime {
 namespace ml {
 
-ONNX_CPU_OPERATOR_ML_KERNEL(
-    TreeEnsembleRegressor,
-    1,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()).MayInplace(0, 0),
-    TreeEnsembleRegressor<float>);
+#define ADD_IN_TYPE_TREE_ENSEMBLE_REGRESSOR_OP(in_type)                                                                                                                                          \
+  ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(                                                                                                                                                              \
+      TreeEnsembleRegressor,                                                                                                                                                                     \
+      1,                                                                                                                                                                                          \
+      in_type,                                                                                                                                                                                    \
+      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<in_type>()).MayInplace(0, 0), \
+      TreeEnsembleRegressor<in_type>);
+
+ADD_IN_TYPE_TREE_ENSEMBLE_REGRESSOR_OP(float);  // one typed kernel per input element type for constraint "T"; keep this list in sync with the declarations and BuildKernelCreateInfo entries in cpu_execution_provider.cc
+ADD_IN_TYPE_TREE_ENSEMBLE_REGRESSOR_OP(double);
+ADD_IN_TYPE_TREE_ENSEMBLE_REGRESSOR_OP(int64_t);
+ADD_IN_TYPE_TREE_ENSEMBLE_REGRESSOR_OP(int32_t);
+
 
 template <typename T>
 TreeEnsembleRegressor<T>::TreeEnsembleRegressor(const OpKernelInfo& info)
diff --git a/onnxruntime/test/providers/cpu/ml/treeregressor_test.cc b/onnxruntime/test/providers/cpu/ml/treeregressor_test.cc
index 31cfcc7cadb5b..4d8782e121cc0 100644
--- a/onnxruntime/test/providers/cpu/ml/treeregressor_test.cc
+++ b/onnxruntime/test/providers/cpu/ml/treeregressor_test.cc
@@ -7,7 +7,8 @@
 namespace onnxruntime {
 namespace test {
 
-void GenTreeAndRunTest(const std::vector<float>& X, const std::vector<float>& base_values, const std::vector<float>& results, const std::string& aggFunction)
+template <typename T>
+void GenTreeAndRunTest(const std::vector<T>& X, const std::vector<float>& base_values, const std::vector<float>& results, const std::string& aggFunction)
 {
   OpTester test("TreeEnsembleRegressor", 1, onnxruntime::kMLDomain);
 
@@ -51,7 +52,7 @@ void GenTreeAndRunTest(const std::vector<float>& X, const std::vector<float>& ba
   } // default function is SUM
 
   //fill input data
-  test.AddInput<float>("X", {8, 3}, X);
+  test.AddInput<T>("X", {8, 3}, X);
   test.AddOutput<float>("Y", {8, 2}, results);
   test.Run();
 }
@@ -60,23 +61,31 @@ TEST(MLOpTest, TreeRegressorMultiTargetAverage) {
   std::vector<float> X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f};
   std::vector<float> results = {1.33333333f, 29.f, 3.f, 14.f, 2.f, 23.f, 2.f, 23.f, 2.f, 23.f, 2.66666667f, 17.f, 2.f, 23.f, 3.f, 14.f};
   std::vector<float> base_values{0.f, 0.f};
-  GenTreeAndRunTest(X, base_values, results, "AVERAGE");
+  GenTreeAndRunTest<float>(X, base_values, results, "AVERAGE");
 }
 
 TEST(MLOpTest, TreeRegressorMultiTargetMin) {
   std::vector<float> X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f};
   std::vector<float> results = {5.f, 28.f, 8.f, 19.f, 7.f, 28.f, 7.f, 28.f, 7.f, 28.f, 7.f, 19.f, 7.f, 28.f, 8.f, 19.f};
   std::vector<float> base_values{5.f, 5.f};
-  GenTreeAndRunTest(X, base_values, results, "MIN");
+  GenTreeAndRunTest<float>(X, base_values, results, "MIN");
 }
 
 TEST(MLOpTest, TreeRegressorMultiTargetMax) {
   std::vector<float> X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f};
   std::vector<float> results = {2.f, 41.f, 3.f, 14.f, 2.f, 23.f, 2.f, 23.f, 2.f, 23.f, 3.f, 23.f, 2.f, 23.f, 3.f, 14.f};
   std::vector<float> base_values{0.f, 0.f};
-  GenTreeAndRunTest(X, base_values, results, "MAX");
+  GenTreeAndRunTest<float>(X, base_values, results, "MAX");
 }
 
+TEST(MLOpTest, TreeRegressorMultiTargetMaxDouble) {  // same data and expectations as TreeRegressorMultiTargetMax, but X is supplied as double
+  std::vector<double> X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f};  // float literals widen to double without changing value
+  std::vector<float> results = {2.f, 41.f, 3.f, 14.f, 2.f, 23.f, 2.f, 23.f, 2.f, 23.f, 3.f, 23.f, 2.f, 23.f, 3.f, 14.f};  // expected Y stays float; only the input element type differs
+  std::vector<float> base_values{0.f, 0.f};
+  GenTreeAndRunTest<double>(X, base_values, results, "MAX");
+}
+
+
 TEST(MLOpTest, TreeRegressorSingleTargetSum) {
   OpTester test("TreeEnsembleRegressor", 1, onnxruntime::kMLDomain);
 
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index e9cb0b0418c80..cad6cd1ce40f6 100755
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -831,23 +831,23 @@ def generate_documentation(source_dir, build_dir, configs):
     for config in configs:
         #copy the gen_doc.py
         shutil.copy(os.path.join(source_dir,'tools','python','gen_doc.py'),
-                    os.path.join(build_dir,config, config))
+                    os.path.join(build_dir,config, 'gen_doc.py'))
         shutil.copy(os.path.join(source_dir,'tools','python','gen_opkernel_doc.py'),
-                    os.path.join(build_dir,config, config))
+                    os.path.join(build_dir,config, 'gen_opkernel_doc.py'))
 
         run_subprocess([
                         sys.executable,
                         'gen_doc.py',
                         '--output_path', operator_doc_path
                     ],
-                    cwd = os.path.join(build_dir,config, config))
+                    cwd = os.path.join(build_dir,config))
 
         run_subprocess([
                         sys.executable,
                         'gen_opkernel_doc.py',
                         '--output_path', opkernel_doc_path
                     ],
-                    cwd = os.path.join(build_dir,config, config))
+                    cwd = os.path.join(build_dir,config))
 
     docdiff = ''
     try: