From c50144429ed4866df07ecd477a98e365c368fcc5 Mon Sep 17 00:00:00 2001 From: Alexandre Eichenberger Date: Thu, 19 Oct 2023 11:38:30 -0400 Subject: [PATCH] added unidirectional text for LayerNorm Signed-off-by: Alexandre Eichenberger --- docs/Changelog-ml.md | 93 +- docs/Changelog.md | 1066 +++----------- docs/Operators-ml.md | 146 +- docs/Operators.md | 3123 +++++++----------------------------------- onnx/defs/nn/defs.cc | 4 +- 5 files changed, 696 insertions(+), 3736 deletions(-) diff --git a/docs/Changelog-ml.md b/docs/Changelog-ml.md index 0beb74a5022..f67a6854648 100644 --- a/docs/Changelog-ml.md +++ b/docs/Changelog-ml.md @@ -227,7 +227,7 @@ This version of the operator has been available since version 1 of the 'ai.onnx. ### **ai.onnx.ml.FeatureVectorizer-1** Concatenates input tensors into one continuous output.
- All input shapes are 2-D and are concatenated along the second dimension. 1-D tensors are treated as [1,C]. + All input shapes are 2-D and are concatenated along the second dimension. 1-D tensors are treated as [1,C]. Inputs are copied to the output maintaining the order of the input arguments.<br>
All inputs must be integers or floats, while the output will be all floating point values. @@ -740,7 +740,7 @@ This version of the operator has been available since version 1 of the 'ai.onnx.
nodes_hitrates : list of floats
Popularity of each node, used for performance and may be omitted.
nodes_missing_value_tracks_true : list of ints
-
For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
This attribute may be left undefined, and the default value is false (0) for all nodes.
+
For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
This attribute may be left undefined, and the default value is false (0) for all nodes.</dd>
nodes_modes : list of strings
The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
One of 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
nodes_nodeids : list of ints
@@ -813,7 +813,7 @@ This version of the operator has been available since version 1 of the 'ai.onnx.
nodes_hitrates : list of floats
Popularity of each node, used for performance and may be omitted.
nodes_missing_value_tracks_true : list of ints
-
For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this array.
This attribute may be left undefined and the default value is false (0) for all nodes.
+
For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this array.
This attribute may be left undefined and the default value is false (0) for all nodes.</dd>
nodes_modes : list of strings
The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
One of 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
nodes_nodeids : list of ints
@@ -1018,7 +1018,7 @@ This version of the operator has been available since version 3 of the 'ai.onnx.
nodes_hitrates_as_tensor : tensor
Popularity of each node, used for performance and may be omitted.
nodes_missing_value_tracks_true : list of ints
-
For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
This attribute may be left undefined, and the default value is false (0) for all nodes.
+
For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
This attribute may be left undefined, and the default value is false (0) for all nodes.</dd>
nodes_modes : list of strings
The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
One of 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
nodes_nodeids : list of ints
@@ -1085,9 +1085,9 @@ This version of the operator has been available since version 3 of the 'ai.onnx.
aggregate_function : string (default is SUM)
Defines how to aggregate leaf values within a target.
One of 'AVERAGE,' 'SUM,' 'MIN,' 'MAX.'
base_values : list of floats
-
Base values for regression, added to final prediction after applying aggregate_function; the size must be the same as the classes or can be left unassigned (assumed 0)
+
Base values for regression, added to final prediction after applying aggregate_function; the size must be the same as the classes or can be left unassigned (assumed 0)</dd>
base_values_as_tensor : tensor
-
Base values for regression, added to final prediction after applying aggregate_function; the size must be the same as the classes or can be left unassigned (assumed 0)
+
Base values for regression, added to final prediction after applying aggregate_function; the size must be the same as the classes or can be left unassigned (assumed 0)</dd>
n_targets : int
The total number of targets.
nodes_falsenodeids : list of ints
@@ -1099,7 +1099,7 @@ This version of the operator has been available since version 3 of the 'ai.onnx.
nodes_hitrates_as_tensor : tensor
Popularity of each node, used for performance and may be omitted.
nodes_missing_value_tracks_true : list of ints
-
For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this array.
This attribute may be left undefined and the default value is false (0) for all nodes.
+
For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this array.
This attribute may be left undefined and the default value is false (0) for all nodes.</dd>
nodes_modes : list of strings
The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
One of 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
nodes_nodeids : list of ints
@@ -1147,82 +1147,3 @@ This version of the operator has been available since version 3 of the 'ai.onnx.
The input type must be a tensor of a numeric type.
-## Version 4 of the 'ai.onnx.ml' operator set -### **ai.onnx.ml.LabelEncoder-4** - - Maps each element in the input tensor to another value.
- The mapping is determined by the two parallel attributes, 'keys_*' and - 'values_*' attribute. The i-th value in the specified 'keys_*' attribute - would be mapped to the i-th value in the specified 'values_*' attribute. It - implies that input's element type and the element type of the specified - 'keys_*' should be identical while the output type is identical to the - specified 'values_*' attribute. Note that the 'keys_*' and 'values_*' attributes - must have the same length. If an input element can not be found in the - specified 'keys_*' attribute, the 'default_*' that matches the specified - 'values_*' attribute may be used as its output value. The type of the 'default_*' - attribute must match the 'values_*' attribute chosen.
- Let's consider an example which maps a string tensor to an integer tensor. - Assume 'keys_strings' is ["Amy", "Sally"], 'values_int64s' is [5, 6], - and 'default_int64' is '-1'. The input ["Dori", "Amy", "Amy", "Sally", - "Sally"] would be mapped to [-1, 5, 5, 6, 6].<br>
- Since this operator is a one-to-one mapping, its input and output shapes - are the same. Notice that only one of 'keys_*'/'values_*' can be set.<br>
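A minimal Python sketch of this lookup semantics, assuming numpy (the `label_encode` helper is hypothetical and does not model the NaN bit-pattern rule noted below):

```python
import numpy as np

def label_encode(x, keys, values, default):
    # A repeated key keeps its last occurrence, as the spec requires.
    table = dict(zip(keys, values))
    # Unmatched elements fall back to the default value.
    return np.array([table.get(v, default) for v in x.ravel()]).reshape(x.shape)

x = np.array(["Dori", "Amy", "Amy", "Sally", "Sally"])
print(label_encode(x, ["Amy", "Sally"], [5, 6], -1))  # [-1  5  5  6  6]
```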
- Float keys with value 'NaN' match any input 'NaN' value regardless of bit - value. If a key is repeated, the last key takes precedence. - -#### Version - -This version of the operator has been available since version 4 of the 'ai.onnx.ml' operator set. - -#### Attributes - -
-
default_float : float (default is -0.0)
-
A float.
-
default_int64 : int (default is -1)
-
An integer.
-
default_string : string (default is _Unused)
-
A string.
-
default_tensor : tensor (default is {"_Unused"} if values_* has string type, {-1} if values_* has integral type, and {-0.f} if values_* has float type.)
-
A default tensor.
-
keys_floats : list of floats
-
A list of floats.
-
keys_int64s : list of ints
-
A list of ints.
-
keys_strings : list of strings
-
A list of strings.
-
keys_tensor : tensor
-
Keys encoded as a 1D tensor. One and only one of 'keys_*'s should be set.
-
values_floats : list of floats
-
A list of floats.
-
values_int64s : list of ints
-
A list of ints.
-
values_strings : list of strings
-
A list of strings.
-
values_tensor : tensor
-
Values encoded as a 1D tensor. One and only one of 'values_*'s should be set.
-
- -#### Inputs - -
-
X : T1
-
Input data. It must have the same element type as the keys_* attribute set.
-
- -#### Outputs - -
-
Y : T2
-
Output data. This tensor's element type is based on the values_* attribute set.
-
- -#### Type Constraints - -
-
T1 : tensor(string), tensor(int64), tensor(float), tensor(int32), tensor(int16), tensor(double)
-
The input type is a tensor of any shape.
-
T2 : tensor(string), tensor(int64), tensor(float), tensor(int32), tensor(int16), tensor(double)
-
Output type is determined by the specified 'values_*' attribute.
-
- diff --git a/docs/Changelog.md b/docs/Changelog.md index 8abaa0489b0..dae76d3bce0 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -1579,7 +1579,7 @@ This version of the operator has been available since version 1 of the default O
cond : B
-
Condition for the if. The tensor must contain a single element.
+
Condition for the if
#### Outputs (1 - ∞) @@ -3193,8 +3193,7 @@ This version of the operator has been available since version 1 of the default O Computes the L1 norm of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3237,8 +3236,7 @@ This version of the operator has been available since version 1 of the default O Computes the L2 norm of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3281,8 +3279,7 @@ This version of the operator has been available since version 1 of the default O Computes the log sum of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3325,8 +3322,7 @@ This version of the operator has been available since version 1 of the default O Computes the log sum exponent of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3369,8 +3365,7 @@ This version of the operator has been available since version 1 of the default O Computes the max of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3413,8 +3408,7 @@ This version of the operator has been available since version 1 of the default O Computes the mean of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. 
If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields undefined. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3457,8 +3451,7 @@ This version of the operator has been available since version 1 of the default O Computes the min of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3501,8 +3494,7 @@ This version of the operator has been available since version 1 of the default O Computes the product of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 1. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3545,8 +3537,7 @@ This version of the operator has been available since version 1 of the default O Computes the sum of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3589,8 +3580,7 @@ This version of the operator has been available since version 1 of the default O Computes the sum square of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -7160,7 +7150,7 @@ This version of the operator has been available since version 7 of the default O
X : T
Input tensor
slope : T
-
Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X
+
Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X</dd>
</dl>

#### Outputs

@@ -8556,7 +8546,7 @@ This version of the operator has been available since version 9 of the default O
   MaxUnpool essentially computes the partial inverse of the MaxPool op.
   The input information to this op is typically the output information from a MaxPool op. The first
   input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
-  from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
+  from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
   to the elements in the first input tensor X. Input tensor I is typically the second output of
   the MaxPool op. The third (optional) input is a tensor that specifies the output size of the unpooling operation.

@@ -8569,7 +8559,7 @@ This version of the operator has been available since version 9 of the default O
   known/predictable size.

   In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads,
-  which define the exact unpooling op. The attributes typically have the same values as the corresponding
+  which define the exact unpooling op. The attributes typically have the same values as the corresponding
   pooling op that the unpooling op is trying to invert.

#### Version

@@ -8627,7 +8617,7 @@ This version of the operator has been available since version 9 of the default O
<dl>
axes : list of ints (default is ['0', '2', '3'])
-
A list of integers, along which to reduce. The default is to calculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance.
+
A list of integers, along which to reduce. The default is to calculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance.</dd>
#### Inputs @@ -8716,7 +8706,7 @@ This version of the operator has been available since version 9 of the default O
indices : T1
Input tensor containing indices. The values must be non-negative integers. Any entries in the 'indices' input tensor with values outside the range [0, depth) will result in one-hot representation with all 'off_value' values in the output tensor.In case 'indices' is of non-integer type, the values will be casted to int64 before use.
depth : T2
-
Scalar or rank 1 tensor containing exactly one element, specifying the number of classes in one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [0, depth). In case 'depth' is of non-integer type, it will be casted to int64 before use.
+
Scalar specifying the number of classes in one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [0, depth). In case 'depth' is of non-integer type, it will be casted to int64 before use.
values : T3
Rank 1 tensor containing exactly two elements, in the format [off_value, on_value], where 'on_value' is the value used for filling locations specified in 'indices' input tensor, and 'off_value' is the value used for filling locations other than those specified in 'indices' input tensor.
@@ -8756,7 +8746,7 @@ This version of the operator has been available since version 9 of the default O
X (differentiable) : T
Input tensor
slope (differentiable) : T
-
Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X
+
Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X</dd>
#### Outputs @@ -10303,7 +10293,6 @@ This version of the operator has been deprecated since version 10 of the default Computes the indices of the max elements of the input tensor's element along the provided axis. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then the resulting tensor has the reduced dimension pruned. - The input tensor must not be empty. The type of the output tensor is integer. #### Version @@ -10345,7 +10334,6 @@ This version of the operator has been available since version 11 of the default Computes the indices of the min elements of the input tensor's element along the provided axis. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then the resulting tensor has the reduced dimension pruned. - The input tensor must not be empty. The type of the output tensor is integer. #### Version @@ -10402,17 +10390,11 @@ This version of the operator has been available since version 11 of the default * pad_shape[i] is sum of pads along axis i ``` - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` - or when ceil_mode is disabled: - ``` - VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) - SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor(input_spatial_shape[i] / strides_spatial_shape[i]) - ``` - And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -10789,7 +10771,7 @@ This version of the operator has been available since version 11 of the default
output_padding : list of ints
Additional elements added to the side with higher coordinate indices in the output. Each padding value in "output_padding" must be less than the corresponding stride/dilation dimension. By default, this attribute is a zero vector. Note that this attribute doesn't directly affect the computed output values. It only controls the selection of the computed values, so changing this attribute only adds or removes output elements. If "output_shape" is explicitly provided, "output_padding" does not contribute additional size to "output_shape" but participates in the computation of the needed padding amount. This is also called adjs or adjustment in some frameworks.
output_shape : list of ints
-
The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified pads values are ignored. See doc for details for equations to generate pads. Note that the output_shape attribute value should not include dimensions for batch size and channels, which are automatically inferred.
+
The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified pads values are ignored. See doc for details for equations to generate pads
pads : list of ints
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
strides : list of ints</dt>
</dl>

@@ -10981,14 +10963,14 @@ This version of the operator has been available since version 11 of the default

 ### **DynamicQuantizeLinear-11**

-  A Function to fuse calculation for Scale, Zero Point and FP32->8Bit conversion of FP32 Input data.
+  A Function to fuse calculation for Scale, Zero Point and FP32->8Bit conversion of FP32 Input data.
   Outputs Scale, ZeroPoint and Quantized Input for a given FP32 Input.

   Scale is calculated as:
   ```
-  y_scale = (maximum(0, max(x)) - minimum(0, min(x))) / (qmax - qmin)
+  y_scale = (max(x) - min(x))/(qmax - qmin)
   ```

-  * where qmax and qmin are max and min values for quantization range i.e. [0, 255] in case of uint8
+  * where qmax and qmin are max and min values for quantization range i.e. [0, 255] in case of uint8
   * data range is adjusted to include 0.

   Zero point is calculated as:
@@ -11525,7 +11507,7 @@ This version of the operator has been available since version 11 of the default
<dl>
cond : B
-
Condition for the if. The tensor must contain a single element.
+
Condition for the if</dd>
</dl>

#### Outputs (1 - ∞)

@@ -11902,7 +11884,7 @@ This version of the operator has been available since version 11 of the default
   MaxUnpool essentially computes the partial inverse of the MaxPool op.
   The input information to this op is typically the output information from a MaxPool op. The first
   input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
-  from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
+  from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
   to the elements in the first input tensor X. Input tensor I is typically the second output of
   the MaxPool op. The third (optional) input is a tensor that specifies the output size of the unpooling operation.

@@ -11915,7 +11897,7 @@ This version of the operator has been available since version 11 of the default
   known/predictable size.

   In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads,
-  which define the exact unpooling op. The attributes typically have the same values as the corresponding
+  which define the exact unpooling op. The attributes typically have the same values as the corresponding
   pooling op that the unpooling op is trying to invert.

#### Version

@@ -12045,7 +12027,7 @@ This version of the operator has been available since version 11 of the default
<dl>
indices (non-differentiable) : T1
Input tensor containing indices. Any entries in the 'indices' input tensor with values outside the range [-depth, depth-1] will result in one-hot representation with all 'off_value' values in the output tensor.In case 'indices' is of non-integer type, the values will be casted to int64 before use.
depth (non-differentiable) : T2
-
Scalar or Rank 1 tensor containing exactly one element, specifying the number of classes in one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [-depth, depth-1]. In case 'depth' is of non-integer type, it will be casted to int64 before use.
+
Scalar specifying the number of classes in one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [-depth, depth-1]. In case 'depth' is of non-integer type, it will be casted to int64 before use.
values (non-differentiable) : T3
Rank 1 tensor containing exactly two elements, in the format [off_value, on_value], where 'on_value' is the value used for filling locations specified in 'indices' input tensor, and 'off_value' is the value used for filling locations other than those specified in 'indices' input tensor.
@@ -12420,8 +12402,7 @@ This version of the operator has been available since version 11 of the default Computes the max of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -12507,8 +12488,7 @@ This version of the operator has been available since version 11 of the default Computes the min of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. + the resulted tensor have the reduced dimension pruned. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -12758,7 +12738,7 @@ x_original = length_resized > 1 ? start_x * (length_original - 1) + x_resized * Round takes one input Tensor and rounds the values, element-wise, meaning it finds the nearest integer for each value. - In case of halves, the rule is to round them to the nearest even integer. + In case of halfs, the rule is to round them to the nearest even integer. If input x is integral, +0, -0, NaN, or infinite, x itself is returned. The output tensor has the same shape and type as the input. @@ -13736,8 +13716,8 @@ This version of the operator has been available since version 11 of the default This operator returns the unique values or sliced unique subtensors of the input tensor and three optional outputs. The first output tensor 'Y' contains all unique values or subtensors of the input. - The second optional output tensor 'indices' contains indices of 'Y' elements' first occurrence in 'X'. - The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'. + The second optional output tensor 'indices' contains indices of 'Y' elements' first occurance in 'X'.. + The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'. ". The fourth optional output tensor 'counts' contains the count of each element of 'Y' in the input. Outputs are either sorted in ascending order or optionally in the order of the first occurrence of the values in the input. @@ -13855,7 +13835,7 @@ This version of the operator has been available since version 11 of the default
Y (non-differentiable) : T
A tensor of the same type as 'X' containing all the unique values or subtensors sliced along a provided 'axis' in 'X', either sorted or maintained in the same order they occur in input 'X'
indices (optional, non-differentiable) : tensor(int64)
-
A 1-D INT64 tensor containing indices of 'Y' elements' first occurrence in 'X'. When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. When 'axis' is not provided, it contains indices to values in the flattened input tensor.
+
A 1-D INT64 tensor containing indices of 'Y' elements' first occurrence in 'X'. When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. When 'axis' is not provided, it contains indices to values in the flattened input tensor.</dd>
inverse_indices (optional, non-differentiable) : tensor(int64)
A 1-D INT64 tensor containing, for elements of 'X', its corresponding indices in 'Y'. When 'axis' is provided, it contains indices to subtensors in output 'Y' on the 'axis'. When 'axis' is not provided, it contains indices to values in output 'Y'.
counts (optional, non-differentiable) : tensor(int64)
@@ -14195,7 +14175,7 @@ This version of the operator has been available since version 12 of the default An einsum of the form `term1, term2 -> output-term` produces an output tensor using the following equation ``` - output[output-term] = reduce-sum( input1[term1] * input2[term2] ) + output[output-term] = reduce-sum( input1[term1] * input2[term] ) ``` where the reduce-sum performs a summation over all the indices occurring in the input terms (term1, term2) @@ -14487,28 +14467,21 @@ This version of the operator has been available since version 12 of the default the tensor according to kernel sizes, stride sizes, and pad lengths. max pooling consisting of computing the max on all values of a subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape is calculated differently - depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. - With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + data into the output tensor Y for further processing. The output spatial shape will be following: ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` or ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. + if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` - or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): - ``` - VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 - SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 - ``` And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -14841,7 +14814,7 @@ This version of the operator has been available since version 12 of the default shape(labels): (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk), with K >= 1 in case of K-dimensional loss. 
- The loss for one sample, l_i, can calculated as follows: + The loss for one sample, l_i, can caculated as follows: l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of classes. or l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk] * weights[c], if 'weights' is provided. @@ -14906,7 +14879,7 @@ This version of the operator has been available since version 12 of the default ### **Abs-13** Absolute takes one input data (Tensor) and produces one output data - (Tensor) where absolute value, y = abs(x), is applied to + (Tensor) where the absolute is, y = abs(x), is applied to the tensor elementwise. #### Version @@ -15894,50 +15867,57 @@ This version of the operator has been available since version 13 of the default This operator is the inverse of `ScatterND`. - **Example 1** + `Example 1` - ``` - batch_dims = 0 - data = [[0,1],[2,3]] # data_shape = [2, 2] - indices = [[0,0],[1,1]] # indices_shape = [2, 2] - output = [0,3] # output_shape = [2] - ``` + batch_dims = 0 - **Example 2** + data = [[0,1],[2,3]] # data_shape = [2, 2] - ``` - batch_dims = 0 - data = [[0,1],[2,3]] # data_shape = [2, 2] - indices = [[1],[0]] # indices_shape = [2, 1] - output = [[2,3],[0,1]] # output_shape = [2, 2] - ``` + indices = [[0,0],[1,1]] # indices_shape = [2, 2] - **Example 3** + output = [0,3] # output_shape = [2] - ``` - batch_dims = 0 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] - indices = [[0,1],[1,0]] # indices_shape = [2, 2] - output = [[2,3],[4,5]] # output_shape = [2, 2] - ``` + `Example 2` - **Example 4** + batch_dims = 0 - ``` - batch_dims = 0 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] - indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2] - output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2] - ``` + data = [[0,1],[2,3]] # data_shape = [2, 2] + + indices = [[1],[0]] # indices_shape = [2, 1] + + output = [[2,3],[0,1]] # output_shape = [2, 2] + + `Example 3` + + batch_dims = 0 + + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + + indices = [[0,1],[1,0]] # indices_shape = [2, 2] + + output = [[2,3],[4,5]] # output_shape = [2, 2] + + `Example 4` + + batch_dims = 0 + + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + + indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2] + + output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2] + + `Example 5` + + batch_dims = 1 + + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + + indices = [[1],[0]] # indices_shape = [2, 1] + + output = [[2,3],[4,5]] # output_shape = [2, 2] - **Example 5** - ``` - batch_dims = 1 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] - indices = [[1],[0]] # indices_shape = [2, 1] - output = [[2,3],[4,5]] # output_shape = [2, 2] - ``` #### Version @@ -16162,7 +16142,7 @@ This version of the operator has been available since version 13 of the default
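A small numpy sketch of the batch_dims = 0 GatherND lookups shown in the examples above (`gather_nd` is a hypothetical helper, not the ONNX implementation):

```python
import numpy as np

def gather_nd(data, indices):
    # batch_dims = 0: each innermost row of `indices` is a partial index into `data`.
    k = indices.shape[-1]
    flat = [data[tuple(idx)] for idx in indices.reshape(-1, k)]
    return np.array(flat).reshape(indices.shape[:-1] + data.shape[k:])

data = np.array([[0, 1], [2, 3]])
print(gather_nd(data, np.array([[0, 0], [1, 1]])))  # [0 3]          (Example 1)
print(gather_nd(data, np.array([[1], [0]])))        # [[2 3] [0 1]]  (Example 2)
```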
cond : B
-
Condition for the if. The tensor must contain a single element.
+
Condition for the if
#### Outputs (1 - ∞) @@ -16658,7 +16638,7 @@ This version of the operator has been available since version 13 of the default
axes : list of ints (default is ['0', '2', '3'])
-
A list of integers, along which to reduce. The default is to calculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance.
+
A list of integers, along which to reduce. The default is to calculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance.</dd>
#### Inputs @@ -17234,13 +17214,12 @@ This version of the operator has been available since version 13 of the default ### **ReduceL1-13** Computes the L1 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17273,19 +17252,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceL2-13** Computes the L2 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17318,19 +17296,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceLogSum-13** Computes the log sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17363,19 +17340,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceLogSumExp-13** Computes the log sum exponent of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17408,19 +17384,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceMax-13** Computes the max of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17453,19 +17428,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision and 8 bit numeric tensors.
### **ReduceMean-13** Computes the mean of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields undefined. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17498,19 +17472,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceMin-13** Computes the min of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17543,19 +17516,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision and 8 bit numeric tensors.
### **ReduceProd-13** Computes the product of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 1. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17588,19 +17560,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceSum-13** Computes the sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17635,19 +17606,18 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceSumSquare-13** Computes the sum square of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -17680,7 +17650,7 @@ This version of the operator has been available since version 13 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.</dd>
</dl>

### **Relu-13**

@@ -18138,16 +18108,16 @@ This version of the operator has been available since version 13 of the default
   Slice uses the `starts`, `ends`, `axes` and `steps` inputs to select a sub-tensor of its input `data` tensor.

-  An effective `starts[i]`, `ends[i]`, and `steps[i]` must be computed for each `i`
+  An effective `start[i]`, `end[i]`, and `step[i]` must be computed for each `i`
   in `[0, ... r-1]` where `r = rank(input)` as follows:

   If `axes` are omitted, they are set to `[0, ..., r-1]`.

   If `steps` are omitted, they are set to `[1, ..., 1]` of length `len(starts)`

-  The effective values are initialized as `start[i] = 0`, `ends[i] = dims[i]` where
-  `dims` are the dimensions of `input` and `steps[i] = 1`.
+  The effective values are initialized as `start[i] = 0`, `end[i] = dims[i]` where
+  `dims` are the dimensions of `input` and `step[i] = 1`.

-  All negative elements of `axes` are made non-negative by adding `r` to them, where
+  All negative elements of `axes` are made non-negative by adding `r` to them, where
   `r =rank(input)`.

   All negative values in `starts[i]` and `ends[i]` have `dims[axes[i]]` added to them,

@@ -18157,10 +18127,10 @@ This version of the operator has been available since version 13 of the default
   The clamping for the adjusted `ends[i]` depends on the sign of `steps[i]` and must
   accommodate copying 0 through `dims[axes[i]]` elements, so for positive stepping
-  `ends[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it
+  `end[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it
   is clamped to `[-1, dims[axes[i]]-1]`.

-  Finally, `steps[axes[i]] = steps[i]`.
+  Finally, `step[axes[i]] = steps[i]`.

   For slicing to the end of a dimension with unknown size, it is recommended to pass
   in `INT_MAX` when slicing forward and 'INT_MIN' when slicing backward.

@@ -18291,7 +18261,7 @@ from the back. Accepted range is [-r, r-1] where r = rank(input).
   * shape(labels): (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk),
     with K >= 1 in case of K-dimensional loss.

-  The loss for one sample, l_i, can calculated as follows:
+  The loss for one sample, l_i, can calculated as follows:
   ```
   l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of classes.
   ```

@@ -20057,7 +20027,7 @@ This version of the operator has been available since version 16 of the default
<dl>
cond : B
-
Condition for the if. The tensor must contain a single element.
+
Condition for the if
#### Outputs (1 - ∞) @@ -20345,7 +20315,7 @@ This version of the operator has been available since version 16 of the default
X (differentiable) : T
Input tensor
slope (differentiable) : T
-
Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X
+
Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X</dd>
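A numpy sketch of PReLU with a unidirectionally broadcast slope (a slope of shape [3] against an X of shape [2, 3]):

```python
import numpy as np

x = np.array([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0]])
slope = np.array([0.1, 0.2, 0.3])    # smaller shape, broadcastable to x
y = np.where(x < 0, slope * x, x)    # PReLU(x) = x if x >= 0 else slope * x
# y -> [[-0.1  2.  -0.9], [ 4.  -1.   6. ]]
```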
#### Outputs @@ -20891,27 +20861,27 @@ This version of the operator has been available since version 17 of the default
axis : int (default is 1)
-
The axis on which to perform the DFT. By default this value is set to 1, which corresponds to the first dimension after the batch index. Negative value means counting dimensions from the back. Accepted range is $[-r, -2] \cup [0, r-2]$ where `r = rank(input)`. The last dimension is for representing complex numbers and thus is an invalid axis.
+
The axis on which to perform the DFT. By default this value is set to 1, which corresponds to the first dimension after the batch index.
inverse : int (default is 0)
Whether to perform the inverse discrete fourier transform. By default this value is set to 0, which corresponds to false.
onesided : int (default is 0)
-
If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m, n_fft-w]*. Note if the input or window tensors are complex, then onesided output is not possible. Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT). When invoked with real or complex valued input, the default value is 0. Values can be 0 or 1.
+
If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m, n_fft-w]*. Note if the input or window tensors are complex, then onesided output is not possible. Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT). When invoked with real or complex valued input, the default value is 0. Values can be 0 or 1.</dd>
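The onesided behavior matches a real-valued FFT, which keeps floor(n_fft/2) + 1 frequency bins thanks to conjugate symmetry; for example, assuming numpy:

```python
import numpy as np

signal = np.random.rand(8).astype(np.float32)   # n_fft = 8
spectrum = np.fft.rfft(signal)                  # one-sided real FFT
print(spectrum.shape)                           # (5,) == floor(8/2) + 1 bins
```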
#### Inputs (1 - 2)
input (non-differentiable) : T1
-
For real input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]. For complex input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. The first dimension is the batch dimension. The following N dimensions correspond to the signal's dimensions. The final dimension represents the real and imaginary parts of the value in that order.
+
For real input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]. For complex input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. The first dimension is the batch dimension. The following N dimensions correspond to the signal's dimensions. The final dimension represents the real and imaginary parts of the value in that order.</dd>
dft_length (optional, non-differentiable) : T2
-
The length of the signal as a scalar. If greater than the axis dimension, the signal will be zero-padded up to dft_length. If less than the axis dimension, only the first dft_length values will be used as the signal. It's an optional value.
+
The length of the signal. If greater than the axis dimension, the signal will be zero-padded up to dft_length. If less than the axis dimension, only the first dft_length values will be used as the signal. It's an optional value.</dd>
#### Outputs
output : T1
-
The Fourier Transform of the input vector. If onesided is 0, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. If axis=1 and onesided is 1, the following shape is expected: [batch_idx][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]. If axis=2 and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][floor(signal_dim2/2)+1]...[signal_dimN][2]. If axis=N and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]. The signal_dim at the specified axis is equal to the dft_length.
+
The Fourier Transform of the input vector. If onesided is 0, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. If axis=1 and onesided is 1, the following shape is expected: [batch_idx][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]. If axis=2 and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][floor(signal_dim2/2)+1]...[signal_dimN][2]. If axis=N and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]. The signal_dim at the specified axis is equal to the dft_length.
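As a sanity check on the shapes above, a hedged numpy sketch for axis=1 with onesided=1, again treating `rfft` as a stand-in and packing real/imaginary parts into a trailing dimension of size 2:

```python
import numpy as np

batch, n = 2, 8
x = np.random.randn(batch, n)                 # real signal along axis 1
y = np.fft.rfft(x, axis=1)                    # complex result, shape (2, 5)
packed = np.stack([y.real, y.imag], axis=-1)  # [...][2] real/imag layout
assert packed.shape == (batch, n // 2 + 1, 2)

# dft_length semantics: zero-pad (or truncate) along the axis first.
assert np.allclose(
    np.fft.fft(x, n=16, axis=1),
    np.fft.fft(np.pad(x, ((0, 0), (0, 8))), axis=1),
)
```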
#### Type Constraints
@@ -21053,7 +21023,7 @@ This version of the operator has been available since version 17 of the default
axis : int (default is -1)
-
The first normalization dimension. If rank(X) is r, axis' allowed range is [-r, r). Negative value means counting dimensions from the back.
+
The first normalization dimension. If rank(X) is r, axis' allowed range is [-r, r). Negative value means counting dimensions from the back.
epsilon : float (default is 1e-05)
The epsilon value to use to avoid division by zero.
stash_type : int (default is 1)
@@ -21823,13 +21793,12 @@ This version of the operator has been available since version 18 of the default ### **ReduceL1-18** Computes the L1 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -21864,19 +21833,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
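A minimal numpy analogy of the `keepdims` contract described above (ONNX defaults keepdims to 1, numpy to False):

```python
import numpy as np

data = np.arange(6, dtype=np.float32).reshape(2, 3)
keep = np.sum(np.abs(data), axis=1, keepdims=True)   # rank preserved: (2, 1)
drop = np.sum(np.abs(data), axis=1, keepdims=False)  # axis pruned:    (2,)
assert keep.shape == (2, 1) and drop.shape == (2,)
```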
### **ReduceL2-18** Computes the L2 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -21911,19 +21879,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceLogSum-18** Computes the log sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -21958,19 +21925,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceLogSumExp-18** Computes the log sum exponent of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22005,19 +21971,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceMax-18** Computes the max of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22052,19 +22017,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision and 8 bit numeric tensors.
### **ReduceMean-18** Computes the mean of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields undefined. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22099,19 +22063,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceMin-18** Computes the min of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22146,19 +22109,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision and 8 bit numeric tensors.
### **ReduceProd-18** Computes the product of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 1. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22193,19 +22155,18 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **ReduceSumSquare-18** Computes the sum square of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22240,7 +22201,7 @@ This version of the operator has been available since version 18 of the default
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
### **Resize-18** @@ -22371,8 +22332,8 @@ Note: `round_int` stands for computing the nearest integer value, rounding halfw ``` When `reduction` is set to some reduction function `f`, the update corresponding to the [i][j] entry is performed as below: ``` - output[indices[i][j]][j] = f(output[indices[i][j]][j], updates[i][j]) if axis = 0, - output[i][indices[i][j]] = f(output[i][indices[i][j]], updates[i][j]) if axis = 1, + output[indices[i][j]][j] += f(output[indices[i][j]][j], updates[i][j]) if axis = 0, + output[i][indices[i][j]] += f(output[i][indices[i][j]], updates[i][j]) if axis = 1, ``` where the `f` is `+`, `*`, `max` or `min` as specified. @@ -22615,28 +22576,21 @@ This version of the operator has been available since version 18 of the default the tensor according to kernel sizes, stride sizes, and pad lengths. average pooling consisting of computing the average on all values of a subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape is calculated differently - depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. - With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + data into the output tensor Y for further processing. The output spatial shape will be following: ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` or ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. + if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` - or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): - ``` - VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 - SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 - ``` And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -22980,7 +22934,7 @@ This version of the operator has been available since version 19 of the default
T1 : tensor(int8), tensor(uint8), tensor(int32), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
Constrain 'x_zero_point' and 'x' to 8-bit integer or float, or 32-bit integer tensor.
T2 : tensor(float), tensor(float16), tensor(bfloat16)
-
'x_scale' determines the output type.
+
'x_scale' determines the output type.
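For orientation, a sketch of the dequantization arithmetic this constraint refers to, y = (x - x_zero_point) * x_scale, under which the output inherits the floating-point type of the scale (a simplified reading, not a full implementation):

```python
import numpy as np

x = np.array([0, 128, 255], dtype=np.uint8)
x_scale = np.float32(0.05)
x_zero_point = np.uint8(128)
# Widen before subtracting to avoid unsigned wrap-around.
y = (x.astype(np.int32) - np.int32(x_zero_point)).astype(np.float32) * x_scale
print(y)  # [-6.4   0.    6.35], float32 because the scale is float32
```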
### **Equal-19** @@ -23069,7 +23023,7 @@ This version of the operator has been available since version 19 of the default
cond : B
-
Condition for the if. The tensor must contain a single element.
+
Condition for the if.
#### Outputs (1 - ∞) @@ -23917,620 +23871,6 @@ This version of the operator has been available since version 19 of the default
Constrain output to int64 tensor, which should be a scalar though.
-## Version 20 of the default ONNX operator set -### **AffineGrid-20** - - Generates a 2D or 3D flow field (sampling grid), given a batch of affine matrices theta - (https://pytorch.org/docs/stable/generated/torch.nn.functional.affine_grid.html). - An affine matrix `theta` is applied to a position tensor represented in its homogeneous expression. Here is an example in 3D: - ``` - [r00, r01, r02, t0] [x] [x'] - [r10, r11, r12, t1] * [y] = [y'] - [r20, r21, r22, t2] [z] [z'] - [0, 0, 0, 1 ] [1] [1 ] - ``` - where `(x, y, z)` is the position in the original space, `(x', y', z')` is the position in the output space. - The last row is always `[0, 0, 0, 1]` and is not stored in the affine matrix. Therefore we have `theta` of shape `(N, 2, 3)` for 2D or `(N, 3, 4)` for 3D. - - Input `size` is used to define grid of positions evenly spaced in the original 2D or 3D space, with dimensions ranging from `-1` to `1`. - The output `grid` contains positions in the output space. - - When `align_corners=1`, consider `-1` and `1` to refer to the centers of the corner pixels (mark `v` in illustration). - ``` - v v v v - |-------------------|------------------| - -1 0 1 - ``` - When `align_corners=0`, consider `-1` and `1` to refer to the outer edge of the corner pixels. - ``` - v v v v - |------------------|-------------------| - -1 0 1 - ``` - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
align_corners : int (default is 0)
-
If align_corners=1, consider -1 and 1 to refer to the centers of the corner pixels. If align_corners=0, consider -1 and 1 to refer to the outer edge of the corner pixels.
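A hedged 1-D sketch of the two conventions (grid_coords is an illustrative helper, not part of the operator):

```python
import numpy as np

def grid_coords(w_out, align_corners):
    if align_corners:
        # -1 and 1 are the centers of the outermost pixels.
        return np.linspace(-1.0, 1.0, w_out)
    # -1 and 1 are the outer edges; pixel centers sit half a step inside.
    step = 2.0 / w_out
    return np.linspace(-1.0 + step / 2, 1.0 - step / 2, w_out)

print(grid_coords(4, 1))  # [-1.     -0.3333  0.3333  1.    ]
print(grid_coords(4, 0))  # [-0.75   -0.25    0.25    0.75  ]
```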
-
- -#### Inputs - -
-
theta (non-differentiable) : T1
-
input batch of affine matrices with shape (N, 2, 3) for 2D or (N, 3, 4) for 3D
-
size (non-differentiable) : T2
-
the target output image size (N, C, H, W) for 2D or (N, C, D, H, W) for 3D
-
- -#### Outputs - -
-
grid (differentiable) : T1
-
output tensor of shape (N, H, W, 2) of 2D sample coordinates or (N, D, H, W, 3) of 3D sample coordinates.
-
- -#### Type Constraints - -
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
-
Constrain grid types to float tensors.
-
T2 : tensor(int64)
-
Constrain size's type to int64 tensors.
-
- -### **ConstantOfShape-20** - - Generate a tensor with given value and shape. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
value : tensor
-
(Optional) The value of the output elements. Should be a one-element tensor. If not specified, it defaults to a tensor of value 0 and datatype float32.
-
- -#### Inputs - -
-
input : T1
-
1D tensor. The shape of the expected output tensor. If an empty tensor is given, the output would be a scalar. All values must be >= 0.
-
- -#### Outputs - -
-
output : T2
-
Output tensor of shape specified by 'input'. If attribute 'value' is specified, the value and datatype of the output tensor are taken from 'value'. If attribute 'value' is not specified, the value in the output defaults to 0, and the datatype defaults to float32.
-
- -#### Type Constraints - -
-
T1 : tensor(int64)
-
Constrain input types.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
Constrain output types to be numerics.
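A small sketch of how the 'value' attribute and the shape input interact, using onnx.helper (the fill value 1.5 is illustrative):

```python
from onnx import TensorProto, helper

node = helper.make_node(
    "ConstantOfShape",
    inputs=["shape"],
    outputs=["y"],
    value=helper.make_tensor("value", TensorProto.FLOAT, [1], [1.5]),
)
# With input shape = [2, 3], the output equals
# np.full((2, 3), 1.5, dtype=np.float32); without the 'value'
# attribute it would be np.zeros((2, 3), dtype=np.float32).
```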
-
- -### **DFT-20** - - Computes the discrete Fourier Transform (DFT) of the input. - - Assuming the input has shape `[M, N]`, where `N` is the dimension over which the - DFT is computed and `M` denotes the conceptual "all other dimensions," - the DFT `y[m, k]` of shape `[M, N]` is defined as - - $$y[m, k] = \sum_{n=0}^{N-1} e^{-2 \pi j \frac{k n}{N} } x[m, n] ,$$ - - and the inverse transform is defined as - - $$x[m, n] = \frac{1}{N} \sum_{k=0}^{N-1} e^{2 \pi j \frac{k n}{N} } y[m, k] ,$$ - - where $j$ is the imaginary unit. - - The actual shape of the output is specified in the "output" section. - - Reference: https://docs.scipy.org/doc/scipy/tutorial/fft.html - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
inverse : int (default is 0)
-
Whether to perform the inverse discrete Fourier Transform. Default is 0, which corresponds to `false`.
-
onesided : int (default is 0)
-
If `onesided` is `1` and input is real, only values for `k` in `[0, 1, 2, ..., floor(n_fft/2) + 1]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., `X[m, k] = X[m, n_fft-k]*`, where `m` denotes "all other dimensions" DFT was not applied on. If the input tensor is complex, onesided output is not possible. Value can be `0` or `1`. Default is `0`.
-
- -#### Inputs (1 - 3) - -
-
input (non-differentiable) : T1
-
For real input, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][1]`. For complex input, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][2]`. The final dimension represents the real and imaginary parts of the value in that order.
-
dft_length (optional, non-differentiable) : T2
-
The length of the signal as a scalar. If greater than the axis dimension, the signal will be zero-padded up to `dft_length`. If less than the axis dimension, only the first `dft_length` values will be used as the signal.
-
axis (optional, non-differentiable) : tensor(int64)
-
The axis as a scalar on which to perform the DFT. Default is `-2` (last signal axis). Negative value means counting dimensions from the back. Accepted range is $[-r, -2] \cup [0, r-2]$ where `r = rank(input)`. The last dimension is for representing complex numbers and thus is an invalid axis.
-
- -#### Outputs - -
-
output : T1
-
The Fourier Transform of the input vector. If `onesided` is `0`, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][2]`. If `axis=0` and `onesided` is `1`, the following shape is expected: `[floor(signal_dim0/2)+1][signal_dim1][signal_dim2]...[signal_dimN][2]`. If `axis=1` and `onesided` is `1`, the following shape is expected: `[signal_dim0][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]`. If `axis=N` and `onesided` is `1`, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]`. The `signal_dim` at the specified `axis` is equal to the `dft_length`.
-
- -#### Type Constraints - -
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
-
Constrain input and output types to float tensors.
-
T2 : tensor(int32), tensor(int64)
-
Constrain scalar length types to integers.
-
- -### **Gelu-20** - - Gelu takes one input data (Tensor) and produces one - output data (Tensor) where the gaussian error linear units function, - $y = 0.5 * x * (1 + erf(x/sqrt(2)))$ is applied to the tensor elementwise. - If the attribute "approximate" is set to "tanh", the function estimation, - $y = 0.5 * x * (1 + Tanh(sqrt(2/\pi) * (x + 0.044715 * x^3)))$ is used and applied - to the tensor elementwise. - - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
approximate : string (default is none)
-
Gelu approximation algorithm: `"tanh"`, `"none"` (default). `"none"`: do not use approximation. `"tanh"`: use tanh approximation.
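The exact and approximate forms quoted above track each other closely; a hedged numpy comparison:

```python
import math
import numpy as np

x = np.linspace(-3.0, 3.0, 7)
erf = np.vectorize(math.erf)
exact = 0.5 * x * (1.0 + erf(x / math.sqrt(2.0)))
tanh = 0.5 * x * (1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))
assert np.allclose(exact, tanh, atol=1e-2)
```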
-
- -#### Inputs - -
-
X (differentiable) : T
-
Input tensor
-
- -#### Outputs - -
-
Y (differentiable) : T
-
Output tensor
-
- -#### Type Constraints - -
-
T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to float tensors.
-
- -### **GridSample-20** - - Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`. - For spatial input `X` with shape (N, C, H, W), the `grid` will have shape (N, H_out, W_out, 2), - the output `Y` will have shape (N, C, H_out, W_out). For volumetric input `X` with shape (N, C, D, H, W), - the `grid` will have shape (N, D_out, H_out, W_out, 3), the output `Y` will have shape (N, C, D_out, H_out, W_out). - More generally, for an input `X` of rank r+2 with shape (N, C, d1, d2, ..., dr), - the `grid` will have shape (N, D1_out, D2_out, ..., Dr_out, r), the output `Y` will have shape (N, C, D1_out, D2_out, ..., Dr_out). - - The tensor `X` contains values at centers of square pixels (voxels, etc) locations such as (n, c, d1_in, d2_in, ..., dr_in). - The (n, d1_out, d2_out, ..., dr_out, :) values from the tensor `grid` are the normalized positions for interpolating the values - at the (n, c, d1_out, d2_out, ..., dr_out) locations from the output tensor `Y` using a specified interpolation method (the mode) - and a padding mode (for `grid` positions falling outside the 2-dimensional image). - - For example, the values in `grid[n, h_out, w_out, :]` are size-2 vectors specifying normalized positions in the 2-dimensional space of `X`. - They are used to interpolate output values of `Y[n, c, h_out, w_out]`. - - The GridSample operator is often used in doing grid generator and sampler in the - [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). - See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html). - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
align_corners : int (default is 0)
-
If align_corners=1, the extrema (-1 and 1) are considered as referring to the center points of the input's corner pixels (voxels, etc.). If align_corners=0, they are instead considered as referring to the corner points of the input's corner pixels (voxels, etc.), making the sampling more resolution agnostic.
-
mode : string (default is linear)
-
Three interpolation modes: linear (default), nearest and cubic. The "linear" mode includes linear and N-linear interpolation modes depending on the number of spatial dimensions of the input tensor (i.e. linear for 1 spatial dimension, bilinear for 2 spatial dimensions, etc.). The "cubic" mode also includes N-cubic interpolation modes following the same rules. The "nearest" mode rounds to the nearest even index when the sampling point falls halfway between two indices.
-
padding_mode : string (default is zeros)
-
Support padding modes for outside grid values: `zeros` (default), `border`, `reflection`. zeros: use 0 for out-of-bound grid locations, border: use border values for out-of-bound grid locations, reflection: use values at locations reflected by the border for out-of-bound grid locations. If index 0 represents the margin pixel, the reflected value at index -1 will be the same as the value at index 1. For locations far away from the border, the value keeps being reflected until it is in bounds. If pixel location x = -3.5 reflects by border -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' = 0.5.
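The reflection arithmetic in the last sentence can be traced with a tiny helper (illustrative only, for the normalized 1-D range [-1, 1]):

```python
def reflect(x, lo=-1.0, hi=1.0):
    # Keep reflecting across whichever border was crossed until in bounds.
    while x < lo or x > hi:
        x = lo + (lo - x) if x < lo else hi - (x - hi)
    return x

print(reflect(-3.5))  # -3.5 -> 1.5 -> 0.5, matching the worked example above
```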
-
- -#### Inputs - -
-
X (differentiable) : T1
-
Input tensor of rank r+2 that has shape (N, C, D1, D2, ..., Dr), where N is the batch size, C is the number of channels, D1, D2, ..., Dr are the spatial dimensions.
-
grid (non-differentiable) : T2
-
Input offset of shape (N, D1_out, D2_out, ..., Dr_out, r), where D1_out, D2_out, ..., Dr_out are the spatial dimensions of the grid and output, and r is the number of spatial dimensions. Grid specifies the sampling locations normalized by the input spatial dimensions. Therefore, it should have most values in the range of [-1, 1]. If the grid has values outside the range of [-1, 1], the corresponding outputs will be handled as defined by padding_mode. Following computer vision convention, the coordinates in the length-r location vector are listed from the innermost tensor dimension to the outermost, the opposite of regular tensor indexing.
-
- -#### Outputs - -
-
Y (differentiable) : T1
-
Output tensor of rank r+2 that has shape (N, C, D1_out, D2_out, ..., Dr_out) of the sampled values. For integer input types, intermediate values are computed as floating point and cast to integer at the end.
-
- -#### Type Constraints - -
-
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
-
Constrain input `X` and output `Y` types to all tensor types.
-
T2 : tensor(float16), tensor(float), tensor(double)
-
Constrain grid types to float tensors.
-
- -### **ImageDecoder-20** - - Loads and decodes and image from a file. If it can't decode for any reason (e.g. corrupted encoded - stream, invalid format, it will return an empty matrix). - The following image formats are supported: - * BMP - * JPEG (note: Lossless JPEG support is optional) - * JPEG2000 - * TIFF - * PNG - * WebP - * Portable image format (PBM, PGM, PPM, PXM, PNM) - Decoded images follow a channel-last layout: (Height, Width, Channels). - **JPEG chroma upsampling method:** - When upsampling the chroma components by a factor of 2, the pixels are linearly interpolated so that the - centers of the output pixels are 1/4 and 3/4 of the way between input pixel centers. - When rounding, 0.5 is rounded down and up at alternative pixels locations to prevent bias towards - larger values (ordered dither pattern). - Considering adjacent input pixels A, B, and C, B is upsampled to pixels B0 and B1 so that - ``` - B0 = round_half_down((1/4) * A + (3/4) * B) - B1 = round_half_up((3/4) * B + (1/4) * C) - ``` - This method, is the default chroma upsampling method in the well-established libjpeg-turbo library, - also referred as "smooth" or "fancy" upsampling. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
pixel_format : string (default is RGB)
-
Pixel format. Can be one of "RGB", "BGR", or "Grayscale".
-
- -#### Inputs - -
-
encoded_stream (non-differentiable) : T1
-
Encoded stream
-
- -#### Outputs - -
-
image (non-differentiable) : T2
-
Decoded image
-
- -#### Type Constraints - -
-
T1 : tensor(uint8)
-
Constrain input types to 8-bit unsigned integer tensor.
-
T2 : tensor(uint8)
-
Constrain output types to 8-bit unsigned integer tensor.
-
- -### **IsInf-20** - - Map infinity to true and other values to false. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
detect_negative : int (default is 1)
-
(Optional) Whether to map negative infinity to true. Default to 1 so that negative infinity induces true. Set this attribute to 0 if negative infinity should be mapped to false.
-
detect_positive : int (default is 1)
-
(Optional) Whether to map positive infinity to true. Default to 1 so that positive infinity induces true. Set this attribute to 0 if positive infinity should be mapped to false.
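A numpy sketch of how the two switches combine (is_inf is an illustrative helper, not the runtime implementation):

```python
import numpy as np

def is_inf(x, detect_negative=1, detect_positive=1):
    out = np.zeros(x.shape, dtype=bool)
    if detect_positive:
        out |= x == np.inf
    if detect_negative:
        out |= x == -np.inf
    return out

x = np.array([-np.inf, 0.0, np.nan, np.inf])
print(is_inf(x))                     # [ True False False  True]
print(is_inf(x, detect_negative=0))  # [False False False  True]
```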
-
- -#### Inputs - -
-
X (non-differentiable) : T1
-
input
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
output
-
- -#### Type Constraints - -
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
Constrain input types to float tensors.
-
T2 : tensor(bool)
-
Constrain output types to boolean tensors.
-
- -### **IsNaN-20** - - Returns which elements of the input are NaN. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Inputs - -
-
X (non-differentiable) : T1
-
input
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
output
-
- -#### Type Constraints - -
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
Constrain input types to float tensors.
-
T2 : tensor(bool)
-
Constrain output types to boolean tensors.
-
- -### **ReduceMax-20** - - Computes the max of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then - the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. - - - If the input data type is Boolean, the comparison should consider `False < True`. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
keepdims : int (default is 1)
-
Keep the reduced dimension or not; default 1 means keep the reduced dimension.
-
noop_with_empty_axes : int (default is 0)
-
Defines behavior if 'axes' is empty. Default behavior with 'false' is to reduce all axes. When axes is empty and this attribute is set to true, the input tensor will not be reduced, and the output tensor would be equivalent to the input tensor.
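In numpy terms, the two settings correspond roughly to the following (hedged sketch):

```python
import numpy as np

data = np.array([[1.0, 5.0], [3.0, 2.0]], dtype=np.float32)
# noop_with_empty_axes = 0 (default): empty 'axes' reduces over all axes.
print(np.max(data, keepdims=True))  # [[5.]]
# noop_with_empty_axes = 1: empty 'axes' acts as Identity.
print(data)
```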
-
- -#### Inputs (1 - 2) - -
-
data (differentiable) : T
-
An input tensor.
-
axes (optional, non-differentiable) : tensor(int64)
-
Optional input list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor if 'noop_with_empty_axes' is false, else act as an Identity op when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] where r = rank(data).
-
- -#### Outputs - -
-
reduced (differentiable) : T
-
Reduced output tensor.
-
- -#### Type Constraints - -
-
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8), tensor(bool)
-
Constrain input and output types to numeric and Boolean tensors.
-
- -### **ReduceMin-20** - - Computes the min of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then - the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. - - - If the input data type is Boolean, the comparison should consider `False < True`. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
keepdims : int (default is 1)
-
Keep the reduced dimension or not; default 1 means keep the reduced dimension.
-
noop_with_empty_axes : int (default is 0)
-
Defines behavior if 'axes' is empty. Default behavior with 'false' is to reduce all axes. When axes is empty and this attribute is set to true, the input tensor will not be reduced, and the output tensor would be equivalent to the input tensor.
-
- -#### Inputs (1 - 2) - -
-
data (differentiable) : T
-
An input tensor.
-
axes (optional, non-differentiable) : tensor(int64)
-
Optional input list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor if 'noop_with_empty_axes' is false, else act as an Identity op when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] where r = rank(data).
-
- -#### Outputs - -
-
reduced (differentiable) : T
-
Reduced output tensor.
-
- -#### Type Constraints - -
-
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8), tensor(bool)
-
Constrain input and output types to numeric and Boolean tensors.
-
- -### **RegexFullMatch-20** - - RegexFullMatch performs a full regex match on each element of the input tensor. If an element fully matches the regex pattern specified as an attribute, the corresponding element in the output is True and it is False otherwise. [RE2](https://github.com/google/re2/wiki/Syntax) regex syntax is used. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
pattern : string
-
Regex pattern to match on. This must be valid RE2 syntax.
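As an analogy only, Python's re.fullmatch has similar full-match semantics (note the operator itself uses RE2 syntax, which differs from Python's re in some constructs):

```python
import re

pattern = re.compile(r"www\.[\w.-]+\.\w+")
inputs = ["www.onnx.ai", "www.example.com", "onnx.ai"]
print([pattern.fullmatch(s) is not None for s in inputs])  # [True, True, False]
```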
-
- -#### Inputs - -
-
X (non-differentiable) : T1
-
Tensor with strings to match on.
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
Tensor of bools indicating if each input string fully matches the regex pattern specified.
-
- -#### Type Constraints - -
-
T1 : tensor(string)
-
Inputs must be UTF-8 strings
-
T2 : tensor(bool)
-
Outputs are bools and are True where there is a full regex match and False otherwise.
-
- -### **StringConcat-20** - - StringConcat concatenates string tensors elementwise (with NumPy-style broadcasting support) - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Inputs - -
-
X (non-differentiable) : T
-
Tensor to prepend in concatenation
-
Y (non-differentiable) : T
-
Tensor to append in concatenation
-
- -#### Outputs - -
-
Z (non-differentiable) : T
-
Concatenated string tensor
-
- -#### Type Constraints - -
-
T : tensor(string)
-
Inputs and outputs must be UTF-8 strings
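The elementwise-with-broadcasting behavior can be sketched with numpy's string routines:

```python
import numpy as np

x = np.array([["ab"], ["cd"]])  # shape (2, 1)
y = np.array(["x", "yz"])       # shape (2,), broadcast against x to (2, 2)
print(np.char.add(x, y))        # [['abx' 'abyz'] ['cdx' 'cdyz']]
```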
-
- -### **StringSplit-20** - - StringSplit splits a string tensor's elements into substrings based on a delimiter attribute and a maxsplit attribute. - - The first output of this operator is a tensor of strings representing the substrings from splitting each input string on the `delimiter` substring. This tensor has one additional rank compared to the input tensor in order to store the substrings for each input element (where the input tensor is not empty). Note that, in order to ensure the same number of elements are present in the final dimension, this tensor will pad empty strings as illustrated in the examples below. Consecutive delimiters are not grouped together and are deemed to delimit empty strings, except if the `delimiter` is unspecified or is the empty string (""). In the case where the `delimiter` is unspecified or the empty string, consecutive whitespace characters are regarded as a single separator and leading or trailing whitespace is removed in the output. - - The second output tensor represents the number of substrings generated. `maxsplit` can be used to limit the number of splits performed - after the `maxsplit`th split if the string is not fully split, the trailing suffix of input string after the final split point is also added. For elements where fewer splits are possible than specified in `maxsplit`, it has no effect. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
delimiter : string
-
Delimiter to split on. If left unset or set to the empty string (""), the input is split on consecutive whitespace.
-
maxsplit : int
-
Maximum number of splits (from left to right). If left unset (or if the number of possible splits is less than maxsplit), it will make as many splits as possible. Note that the maximum possible number of substrings returned with `maxsplit` specified is `maxsplit+1` since the remaining suffix after the `maxsplit`th split is included in the output.
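Python's str.split mirrors the maxsplit and whitespace semantics described above (an analogy, not the operator itself):

```python
s = "a,b,c,d"
print(s.split(",", 2))    # ['a', 'b', 'c,d'] -> maxsplit + 1 = 3 substrings
print("  a  b ".split())  # ['a', 'b'] (whitespace mode groups separators)
```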
-
- -#### Inputs - -
-
X (non-differentiable) : T1
-
Tensor of strings to split.
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
Tensor of substrings representing the outcome of splitting the strings in the input on the delimiter. Note that to ensure the same number of elements are present in the final rank, this tensor will pad any necessary empty strings.
-
Z (non-differentiable) : T3
-
The number of substrings generated for each input element.
-
- -#### Type Constraints - -
-
T1 : tensor(string)
-
The input must be a UTF-8 string tensor
-
T2 : tensor(string)
-
Tensor of substrings.
-
T3 : tensor(int64)
-
The number of substrings generated.
-
- # ai.onnx.preview.training ## Version 1 of the 'ai.onnx.preview.training' operator set ### **ai.onnx.preview.training.Adagrad-1** diff --git a/docs/Operators-ml.md b/docs/Operators-ml.md index decdb54b316..fd4c759161c 100644 --- a/docs/Operators-ml.md +++ b/docs/Operators-ml.md @@ -18,7 +18,7 @@ For an operator input/output's differentiability, it can be differentiable, |ai.onnx.ml.DictVectorizer|1| |ai.onnx.ml.FeatureVectorizer|1| |ai.onnx.ml.Imputer|1| -|ai.onnx.ml.LabelEncoder|4, 2, 1| +|ai.onnx.ml.LabelEncoder|2, 1| |ai.onnx.ml.LinearClassifier|1| |ai.onnx.ml.LinearRegressor|1| |ai.onnx.ml.Normalizer|1| @@ -304,7 +304,7 @@ This version of the operator has been available since version 1 of the 'ai.onnx. ### **ai.onnx.ml.FeatureVectorizer** Concatenates input tensors into one continuous output.
-    All input shapes are 2-D and are concatenated along the second dimension. 1-D tensors are treated as [1,C].
+    All input shapes are 2-D and are concatenated along the second dimension. 1-D tensors are treated as [1,C].
     Inputs are copied to the output maintaining the order of the input arguments.<br>
All inputs must be integers or floats, while the output will be all floating point values. @@ -399,25 +399,23 @@ This version of the operator has been available since version 1 of the 'ai.onnx. would be mapped to the i-th value in the specified 'values_*' attribute. It implies that input's element type and the element type of the specified 'keys_*' should be identical while the output type is identical to the - specified 'values_*' attribute. Note that the 'keys_*' and 'values_*' attributes - must have the same length. If an input element can not be found in the + specified 'values_*' attribute. If an input element can not be found in the specified 'keys_*' attribute, the 'default_*' that matches the specified - 'values_*' attribute may be used as its output value. The type of the 'default_*' - attribute must match the 'values_*' attribute chosen.
+ 'values_*' attribute may be used as its output value.
Let's consider an example which maps a string tensor to an integer tensor. Assume 'keys_strings' is ["Amy", "Sally"], 'values_int64s' is [5, 6], and 'default_int64' is '-1'. The input ["Dori", "Amy", "Amy", "Sally", "Sally"] would be mapped to [-1, 5, 5, 6, 6].<br>
Since this operator is a one-to-one mapping, its input and output shapes are the same. Notice that only one of 'keys_*'/'values_*' can be set.<br>
- Float keys with value 'NaN' match any input 'NaN' value regardless of bit - value. If a key is repeated, the last key takes precedence. + For key look-up, bit-wise comparison is used so even a float NaN can be + mapped to a value in 'values_*' attribute.
#### Version -This version of the operator has been available since version 4 of the 'ai.onnx.ml' operator set. +This version of the operator has been available since version 2 of the 'ai.onnx.ml' operator set. -Other versions of this operator: 1, 2 +Other versions of this operator: 1 #### Attributes @@ -428,152 +426,44 @@ Other versions of this operator: -
A default tensor.
keys_floats : list of floats
A list of floats.
keys_int64s : list of ints
A list of ints.
keys_strings : list of strings
-
A list of strings.
-
keys_tensor : tensor
-
Keys encoded as a 1D tensor. One and only one of 'keys_*'s should be set.
+
A list of strings. One and only one of 'keys_*'s should be set.
values_floats : list of floats
A list of floats.
values_int64s : list of ints
A list of ints.
values_strings : list of strings
-
A list of strings.
-
values_tensor : tensor
-
Values encoded as a 1D tensor. One and only one of 'values_*'s should be set.
+
A list of strings. One and only one of 'values_*'s should be set.
#### Inputs
X : T1
-
Input data. It must have the same element type as the keys_* attribute set.
+
Input data. It can be either a tensor or a scalar.
#### Outputs
Y : T2
-
Output data. This tensor's element type is based on the values_* attribute set.
+
Output data.
#### Type Constraints
-
T1 : tensor(string), tensor(int64), tensor(float), tensor(int32), tensor(int16), tensor(double)
+
T1 : tensor(string), tensor(int64), tensor(float)
The input type is a tensor of any shape.
-
T2 : tensor(string), tensor(int64), tensor(float), tensor(int32), tensor(int16), tensor(double)
+
T2 : tensor(string), tensor(int64), tensor(float)
Output type is determined by the specified 'values_*' attribute.
-#### Examples - -
-string_int_label_encoder - -```python -node = onnx.helper.make_node( - "LabelEncoder", - inputs=["X"], - outputs=["Y"], - domain="ai.onnx.ml", - keys_strings=["a", "b", "c"], - values_int64s=[0, 1, 2], - default_int64=42, -) -x = np.array(["a", "b", "d", "c", "g"]).astype(object) -y = np.array([0, 1, 42, 2, 42]).astype(np.int64) -expect( - node, - inputs=[x], - outputs=[y], - name="test_ai_onnx_ml_label_encoder_string_int", -) - -node = onnx.helper.make_node( - "LabelEncoder", - inputs=["X"], - outputs=["Y"], - domain="ai.onnx.ml", - keys_strings=["a", "b", "c"], - values_int64s=[0, 1, 2], -) -x = np.array(["a", "b", "d", "c", "g"]).astype(object) -y = np.array([0, 1, -1, 2, -1]).astype(np.int64) -expect( - node, - inputs=[x], - outputs=[y], - name="test_ai_onnx_ml_label_encoder_string_int_no_default", -) -``` - -
- - -
-tensor_based_label_encoder - -```python -tensor_keys = make_tensor( - "keys_tensor", onnx.TensorProto.STRING, (3,), ["a", "b", "c"] -) -repeated_string_keys = ["a", "b", "c"] -x = np.array(["a", "b", "d", "c", "g"]).astype(object) -y = np.array([0, 1, 42, 2, 42]).astype(np.int16) - -node = onnx.helper.make_node( - "LabelEncoder", - inputs=["X"], - outputs=["Y"], - domain="ai.onnx.ml", - keys_tensor=tensor_keys, - values_tensor=make_tensor( - "values_tensor", onnx.TensorProto.INT16, (3,), [0, 1, 2] - ), - default_tensor=make_tensor( - "default_tensor", onnx.TensorProto.INT16, (1,), [42] - ), -) - -expect( - node, - inputs=[x], - outputs=[y], - name="test_ai_onnx_ml_label_encoder_tensor_mapping", -) - -node = onnx.helper.make_node( - "LabelEncoder", - inputs=["X"], - outputs=["Y"], - domain="ai.onnx.ml", - keys_strings=repeated_string_keys, - values_tensor=make_tensor( - "values_tensor", onnx.TensorProto.INT16, (3,), [0, 1, 2] - ), - default_tensor=make_tensor( - "default_tensor", onnx.TensorProto.INT16, (1,), [42] - ), -) - -expect( - node, - inputs=[x], - outputs=[y], - name="test_ai_onnx_ml_label_encoder_tensor_value_only_mapping", -) -``` - -
- - ###
**ai.onnx.ml.LinearClassifier** Linear classifier @@ -968,7 +858,7 @@ Other versions of this operator: aggregate_function : string (default is SUM)
Defines how to aggregate leaf values within a target.
One of 'AVERAGE,' 'SUM,' 'MIN,' 'MAX.'
base_values : list of floats
-
Base values for regression, added to final prediction after applying aggregate_function; the size must be the same as the classes or can be left unassigned (assumed 0)
+
Base values for classification, added to final class score; the size must be the same as the classes or can be left unassigned (assumed 0)
base_values_as_tensor : tensor
-
Base values for regression, added to final prediction after applying aggregate_function; the size must be the same as the classes or can be left unassigned (assumed 0)
+
Base values for classification, added to final class score; the size must be the same as the classes or can be left unassigned (assumed 0)
n_targets : int
The total number of targets.
nodes_falsenodeids : list of ints
@@ -1052,7 +942,7 @@ Other versions of this operator:
Concat|13, 11, 4, 1| |ConcatFromSequence|11| |Constant|19, 13, 12, 11, 9, 1| -|ConstantOfShape|20, 9| +|ConstantOfShape|9| |Conv|11, 1| |ConvInteger|10| |ConvTranspose|11, 1| |Cos|7| |Cosh|9| |CumSum|14, 11| -|DFT|20, 17| +|DFT|17| |DeformConv|19| |DepthToSpace|13, 11, 1| |DequantizeLinear|19, 13, 10| @@ -67,14 +67,13 @@ For an operator input/output's differentiability, it can be differentiable, |GlobalLpPool|2, 1| |GlobalMaxPool|1| |Greater|13, 9, 7, 1| -|GridSample|20, 16| +|GridSample|16| |Hardmax|13, 11, 1| |Identity|19, 16, 14, 13, 1| |If|19, 16, 13, 11, 1| -|ImageDecoder|20| |InstanceNormalization|6, 1| -|IsInf|20, 10| -|IsNaN|20, 13, 9| +|IsInf|10| +|IsNaN|13, 9| |LRN|13, 1| |LSTM|14, 7, 1| |Less|13, 9, 7, 1| @@ -114,12 +113,11 @@ For an operator input/output's differentiability, it can be differentiable, |RandomUniform|1| |RandomUniformLike|1| |Reciprocal|13, 6, 1| -|ReduceMax|20, 18, 13, 12, 11, 1| +|ReduceMax|18, 13, 12, 11, 1| |ReduceMean|18, 13, 11, 1| -|ReduceMin|20, 18, 13, 12, 11, 1| +|ReduceMin|18, 13, 12, 11, 1| |ReduceProd|18, 13, 11, 1| |ReduceSum|13, 11, 1| -|RegexFullMatch|20| |Reshape|19, 14, 13, 5, 1| |Resize|19, 18, 13, 11, 10| |ReverseSequence|10| @@ -148,9 +146,7 @@ For an operator input/output's differentiability, it can be differentiable, |SplitToSequence|11| |Sqrt|13, 6, 1| |Squeeze|13, 11, 1| -|StringConcat|20| |StringNormalizer|10| -|StringSplit|20| |Sub|14, 13, 7, 6, 1| |Sum|13, 8, 6, 1| |Tan|7| @@ -166,7 +162,6 @@ For an operator input/output's differentiability, it can be differentiable, |Where|16, 9| |Xor|7, 1| |**Function**|**Since version**|**Function version**| -|AffineGrid|20|20| |Bernoulli|15|15| |BlackmanWindow|17|17| |CastLike|19, 15|19| @@ -175,7 +170,6 @@ For an operator input/output's differentiability, it can be differentiable, |Clip|13, 12, 11, 6, 1|13| |DynamicQuantizeLinear|11|11| |Elu|6, 1|18| -|Gelu|20|20| |GreaterOrEqual|16, 12|16| |GroupNormalization|18|18| |HammingWindow|17|17| @@ -219,7 +213,7 @@ For an operator input/output's differentiability, it can be differentiable, ### **Abs** Absolute takes one input data (Tensor) and produces one output data - (Tensor) where absolute value, y = abs(x), is applied to + (Tensor) where the absolute is, y = abs(x), is applied to the tensor elementwise. #### Version @@ -491,139 +485,6 @@ expect(node, inputs=[x, y], outputs=[x + y], name="test_add_uint8") -### **AffineGrid** - - Generates a 2D or 3D flow field (sampling grid), given a batch of affine matrices theta - (https://pytorch.org/docs/stable/generated/torch.nn.functional.affine_grid.html). - An affine matrix `theta` is applied to a position tensor represented in its homogeneous expression. Here is an example in 3D: - ``` - [r00, r01, r02, t0] [x] [x'] - [r10, r11, r12, t1] * [y] = [y'] - [r20, r21, r22, t2] [z] [z'] - [0, 0, 0, 1 ] [1] [1 ] - ``` - where `(x, y, z)` is the position in the original space, `(x', y', z')` is the position in the output space. - The last row is always `[0, 0, 0, 1]` and is not stored in the affine matrix. Therefore we have `theta` of shape `(N, 2, 3)` for 2D or `(N, 3, 4)` for 3D. - - Input `size` is used to define grid of positions evenly spaced in the original 2D or 3D space, with dimensions ranging from `-1` to `1`. - The output `grid` contains positions in the output space. - - When `align_corners=1`, consider `-1` and `1` to refer to the centers of the corner pixels (mark `v` in illustration). 
- ``` - v v v v - |-------------------|------------------| - -1 0 1 - ``` - When `align_corners=0`, consider `-1` and `1` to refer to the outer edge of the corner pixels. - ``` - v v v v - |------------------|-------------------| - -1 0 1 - ``` - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
align_corners : int (default is 0)
-
If align_corners=1, consider -1 and 1 to refer to the centers of the corner pixels. If align_corners=0, consider -1 and 1 to refer to the outer edge of the corner pixels.
-
- -#### Inputs - -
-
theta (non-differentiable) : T1
-
input batch of affine matrices with shape (N, 2, 3) for 2D or (N, 3, 4) for 3D
-
size (non-differentiable) : T2
-
the target output image size (N, C, H, W) for 2D or (N, C, D, H, W) for 3D
-
- -#### Outputs - -
-
grid (differentiable) : T1
-
output tensor of shape (N, H, W, 2) of 2D sample coordinates or (N, D, H, W, 3) of 3D sample coordinates.
-
- -#### Type Constraints - -
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
-
Constrain grid types to float tensors.
-
T2 : tensor(int64)
-
Constrain size's type to int64 tensors.
-
- - -#### Examples - -
-2d_no_reference_evaluator - -```python -theta_2d = create_theta_2d() -N, C, H, W = len(theta_2d), 3, 5, 6 -data_size = (H, W) -for align_corners in (0, 1): - node = onnx.helper.make_node( - "AffineGrid", - inputs=["theta", "size"], - outputs=["grid"], - align_corners=align_corners, - ) - - original_grid = construct_original_grid(data_size, align_corners) - grid = apply_affine_transform(theta_2d, original_grid) - - test_name = "test_affine_grid_2d" - if align_corners == 1: - test_name += "_align_corners" - expect( - node, - inputs=[theta_2d, np.array([N, C, H, W], dtype=np.int64)], - outputs=[grid], - name=test_name, - ) -``` - -
- - -
-3d_no_reference_evaluator - -```python -theta_3d = create_theta_3d() -N, C, D, H, W = len(theta_3d), 3, 4, 5, 6 -data_size = (D, H, W) -for align_corners in (0, 1): - node = onnx.helper.make_node( - "AffineGrid", - inputs=["theta", "size"], - outputs=["grid"], - align_corners=align_corners, - ) - - original_grid = construct_original_grid(data_size, align_corners) - grid = apply_affine_transform(theta_3d, original_grid) - - test_name = "test_affine_grid_3d" - if align_corners == 1: - test_name += "_align_corners" - expect( - node, - inputs=[theta_3d, np.array([N, C, D, H, W], dtype=np.int64)], - outputs=[grid], - name=test_name, - ) -``` - -
- - ### **And** Returns the tensor resulted from performing the `and` logical operation @@ -1613,28 +1474,21 @@ expect(node, inputs=[x], outputs=[y], name="test_atanh") the tensor according to kernel sizes, stride sizes, and pad lengths. average pooling consisting of computing the average on all values of a subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape is calculated differently - depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. - With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + data into the output tensor Y for further processing. The output spatial shape will be following: ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` or ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. + if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` - or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): - ``` - VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 - SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 - ``` And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -1707,14 +1561,11 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = [2] strides = [1] -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG") +y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0], "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_1d_default") ``` @@ -1774,14 +1625,11 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = (2, 2) strides = (1, 1) -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG") +y = pool(padded, x_shape, kernel_shape, strides, out_shape, (0, 0), "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_2d_default") ``` @@ -1853,17 +1701,17 @@ pad_bottom = 2 pad_top = 2 pad_right = 2 pad_left = 2 -pads = [pad_top, pad_left, pad_bottom, pad_right] -out_shape, pads = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides, ceil_mode=False +pad_shape = [pad_top + pad_bottom, pad_left + pad_right] +out_shape = get_output_shape( + "VALID", np.add(x_shape[2:], pad_shape), kernel_shape, strides ) padded = np.pad( x, - ((0, 0), (0, 0), (pads[0], pads[2]), (pads[1], pads[3])), + ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right)), mode="constant", constant_values=np.nan, ) -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_2d_pads") ``` @@ -1890,20 +1738,19 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 28, 28).astype(np.float32) x_shape = np.shape(x) -dilations = (1, 1) kernel_shape = (3, 3) strides = (1, 1) pad_bottom = 2 pad_top = 2 pad_right = 2 pad_left = 2 -pads = [pad_top, pad_left, pad_bottom, pad_right] -out_shape, pads = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides, dilations, ceil_mode=False +pad_shape = [pad_top + pad_bottom, pad_left + pad_right] +out_shape = get_output_shape( + "VALID", np.add(x_shape[2:], pad_shape), kernel_shape, strides ) padded = np.pad( x, - ((0, 0), (0, 0), 
(pads[0], pads[2]), (pads[1], pads[3])), + ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right)), mode="constant", constant_values=0, ) @@ -1913,8 +1760,8 @@ y = pool( kernel_shape, strides, out_shape, + pad_shape, "AVG", - pads, count_include_pad=1, ) @@ -2141,9 +1988,7 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (2, 2) strides = (1, 1) -out_shape = get_output_shape_auto_pad( - "SAME_LOWER", x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("SAME_LOWER", x_shape[2:], kernel_shape, strides) pad_shape = get_pad_shape( "SAME_LOWER", x_shape[2:], kernel_shape, strides, out_shape ) @@ -2157,8 +2002,7 @@ padded = np.pad( mode="constant", constant_values=np.nan, ) -pads = (pad_top, pad_left, pad_bottom, pad_right) -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_2d_same_lower") ``` @@ -2186,9 +2030,7 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (2, 2) strides = (1, 1) -out_shape = get_output_shape_auto_pad( - "SAME_UPPER", x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("SAME_UPPER", x_shape[2:], kernel_shape, strides) pad_shape = get_pad_shape( "SAME_UPPER", x_shape[2:], kernel_shape, strides, out_shape ) @@ -2202,8 +2044,7 @@ padded = np.pad( mode="constant", constant_values=np.nan, ) -pads = (pad_top, pad_left, pad_bottom, pad_right) -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_2d_same_upper") ``` @@ -2230,11 +2071,9 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (5, 5) strides = (3, 3) -out_shape, pads = get_output_shape_explicit_padding( - None, x_shape[2:], kernel_shape, strides, ceil_mode=False -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, (0, 0), "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_2d_strides") ``` @@ -2258,14 +2097,11 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32, 32, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = [2, 2, 2] strides = [1, 1, 1] -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "AVG") +y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0, 0, 0], "AVG") expect(node, inputs=[x], outputs=[y], name="test_averagepool_3d_default") ``` @@ -2273,131 +2109,6 @@ expect(node, inputs=[x], outputs=[y], name="test_averagepool_3d_default") -
-averagepool_3d_dilations - -```python -""" -input_shape: [1, 1, 4, 4] -output_shape: [1, 1, 2, 2] -""" -node = onnx.helper.make_node( - "AveragePool", - inputs=["x"], - outputs=["y"], - kernel_shape=[2, 2, 2], - strides=[1, 1, 1], - dilations=[2, 2, 2], - ceil_mode=True, -) - -# input shape: [1, 1, 4, 4, 4] -x = np.array( - [ - [ - [ - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - ] - ] - ] -).astype(np.float32) - -y = np.array([[[[[6, 7], [10, 11]], [[6, 7], [10, 11]]]]]).astype(np.float32) - -expect( - node, inputs=[x], outputs=[y], name="test_averagepool_3d_dilations_small" -) -``` - -
- - -
-averagepool_3d_dilations_large - -```python -x_shape = (32, 32, 32) -dilations = (2, 2, 2) -kernel_shape = (5, 5, 5) -strides = (3, 3, 3) -count_include_pad = 0 - -for count_include_pad in (0, 1): - for ceil_mode in (True, False): - node = onnx.helper.make_node( - "AveragePool", - inputs=["x"], - outputs=["y"], - kernel_shape=kernel_shape, - strides=strides, - dilations=dilations, - count_include_pad=count_include_pad, - ceil_mode=ceil_mode, - ) - - x = np.random.randn(1, 1, *x_shape).astype(np.float32) - out_shape, pads = get_output_shape_explicit_padding( - None, - x_shape, - kernel_shape, - strides, - dilations=dilations, - ceil_mode=ceil_mode, - ) - padded = np.pad( - x, - ( - (0, 0), - (0, 0), - (pads[0], pads[3]), - (pads[1], pads[4]), - (pads[2], pads[5]), - ), - mode="constant", - constant_values=0 if count_include_pad == 1 else np.nan, - ) - y = pool( - padded, - (1, 1, *x_shape), - kernel_shape, - strides, - out_shape, - "AVG", - pads=pads, - dilations=dilations, - count_include_pad=count_include_pad, - ) - - test_name = f"test_averagepool_3d_dilations_large_count_include_pad_is_{count_include_pad}_ceil_mode_is_{ceil_mode}" - expect(node, inputs=[x], outputs=[y], name=test_name) -``` - -
- - ### **BatchNormalization** Carries out batch normalization as described in the paper @@ -3296,8 +3007,8 @@ a0 = 0.42 a1 = -0.5 a2 = 0.08 y = a0 -y += a1 * np.cos(2 * np.pi * np.arange(0, size, 1, dtype=np.float32) / size) -y += a2 * np.cos(4 * np.pi * np.arange(0, size, 1, dtype=np.float32) / size) +y += a1 * np.cos(2 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / size) +y += a2 * np.cos(4 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / size) expect(node, inputs=[size], outputs=[y], name="test_blackmanwindow") # Test symmetric window @@ -3310,10 +3021,10 @@ a1 = -0.5 a2 = 0.08 y = a0 y += a1 * np.cos( - 2 * np.pi * np.arange(0, size, 1, dtype=np.float32) / (size - 1) + 2 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / (size - 1) ) y += a2 * np.cos( - 4 * np.pi * np.arange(0, size, 1, dtype=np.float32) / (size - 1) + 4 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / (size - 1) ) expect(node, inputs=[size], outputs=[y], name="test_blackmanwindow_symmetric") ``` @@ -4029,16 +3740,12 @@ for from_type, to_type in test_cases: outputs=["output"], ) if input_type_proto and output_type_proto: - like_type_proto = onnx.helper.make_tensor_type_proto( - output_type_proto.tensor_type.elem_type, like.shape - ) - expect( node, inputs=[input, like], outputs=[output], name="test_castlike_" + from_type + "_to_" + to_type, - input_type_protos=[input_type_proto, like_type_proto], + input_type_protos=[input_type_proto, output_type_proto], output_type_protos=[output_type_proto], ) else: @@ -5219,12 +4926,12 @@ for test_case, values_ in test_cases.items(): for i in range(len(values[0].shape)): in_args = ["value" + str(k) for k in range(len(values))] node = onnx.helper.make_node( - "Concat", inputs=list(in_args), outputs=["output"], axis=i + "Concat", inputs=[s for s in in_args], outputs=["output"], axis=i ) output = np.concatenate(values, i) expect( node, - inputs=list(values), + inputs=[v for v in values], outputs=[output], name="test_concat_" + test_case + "_axis_" + str(i), ) @@ -5232,12 +4939,12 @@ for test_case, values_ in test_cases.items(): for i in range(-len(values[0].shape), 0): in_args = ["value" + str(k) for k in range(len(values))] node = onnx.helper.make_node( - "Concat", inputs=list(in_args), outputs=["output"], axis=i + "Concat", inputs=[s for s in in_args], outputs=["output"], axis=i ) output = np.concatenate(values, i) expect( node, - inputs=list(values), + inputs=[v for v in values], outputs=[output], name="test_concat_" + test_case + "_axis_negative_" + str(abs(i)), ) @@ -5371,9 +5078,7 @@ expect(node, inputs=[], outputs=[values], name="test_constant") #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. - -Other versions of this operator: 9 +This version of the operator has been available since version 9 of the default ONNX operator set. #### Attributes @@ -5401,7 +5106,7 @@ Other versions of this operator: 9
T1 : tensor(int64)
Constrain input types.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool)
Constrain output types to be numerics.
@@ -5961,7 +5666,7 @@ Other versions of this operator: 1
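The constraints above pin down the contract: the shape input is an int64 tensor (T1), and the single-element `value` attribute fixes the output element type (T2). A minimal sketch in the style of the surrounding examples (the fill value 0.5 is illustrative, not from the spec):

```python
import numpy as np
import onnx
from onnx import TensorProto

# The `value` attribute is a one-element tensor; its element type (T2)
# becomes the output type. The `shape` input must be int64 (T1).
node = onnx.helper.make_node(
    "ConstantOfShape",
    inputs=["shape"],
    outputs=["y"],
    value=onnx.helper.make_tensor("value", TensorProto.FLOAT, [1], [0.5]),
)
shape = np.array([2, 3], dtype=np.int64)
y = np.full(shape, 0.5, dtype=np.float32)  # expected output: a 2x3 tensor of 0.5
```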
output_padding : list of ints
Additional elements added to the side with higher coordinate indices in the output. Each padding value in "output_padding" must be less than the corresponding stride/dilation dimension. By default, this attribute is a zero vector. Note that this attribute doesn't directly affect the computed output values. It only controls the selection of the computed values, so changing this attribute only adds or removes output elements. If "output_shape" is explicitly provided, "output_padding" does not contribute additional size to "output_shape" but participates in the computation of the needed padding amount. This is also called adjs or adjustment in some frameworks.
output_shape : list of ints
-
The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified pads values are ignored. See doc for details for equations to generate pads. Note that the output_shape attribute value should not include dimensions for batch size and channels, which are automatically inferred.
+
The shape of the output can be explicitly set, which will cause pads values to be auto-generated. If output_shape is specified, pads values are ignored. See doc for details on the equations to generate pads.
pads : list of ints
Padding for the beginning and ending along each spatial axis; it can take any value greater than or equal to 0. The value represents the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follows: [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels added at the beginning of axis `i` and xi_end the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with the auto_pad attribute. If not present, the padding defaults to 0 along the start and end of each spatial axis.
strides : list of ints
@@ -6718,64 +6423,46 @@ expect(node, inputs=[x, axis], outputs=[y], name="test_cumsum_2d_negative_axis") ### **DFT** - Computes the discrete Fourier Transform (DFT) of the input. - - Assuming the input has shape `[M, N]`, where `N` is the dimension over which the - DFT is computed and `M` denotes the conceptual "all other dimensions," - the DFT `y[m, k]` of shape `[M, N]` is defined as - - $$y[m, k] = \sum_{n=0}^{N-1} e^{-2 \pi j \frac{k n}{N} } x[m, n] ,$$ - - and the inverse transform is defined as - - $$x[m, n] = \frac{1}{N} \sum_{k=0}^{N-1} e^{2 \pi j \frac{k n}{N} } y[m, k] ,$$ - - where $j$ is the imaginary unit. - - The actual shape of the output is specified in the "output" section. - - Reference: https://docs.scipy.org/doc/scipy/tutorial/fft.html + Computes the discrete Fourier transform of input. #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. - -Other versions of this operator: 17 +This version of the operator has been available since version 17 of the default ONNX operator set. #### Attributes
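As a concrete reading of how the attributes above interact, here is a minimal sketch of the per-axis output-size rule this operator documents (the helper name is illustrative, not part of the spec):

```python
def convtranspose_output_size(
    input_size, kernel, stride=1, dilation=1,
    pad_begin=0, pad_end=0, output_padding=0,
):
    # output_shape[i] = strides[i] * (input_shape[i] - 1) + output_padding[i]
    #   + ((kernel_shape[i] - 1) * dilations[i] + 1) - pads[start_i] - pads[end_i]
    return (
        stride * (input_size - 1)
        + output_padding
        + ((kernel - 1) * dilation + 1)
        - pad_begin
        - pad_end
    )

# One spatial axis: input 3, kernel 3, stride 2, no padding -> output 7
assert convtranspose_output_size(3, 3, stride=2) == 7
```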
+
axis : int (default is 1)
+
The axis on which to perform the DFT. By default this value is set to 1, which corresponds to the first dimension after the batch index.
inverse : int (default is 0)
-
Whether to perform the inverse discrete Fourier Transform. Default is 0, which corresponds to `false`.
+
Whether to perform the inverse discrete Fourier transform. By default this value is set to 0, which corresponds to false.
onesided : int (default is 0)
-
If `onesided` is `1` and input is real, only values for `k` in `[0, 1, 2, ..., floor(n_fft/2) + 1]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., `X[m, k] = X[m, n_fft-k]*`, where `m` denotes "all other dimensions" DFT was not applied on. If the input tensor is complex, onesided output is not possible. Value can be `0` or `1`. Default is `0`.
+
If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m, n_fft - w]*. Note that if the input or window tensors are complex, then onesided output is not possible. Enabling onesided with real inputs performs a real-valued fast Fourier transform (RFFT). When invoked with real or complex valued input, the default value is 0. Values can be 0 or 1.
-#### Inputs (1 - 3) +#### Inputs (1 - 2)
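A quick numpy illustration of the conjugate symmetry behind `onesided` (numpy's `fft`/`rfft` are used as stand-ins here; they are not part of this spec):

```python
import numpy as np

x = np.random.randn(10).astype(np.float32)  # real signal, n_fft = 10
full = np.fft.fft(x)                        # all 10 complex bins
half = np.fft.rfft(x)                       # floor(10/2) + 1 = 6 bins
# onesided keeps exactly the non-redundant half of the spectrum
assert np.allclose(half, full[: 10 // 2 + 1])
# conjugate symmetry of the discarded half: X[w] == conj(X[n_fft - w])
for w in range(1, 10):
    assert np.isclose(full[w], np.conj(full[10 - w]))
```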
input (non-differentiable) : T1
-
For real input, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][1]`. For complex input, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][2]`. The final dimension represents the real and imaginary parts of the value in that order.
+
For real input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]. For complex input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. The first dimension is the batch dimension. The following N dimensions correspond to the signal's dimensions. The final dimension represents the real and imaginary parts of the value in that order.
dft_length (optional, non-differentiable) : T2
-
The length of the signal as a scalar. If greater than the axis dimension, the signal will be zero-padded up to `dft_length`. If less than the axis dimension, only the first `dft_length` values will be used as the signal.
-
axis (optional, non-differentiable) : tensor(int64)
-
The axis as a scalar on which to perform the DFT. Default is `-2` (last signal axis). Negative value means counting dimensions from the back. Accepted range is $[-r, -2] \cup [0, r-2]$ where `r = rank(input)`. The last dimension is for representing complex numbers and thus is an invalid axis.
+
The length of the signal. If greater than the axis dimension, the signal will be zero-padded up to dft_length. If less than the axis dimension, only the first dft_length values will be used as the signal. It is an optional value.
#### Outputs
output : T1
-
The Fourier Transform of the input vector. If `onesided` is `0`, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][2]`. If `axis=0` and `onesided` is `1`, the following shape is expected: `[floor(signal_dim0/2)+1][signal_dim1][signal_dim2]...[signal_dimN][2]`. If `axis=1` and `onesided` is `1`, the following shape is expected: `[signal_dim0][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]`. If `axis=N` and `onesided` is `1`, the following shape is expected: `[signal_dim0][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]`. The `signal_dim` at the specified `axis` is equal to the `dft_length`.
+
The Fourier Transform of the input vector. If onesided is 0, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. If axis=1 and onesided is 1, the following shape is expected: [batch_idx][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]. If axis=2 and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][floor(signal_dim2/2)+1]...[signal_dimN][2]. If axis=N and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]. The signal_dim at the specified axis is equal to the dft_length.
#### Type Constraints
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
T1 : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
Constrain input and output types to float tensors.
T2 : tensor(int32), tensor(int64)
-
Constrain scalar length types to integers.
+
Constrain scalar length types to integers (int32 or int64).
@@ -6784,43 +6471,6 @@ Other versions of this operator: 17
dft -```python -node = onnx.helper.make_node("DFT", inputs=["x", "", "axis"], outputs=["y"]) -x = np.arange(0, 100).reshape(10, 10).astype(np.float32) -axis = np.array(1, dtype=np.int64) -y = np.fft.fft(x, axis=0) - -x = x.reshape(1, 10, 10, 1) -y = np.stack((y.real, y.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -expect(node, inputs=[x, axis], outputs=[y], name="test_dft") - -node = onnx.helper.make_node("DFT", inputs=["x", "", "axis"], outputs=["y"]) -x = np.arange(0, 100).reshape(10, 10).astype(np.float32) -axis = np.array(2, dtype=np.int64) -y = np.fft.fft(x, axis=1) - -x = x.reshape(1, 10, 10, 1) -y = np.stack((y.real, y.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -expect(node, inputs=[x, axis], outputs=[y], name="test_dft_axis") - -node = onnx.helper.make_node( - "DFT", inputs=["x", "", "axis"], outputs=["y"], inverse=1 -) -x = np.arange(0, 100, dtype=np.complex64).reshape(10, 10) -axis = np.array(1, dtype=np.int64) -y = np.fft.ifft(x, axis=0) - -x = np.stack((x.real, x.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -y = np.stack((y.real, y.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -expect(node, inputs=[x, axis], outputs=[y], name="test_dft_inverse") -``` - -
- - -
-opset19 - ```python node = onnx.helper.make_node("DFT", inputs=["x"], outputs=["y"], axis=1) x = np.arange(0, 100).reshape(10, 10).astype(np.float32) @@ -6828,13 +6478,7 @@ y = np.fft.fft(x, axis=0) x = x.reshape(1, 10, 10, 1) y = np.stack((y.real, y.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -expect( - node, - inputs=[x], - outputs=[y], - name="test_dft_opset19", - opset_imports=[onnx.helper.make_opsetid("", 19)], -) +expect(node, inputs=[x], outputs=[y], name="test_dft") node = onnx.helper.make_node("DFT", inputs=["x"], outputs=["y"], axis=2) x = np.arange(0, 100).reshape(10, 10).astype(np.float32) @@ -6842,13 +6486,7 @@ y = np.fft.fft(x, axis=1) x = x.reshape(1, 10, 10, 1) y = np.stack((y.real, y.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -expect( - node, - inputs=[x], - outputs=[y], - name="test_dft_axis_opset19", - opset_imports=[onnx.helper.make_opsetid("", 19)], -) +expect(node, inputs=[x], outputs=[y], name="test_dft_axis") node = onnx.helper.make_node( "DFT", inputs=["x"], outputs=["y"], inverse=1, axis=1 @@ -6861,13 +6499,7 @@ y = np.fft.ifft(x, axis=0) x = np.stack((x.real, x.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) y = np.stack((y.real, y.imag), axis=2).astype(np.float32).reshape(1, 10, 10, 2) -expect( - node, - inputs=[x], - outputs=[y], - name="test_dft_inverse_opset19", - opset_imports=[onnx.helper.make_opsetid("", 19)], -) +expect(node, inputs=[x], outputs=[y], name="test_dft_inverse") ```
@@ -7318,7 +6950,7 @@ Other versions of this operator: 10T1 : tensor(int8), tensor(uint8), tensor(int32), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
Constrain 'x_zero_point' and 'x' to 8-bit integer or float, or 32-bit integer tensor.
T2 : tensor(float), tensor(float16), tensor(bfloat16)
-
'x_scale' determines the output type.
+
'x_scale' determines the output type.
@@ -7397,13 +7029,12 @@ node = onnx.helper.make_node( "DequantizeLinear", inputs=["x", "x_scale"], outputs=["y"], - axis=0, ) # scalar zero point and scale -x = make_tensor("x", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, -104]) +x = make_tensor("x", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 104]) x_scale = np.float32(2) -y = np.array([0.0, 1.0, 2.0, 896.0, -208.0], dtype=np.float32) +y = np.array([0.0, 1.0, 2.0, 896.0, 208.0], dtype=np.float32) expect( node, @@ -7416,61 +7047,6 @@ expect( -
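For intuition before the examples: dequantization computes `y = (x - x_zero_point) * x_scale`. A minimal numpy sketch assuming a uint8 input (the values are illustrative):

```python
import numpy as np

x = np.array([0, 3, 128, 255], dtype=np.uint8)
x_scale = np.float32(2.0)
x_zero_point = np.uint8(128)

# Widen before subtracting so the uint8 arithmetic cannot wrap around;
# x_scale determines the output type.
y = ((x.astype(np.int32) - np.int32(x_zero_point)) * x_scale).astype(np.float32)
# y == [-256., -250., 0., 254.]
```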
-e4m3fn_float16 - -```python -node = onnx.helper.make_node( - "DequantizeLinear", - inputs=["x", "x_scale"], - outputs=["y"], - axis=0, -) - -# scalar zero point and scale -x = make_tensor("x", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, -104]) -x_scale = np.float16(2) -y = np.array([0.0, 1.0, 2.0, 896.0, -208.0], dtype=np.float16) - -expect( - node, - inputs=[x, x_scale], - outputs=[y], - name="test_dequantizelinear_e4m3fn_float16", -) -``` - -
- - -
-e4m3fn_zero_point - -```python -node = onnx.helper.make_node( - "DequantizeLinear", - inputs=["x", "x_scale", "zero_point"], - outputs=["y"], - axis=0, -) - -# scalar zero point and scale -x = make_tensor("x", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, -104]) -zero_point = make_tensor("zero_point", TensorProto.FLOAT8E4M3FN, [1], [0]) -x_scale = np.float32(2) -y = np.array([0.0, 1.0, 2.0, 896.0, -208.0], dtype=np.float32) - -expect( - node, - inputs=[x, x_scale, zero_point], - outputs=[y], - name="test_dequantizelinear_e4m3fn_zero_point", -) -``` - -
- -
e5m2 @@ -7479,13 +7055,12 @@ node = onnx.helper.make_node( "DequantizeLinear", inputs=["x", "x_scale"], outputs=["y"], - axis=0, ) # scalar zero point and scale -x = make_tensor("x", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, -96]) +x = make_tensor("x", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96]) x_scale = np.float32(2) -y = np.array([0.0, 1.0, 2.0, 98304.0, -192.0], dtype=np.float32) +y = np.array([0.0, 1.0, 2.0, 98304.0, 192.0], dtype=np.float32) expect( node, @@ -7973,14 +7548,14 @@ expect( ### **DynamicQuantizeLinear** - A Function to fuse calculation for Scale, Zero Point and FP32->8Bit conversion of FP32 Input data. + A Function to fuse calculation for Scale, Zero Point and FP32->8Bit convertion of FP32 Input data. Outputs Scale, ZeroPoint and Quantized Input for a given FP32 Input. Scale is calculated as: ``` - y_scale = (maximum(0, max(x)) - minimum(0, min(x))) / (qmax - qmin) + y_scale = (max(x) - min(x))/(qmax - qmin) ``` - * where qmax and qmin are max and min values for quantization range i.e. [0, 255] in case of uint8 + * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8 * data range is adjusted to include 0. Zero point is calculated as: @@ -8104,7 +7679,7 @@ expect( An einsum of the form `term1, term2 -> output-term` produces an output tensor using the following equation ``` - output[output-term] = reduce-sum( input1[term1] * input2[term2] ) + output[output-term] = reduce-sum( input1[term1] * input2[term] ) ``` where the reduce-sum performs a summation over all the indices occurring in the input terms (term1, term2) @@ -9661,50 +9236,57 @@ expect( This operator is the inverse of `ScatterND`. - **Example 1** + `Example 1` - ``` - batch_dims = 0 - data = [[0,1],[2,3]] # data_shape = [2, 2] - indices = [[0,0],[1,1]] # indices_shape = [2, 2] - output = [0,3] # output_shape = [2] - ``` + batch_dims = 0 - **Example 2** + data = [[0,1],[2,3]] # data_shape = [2, 2] - ``` - batch_dims = 0 - data = [[0,1],[2,3]] # data_shape = [2, 2] - indices = [[1],[0]] # indices_shape = [2, 1] - output = [[2,3],[0,1]] # output_shape = [2, 2] - ``` + indices = [[0,0],[1,1]] # indices_shape = [2, 2] - **Example 3** + output = [0,3] # output_shape = [2] - ``` - batch_dims = 0 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] - indices = [[0,1],[1,0]] # indices_shape = [2, 2] - output = [[2,3],[4,5]] # output_shape = [2, 2] - ``` + `Example 2` - **Example 4** + batch_dims = 0 - ``` - batch_dims = 0 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] - indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2] - output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2] - ``` + data = [[0,1],[2,3]] # data_shape = [2, 2] + + indices = [[1],[0]] # indices_shape = [2, 1] + + output = [[2,3],[0,1]] # output_shape = [2, 2] + + `Example 3` + + batch_dims = 0 + + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + + indices = [[0,1],[1,0]] # indices_shape = [2, 2] + + output = [[2,3],[4,5]] # output_shape = [2, 2] + + `Example 4` + + batch_dims = 0 + + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + + indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2] + + output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2] + + `Example 5` + + batch_dims = 1 + + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + + indices = [[1],[0]] # indices_shape = [2, 1] + + output = [[2,3],[4,5]] # output_shape = [2, 2] - **Example 5** - ``` - batch_dims = 1 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 
2, 2] - indices = [[1],[0]] # indices_shape = [2, 1] - output = [[2,3],[4,5]] # output_shape = [2, 2] - ``` #### Version @@ -9824,101 +9406,6 @@ expect(
-### **Gelu** - - Gelu takes one input data (Tensor) and produces one - output data (Tensor) where the gaussian error linear units function, - $y = 0.5 * x * (1 + erf(x/sqrt(2)))$ is applied to the tensor elementwise. - If the attribute "approximate" is set to "tanh", the function estimation, - $y = 0.5 * x * (1 + Tanh(sqrt(2/\pi) * (x + 0.044715 * x^3)))$ is used and applied - to the tensor elementwise. - - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
approximate : string (default is none)
-
Gelu approximation algorithm: `"tanh"`, `"none"` (default). `"none"`: do not use approximation. `"tanh"`: use tanh approximation.
-
- -#### Inputs - -
-
X (differentiable) : T
-
Input tensor
-
- -#### Outputs - -
-
Y (differentiable) : T
-
Output tensor
-
- -#### Type Constraints - -
-
T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to float tensors.
-
- - -#### Examples - -
-gelu_default - -```python -node = onnx.helper.make_node("Gelu", inputs=["x"], outputs=["y"]) - -x = np.array([-1, 0, 1]).astype(np.float32) -# expected output [-0.15865526, 0., 0.84134474] -y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32) -expect(node, inputs=[x], outputs=[y], name="test_gelu_default_1") - -x = np.random.randn(3, 4, 5).astype(np.float32) -# expected output [2.99595031, 3.99987331, 4.99999857] -y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32) -expect(node, inputs=[x], outputs=[y], name="test_gelu_default_2") -``` - -
- - -
-gelu_tanh - -```python -node = onnx.helper.make_node( - "Gelu", inputs=["x"], outputs=["y"], approximate="tanh" -) - -x = np.array([-1, 0, 1]).astype(np.float32) -# expected output [-0.158808, 0., 0.841192] -y = ( - 0.5 - * x - * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) -).astype(np.float32) -expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_1") - -x = np.random.randn(3, 4, 5).astype(np.float32) -# expected output [2.9963627, 3.99993, 4.9999995] -y = ( - 0.5 - * x - * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) -).astype(np.float32) -expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_2") -``` - -
- - ### **Gemm** General Matrix multiplication: @@ -10529,38 +10016,31 @@ Other versions of this operator: 12 ### **GridSample** - Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`. - For spatial input `X` with shape (N, C, H, W), the `grid` will have shape (N, H_out, W_out, 2), - the output `Y` will have shape (N, C, H_out, W_out). For volumetric input `X` with shape (N, C, D, H, W), - the `grid` will have shape (N, D_out, H_out, W_out, 3), the output `Y` will have shape (N, C, D_out, H_out, W_out). - More generally, for an input `X` of rank r+2 with shape (N, C, d1, d2, ..., dr), - the `grid` will have shape (N, D1_out, D2_out, ..., Dr_out, r), the output `Y` will have shape (N, C, D1_out, D2_out, ..., Dr_out). + Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from `grid`. + Currently, only spatial (4-D) inputs are supported. For input `X` with shape (N, C, H, W) and `grid` with shape (N, H_out, W_out, 2), + the output `Y` will have shape (N, C, H_out, W_out). - The tensor `X` contains values at centers of square pixels (voxels, etc) locations such as (n, c, d1_in, d2_in, ..., dr_in). - The (n, d1_out, d2_out, ..., dr_out, :) values from the tensor `grid` are the normalized positions for interpolating the values - at the (n, c, d1_out, d2_out, ..., dr_out) locations from the output tensor `Y` using a specified interpolation method (the mode) - and a padding mode (for `grid` positions falling outside the 2-dimensional image). + The tensor `X` contains values at centers of square pixels in a H by W 2-dimensional image. + The tensor `grid` describes normalized positions where the output `Y` is to be computed + using a specified interpolation method (the mode) and a padding mode (for grid positions falling outside the 2-dimensional image). - For example, the values in `grid[n, h_out, w_out, :]` are size-2 vectors specifying normalized positions in the 2-dimensional space of `X`. - They are used to interpolate output values of `Y[n, c, h_out, w_out]`. + Elements in `grid[N, H_out, W_out]` are size-2 vectors specifying positions in the 2-dimensional space of `X`. + They are used to interpolate output values of `Y[N, C, H_out, W_out]`. - The GridSample operator is often used in doing grid generator and sampler in the - [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). - See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html). + The GridSample operator is often used in doing grid generator and sampler in the [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). + See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/master/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample). #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. - -Other versions of this operator: 16 +This version of the operator has been available since version 16 of the default ONNX operator set. #### Attributes
align_corners : int (default is 0)
-
If align_corners=1, the extrema (-1 and 1) are considered as referring to the center points of the input's corner pixels (voxels, etc.). If align_corners=0, they are instead considered as referring to the corner points of the input's corner pixels (voxels, etc.), making the sampling more resolution agnostic.
-
mode : string (default is linear)
-
Three interpolation modes: linear (default), nearest and cubic. The "linear" mode includes linear and N-linear interpolation modes depending on the number of spatial dimensions of the input tensor (i.e. linear for 1 spatial dimension, bilinear for 2 spatial dimensions, etc.). The "cubic" mode also includes N-cubic interpolation modes following the same rules. The "nearest" mode rounds to the nearest even index when the sampling point falls halfway between two indices.
+
If align_corners=1, the extrema (-1 and 1) are considered as referring to the center points of the input's corner pixels. If align_corners=0, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic.
+
mode : string (default is bilinear)
+
Three interpolation modes: bilinear (default), nearest and bicubic.
padding_mode : string (default is zeros)
Supported padding modes for outside grid values: `zeros` (default), `border`, `reflection`. zeros: use 0 for out-of-bound grid locations, border: use border values for out-of-bound grid locations, reflection: use values at locations reflected by the border for out-of-bound grid locations. If index 0 represents the margin pixel, the reflected value at index -1 will be the same as the value at index 1. For locations far away from the border, the value will keep being reflected until it becomes in-bound. If pixel location x = -3.5 reflects by border -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' = 0.5.
@@ -10569,16 +10049,16 @@ Other versions of this operator: 16
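To make the `align_corners` convention above concrete, here is a minimal sketch of how one normalized grid coordinate maps back to a pixel coordinate (the helper name is illustrative, not part of the spec):

```python
def denormalize(coord, size, align_corners):
    if align_corners:  # -1 and 1 refer to the centers of the corner pixels
        return (coord + 1) / 2 * (size - 1)
    # -1 and 1 refer to the outer edges of the corner pixels
    return ((coord + 1) * size - 1) / 2

# For a width of 4: x = -1 maps to pixel 0.0 with align_corners=1,
# but to pixel -0.5 (half a pixel outside the image) with align_corners=0.
assert denormalize(-1.0, 4, True) == 0.0
assert denormalize(-1.0, 4, False) == -0.5
```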
X (differentiable) : T1
-
Input tensor of rank r+2 that has shape (N, C, D1, D2, ..., Dr), where N is the batch size, C is the number of channels, D1, D2, ..., Dr are the spatial dimensions.
+
4-D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, and H and W are the height and width of the input data.
grid (non-differentiable) : T2
-
Input offset of shape (N, D1_out, D2_out, ..., Dr_out, r), where D1_out, D2_out, ..., Dr_out are the spatial dimensions of the grid and output, and r is the number of spatial dimensions. Grid specifies the sampling locations normalized by the input spatial dimensions. Therefore, it should have most values in the range of [-1, 1]. If the grid has values outside the range of [-1, 1], the corresponding outputs will be handled as defined by padding_mode. Following computer vision convention, the coordinates in the length-r location vector are listed from the innermost tensor dimension to the outermost, the opposite of regular tensor indexing.
+
Input offset, 4-D tensor of shape (N, H_out, W_out, 2), where H_out and W_out are the height and width of grid and output. Grid specifies the sampling pixel locations normalized by the input spatial dimensions. Therefore, it should have most values in the range of [-1, 1]. If grid has values outside the range of [-1, 1], the corresponding outputs will be handled as defined by padding_mode.
#### Outputs
Y (differentiable) : T1
-
Output tensor of rank r+2 that has shape (N, C, D1_out, D2_out, ..., Dr_out) of the sampled values. For integer input types, intermediate values are computed as floating point and cast to integer at the end.
+
4-D tensor of shape (N, C, H_out, W_out) of sampled values. For integer input types, intermediate values are computed as floating point and cast to integer at the end.
#### Type Constraints @@ -10601,7 +10081,7 @@ node = onnx.helper.make_node( "GridSample", inputs=["X", "Grid"], outputs=["Y"], - mode="linear", + mode="bilinear", padding_mode="zeros", align_corners=0, ) @@ -10732,7 +10212,7 @@ node = onnx.helper.make_node( "GridSample", inputs=["X", "Grid"], outputs=["Y"], - mode="linear", + mode="bilinear", ) # Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] Y_bilinear = np.array( @@ -10752,7 +10232,7 @@ node = onnx.helper.make_node( "GridSample", inputs=["X", "Grid"], outputs=["Y"], - mode="linear", + mode="bilinear", align_corners=1, ) # Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] @@ -10790,7 +10270,7 @@ node = onnx.helper.make_node( "GridSample", inputs=["X", "Grid"], outputs=["Y"], - mode="cubic", + mode="bicubic", ) # Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] Y_bicubic = np.array( @@ -10801,155 +10281,6 @@ Y_bicubic = np.array( expect( node, inputs=[X, Grid], outputs=[Y_bicubic], name="test_gridsample_bicubic" ) - -# ============================================================================ -# Additional tests -# The reference output tensors were generated using PyTorch 2.0. -Grid = np.array( - [ - [ - [[-1.0, -0.8], [-0.6, -0.5], [-0.1, -0.2], [0.7, 0.0]], - [[0.0, 0.4], [0.2, -0.2], [-0.3, 0.5], [-1.0, 1.0]], - ] - ], - dtype=np.float32, -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="nearest", - align_corners=0, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_nearest = np.array( - [[[[0.0, 0.0, 2.0, 3.0], [4.0, 3.0, 4.0, 4.0]]]], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_nearest], - name="test_gridsample_nearest_align_corners_0_additional_1", -) - -# setting mode = 'nearest' -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="nearest", - align_corners=1, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_nearest = np.array( - [[[[0.0, 0.0, 2.0, 3.0], [2.0, 3.0, 4.0, 4.0]]]], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_nearest], - name="test_gridsample_nearest_align_corners_1_additional_1", -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="linear", - align_corners=0, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_bilinear = np.array( - [[[[0.0000, 0.4500, 1.8000, 2.4000], [3.7000, 2.1000, 3.7000, 1.0000]]]], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_bilinear], - name="test_gridsample_bilinear_align_corners_0_additional_1", -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="linear", - align_corners=1, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_bilinear = np.array( - [[[[0.4000, 1.2000, 2.0500, 2.8500], [3.3000, 2.2000, 3.3500, 4.0000]]]], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_bilinear], - name="test_gridsample_bilinear_align_corners_1_additional_1", -) - -# These two new bicubic tests produces slightly higher error ~5e-5 -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="cubic", - align_corners=0, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_bicubic = np.array( - [ - [ - [ - [-0.173250, 0.284265, 1.923106, 2.568000], - [5.170375, 2.284414, 4.744844, 1.046875], - ] - ] - ], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_bicubic], - name="test_gridsample_bicubic_align_corners_0_additional_1", -) - -node = 
onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="cubic", - align_corners=1, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_bicubic = np.array( - [ - [ - [ - [0.304001, 1.128750, 2.266270, 3.144844], - [4.531500, 2.455360, 4.599819, 4.000000], - ] - ] - ], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_bicubic], - name="test_gridsample_bicubic_align_corners_1_additional_1", -) ``` @@ -11049,188 +10380,19 @@ expect( -
-volumeetric_gridsample_mode_aligncorners +### **GroupNormalization** -```python -X = np.array( - [ - [ - [ - [[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]], - [[9.0, 10.0], [11.0, 12.0]], - ] - ] - ], - dtype=np.float32, -) + A GroupNormalization function. Carries out group normalization as described in + the paper https://arxiv.org/abs/1803.08494 -Grid = np.array( - [ - [ - [ - [[-1.0, -1.0, -1.0], [-1.0, -0.5, 0.3]], - [[-0.5, -0.5, -0.5], [1.0, -0.6, -1.0]], - [[-0.2, -0.2, -0.2], [0.4, 0.2, 0.6]], - [[0.0, 0.0, 0.0], [-1.0, 0.0, 0.0]], - ], - [ - [[0.0, 0.0, 0.0], [-1.0, 1.0, 0.0]], - [[-0.2, -0.2, -0.2], [1.0, 0.4, -0.2]], - [[0.5, 0.5, 0.5], [-1.0, -0.8, 0.8]], - [[1.0, 1.0, 1.0], [0.4, 0.6, -0.3]], - ], - ] - ], - dtype=np.float32, -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="nearest", - align_corners=0, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_nearest = np.array( - [ - [ - [ - [[1.0, 5.0], [1.0, 0.0], [5.0, 12.0], [5.0, 5.0]], - [[5.0, 0.0], [5.0, 0.0], [12.0, 9.0], [0.0, 8.0]], - ] - ] - ], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_nearest], - name="test_gridsample_volumetric_nearest_align_corners_0", -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="nearest", - align_corners=1, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_nearest = np.array( - [ - [ - [ - [[1.0, 5.0], [1.0, 2.0], [5.0, 12.0], [5.0, 5.0]], - [[5.0, 7.0], [5.0, 8.0], [12.0, 9.0], [12.0, 8.0]], - ] - ] - ], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_nearest], - name="test_gridsample_volumetric_nearest_align_corners_1", -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="linear", - align_corners=0, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_bilinear = np.array( - [ - [ - [ - [ - [0.1250, 3.4000], - [2.0000, 0.4500], - [4.7000, 10.9000], - [6.5000, 3.0000], - ], - [ - [6.5000, 1.7500], - [4.7000, 3.3000], - [11.0000, 2.5200], - [1.5000, 5.4900], - ], - ] - ] - ], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_bilinear], - name="test_gridsample_volumetric_bilinear_align_corners_0", -) - -node = onnx.helper.make_node( - "GridSample", - inputs=["X", "Grid"], - outputs=["Y"], - mode="linear", - align_corners=1, -) -# Y shape, [N, C, H_out, W_out] - [1, 1, 2, 4] -Y_bilinear = np.array( - [ - [ - [ - [ - [1.0000, 6.7000], - [3.7500, 2.4000], - [5.4000, 9.3000], - [6.5000, 6.0000], - ], - [ - [6.5000, 7.0000], - [5.4000, 6.6000], - [9.2500, 8.4000], - [12.0000, 6.1000], - ], - ] - ] - ], - dtype=np.float32, -) - -expect( - node, - inputs=[X, Grid], - outputs=[Y_bilinear], - name="test_gridsample_volumetric_bilinear_align_corners_1", -) -``` - -
- - -### **GroupNormalization** - - A GroupNormalization function. Carries out group normalization as described in - the paper https://arxiv.org/abs/1803.08494 - - This operator transforms input according to - ``` - y = scale * (x - mean) / sqrt(variance + epsilon) + bias, - ``` - where the mean and variance are computed per instance per group of channels, and - `scale` and `bias` should be specified for each group of channels. The number of - groups `num_groups` should be divisible by the number of channels so that there are - an equal number of channels per group. + This operator transforms input according to + ``` + y = scale * (x - mean) / sqrt(variance + epsilon) + bias, + ``` + where the mean and variance are computed per instance per group of channels, and + `scale` and `bias` should be specified for each group of channels. The number of + groups `num_groups` should be divisible by the number of channels so that there are + an equal number of channels per group. When the number of groups is the same as the number of channels, this operator is equivalent to InstanceNormalization. When there is only one group, this operator @@ -11383,7 +10545,9 @@ node = onnx.helper.make_node( size = np.int32(10) a0 = 25 / 46 a1 = 1 - a0 -y = a0 - a1 * np.cos(2 * np.pi * np.arange(0, size, 1, dtype=np.float32) / size) +y = a0 - a1 * np.cos( + 2 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / size +) expect(node, inputs=[size], outputs=[y], name="test_hammingwindow") # Test symmetric window @@ -11394,7 +10558,7 @@ size = np.int32(10) a0 = 25 / 46 a1 = 1 - a0 y = a0 - a1 * np.cos( - 2 * np.pi * np.arange(0, size, 1, dtype=np.float32) / (size - 1) + 2 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / (size - 1) ) expect(node, inputs=[size], outputs=[y], name="test_hammingwindow_symmetric") ``` @@ -11458,7 +10622,9 @@ node = onnx.helper.make_node( size = np.int32(10) a0 = 0.5 a1 = 0.5 -y = a0 - a1 * np.cos(2 * np.pi * np.arange(0, size, 1, dtype=np.float32) / size) +y = a0 - a1 * np.cos( + 2 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / size +) expect(node, inputs=[size], outputs=[y], name="test_hannwindow") # Test symmetric window @@ -11469,7 +10635,7 @@ size = np.int32(10) a0 = 0.5 a1 = 0.5 y = a0 - a1 * np.cos( - 2 * np.pi * np.arange(0, size, 1, dtype=np.float32) / (size - 1) + 2 * 3.1415 * np.arange(0, size, 1, dtype=np.float32) / (size - 1) ) expect(node, inputs=[size], outputs=[y], name="test_hannwindow_symmetric") ``` @@ -11905,7 +11071,7 @@ Other versions of this operator: 1, **ImageDecoder** +### **InstanceNormalization** + + Carries out instance normalization as described in the paper + https://arxiv.org/abs/1607.08022. + + y = scale * (x - mean) / sqrt(variance + epsilon) + B, + where mean and variance are computed per instance per channel. - Loads and decodes and image from a file. If it can't decode for any reason (e.g. corrupted encoded - stream, invalid format, it will return an empty matrix). - The following image formats are supported: - * BMP - * JPEG (note: Lossless JPEG support is optional) - * JPEG2000 - * TIFF - * PNG - * WebP - * Portable image format (PBM, PGM, PPM, PXM, PNM) - Decoded images follow a channel-last layout: (Height, Width, Channels). - **JPEG chroma upsampling method:** - When upsampling the chroma components by a factor of 2, the pixels are linearly interpolated so that the - centers of the output pixels are 1/4 and 3/4 of the way between input pixel centers. 
- When rounding, 0.5 is rounded down and up at alternative pixels locations to prevent bias towards - larger values (ordered dither pattern). - Considering adjacent input pixels A, B, and C, B is upsampled to pixels B0 and B1 so that - ``` - B0 = round_half_down((1/4) * A + (3/4) * B) - B1 = round_half_up((3/4) * B + (1/4) * C) - ``` - This method, is the default chroma upsampling method in the well-established libjpeg-turbo library, - also referred as "smooth" or "fancy" upsampling. #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. +This version of the operator has been available since version 6 of the default ONNX operator set. + +Other versions of this operator: 1 #### Attributes
-
pixel_format : string (default is RGB)
-
Pixel format. Can be one of "RGB", "BGR", or "Grayscale".
+
epsilon : float (default is 1e-05)
+
The epsilon value to use to avoid division by zero.
#### Inputs
-
encoded_stream (non-differentiable) : T1
-
Encoded stream
+
input (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
scale (differentiable) : T
+
The input 1-dimensional scale tensor of size C.
+
B (differentiable) : T
+
The input 1-dimensional bias tensor of size C.
#### Outputs
-
image (non-differentiable) : T2
-
Decoded image
+
output (differentiable) : T
+
The output tensor of the same shape as input.
#### Type Constraints
-
T1 : tensor(uint8)
-
Constrain input types to 8-bit unsigned integer tensor.
-
T2 : tensor(uint8)
-
Constrain output types to 8-bit unsigned integer tensor.
+
T : tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
#### Examples
-image_decoder_decode_bmp_rgb +instancenormalization ```python +def _instancenorm_test_mode(x, s, bias, epsilon=1e-5): # type: ignore + dims_x = len(x.shape) + axis = tuple(range(2, dims_x)) + mean = np.mean(x, axis=axis, keepdims=True) + var = np.var(x, axis=axis, keepdims=True) + dim_ones = (1,) * (dims_x - 2) + s = s.reshape(-1, *dim_ones) + bias = bias.reshape(-1, *dim_ones) + return s * (x - mean) / np.sqrt(var + epsilon) + bias + +# input size: (1, 2, 1, 3) +x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32) +s = np.array([1.0, 1.5]).astype(np.float32) +bias = np.array([0, 1]).astype(np.float32) +y = _instancenorm_test_mode(x, s, bias).astype(np.float32) + node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", + "InstanceNormalization", + inputs=["x", "s", "bias"], + outputs=["y"], ) -data, output = _generate_test_data( - "bmp", _image_decoder_data.image_decoder_decode_bmp_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_bmp_rgb", +# output size: (1, 2, 1, 3) +expect(node, inputs=[x, s, bias], outputs=[y], name="test_instancenorm_example") + +# input size: (2, 3, 4, 5) +x = np.random.randn(2, 3, 4, 5).astype(np.float32) +s = np.random.randn(3).astype(np.float32) +bias = np.random.randn(3).astype(np.float32) +epsilon = 1e-2 +y = _instancenorm_test_mode(x, s, bias, epsilon).astype(np.float32) + +node = onnx.helper.make_node( + "InstanceNormalization", + inputs=["x", "s", "bias"], + outputs=["y"], + epsilon=epsilon, ) + +# output size: (2, 3, 4, 5) +expect(node, inputs=[x, s, bias], outputs=[y], name="test_instancenorm_epsilon") ```
-
-image_decoder_decode_jpeg2k_rgb +### **IsInf** -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", -) + Map infinity to true and other values to false. -data, output = _generate_test_data( - "jpeg2000", _image_decoder_data.image_decoder_decode_jpeg2k_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_jpeg2k_rgb", -) -``` +#### Version -
+This version of the operator has been available since version 10 of the default ONNX operator set. + +#### Attributes + +
+
detect_negative : int (default is 1)
+
(Optional) Whether to map negative infinity to true. Defaults to 1, so that negative infinity induces true. Set this attribute to 0 if negative infinity should be mapped to false.
+
detect_positive : int (default is 1)
+
(Optional) Whether to map positive infinity to true. Defaults to 1, so that positive infinity induces true. Set this attribute to 0 if positive infinity should be mapped to false.
+
+ +#### Inputs + +
+
X (non-differentiable) : T1
+
input
+
+ +#### Outputs + +
+
Y (non-differentiable) : T2
+
output
+
+ +#### Type Constraints + +
+
T1 : tensor(float), tensor(double)
+
Constrain input types to float tensors.
+
T2 : tensor(bool)
+
Constrain output types to boolean tensors.
+
+#### Examples +
-image_decoder_decode_jpeg_bgr +infinity ```python node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="BGR", + "IsInf", + inputs=["x"], + outputs=["y"], ) -data, output = _generate_test_data( - "jpeg", _image_decoder_data.image_decoder_decode_jpeg_bgr, "BGR" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_jpeg_bgr", -) +x = np.array([-1.2, np.nan, np.inf, 2.8, np.NINF, np.inf], dtype=np.float32) +y = np.isinf(x) +expect(node, inputs=[x], outputs=[y], name="test_isinf") ```
-image_decoder_decode_jpeg_grayscale - -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="Grayscale", -) - -data, output = _generate_test_data( - "jpeg", _image_decoder_data.image_decoder_decode_jpeg_grayscale, "Grayscale" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_jpeg_grayscale", -) -``` - -
- - -
-image_decoder_decode_jpeg_rgb - -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", -) - -data, output = _generate_test_data( - "jpeg", _image_decoder_data.image_decoder_decode_jpeg_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_jpeg_rgb", -) -``` - -
- - -
-image_decoder_decode_png_rgb - -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", -) - -data, output = _generate_test_data( - "png", _image_decoder_data.image_decoder_decode_png_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_png_rgb", -) -``` - -
- - -
-image_decoder_decode_pnm_rgb - -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", -) - -data, output = _generate_test_data( - "ppm", _image_decoder_data.image_decoder_decode_pnm_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_pnm_rgb", -) -``` - -
- - -
-image_decoder_decode_tiff_rgb - -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", -) - -data, output = _generate_test_data( - "tiff", _image_decoder_data.image_decoder_decode_tiff_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_tiff_rgb", -) -``` - -
- - -
-image_decoder_decode_webp_rgb - -```python -node = onnx.helper.make_node( - "ImageDecoder", - inputs=["data"], - outputs=["output"], - pixel_format="RGB", -) - -data, output = _generate_test_data( - "webp", _image_decoder_data.image_decoder_decode_webp_rgb, "RGB" -) -expect( - node, - inputs=[data], - outputs=[output], - name="test_image_decoder_decode_webp_rgb", -) -``` - -
- - -### **InstanceNormalization** - - Carries out instance normalization as described in the paper - https://arxiv.org/abs/1607.08022. - - y = scale * (x - mean) / sqrt(variance + epsilon) + B, - where mean and variance are computed per instance per channel. - - -#### Version - -This version of the operator has been available since version 6 of the default ONNX operator set. - -Other versions of this operator: 1 - -#### Attributes - -
-
epsilon : float (default is 1e-05)
-
The epsilon value to use to avoid division by zero.
-
- -#### Inputs - -
-
input (differentiable) : T
-
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
-
scale (differentiable) : T
-
The input 1-dimensional scale tensor of size C.
-
B (differentiable) : T
-
The input 1-dimensional bias tensor of size C.
-
- -#### Outputs - -
-
output (differentiable) : T
-
The output tensor of the same shape as input.
-
- -#### Type Constraints - -
-
T : tensor(float16), tensor(float), tensor(double)
-
Constrain input and output types to float tensors.
-
- - -#### Examples - -
-instancenormalization - -```python -def _instancenorm_test_mode(x, s, bias, epsilon=1e-5): # type: ignore - dims_x = len(x.shape) - axis = tuple(range(2, dims_x)) - mean = np.mean(x, axis=axis, keepdims=True) - var = np.var(x, axis=axis, keepdims=True) - dim_ones = (1,) * (dims_x - 2) - s = s.reshape(-1, *dim_ones) - bias = bias.reshape(-1, *dim_ones) - return s * (x - mean) / np.sqrt(var + epsilon) + bias - -# input size: (1, 2, 1, 3) -x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32) -s = np.array([1.0, 1.5]).astype(np.float32) -bias = np.array([0, 1]).astype(np.float32) -y = _instancenorm_test_mode(x, s, bias).astype(np.float32) - -node = onnx.helper.make_node( - "InstanceNormalization", - inputs=["x", "s", "bias"], - outputs=["y"], -) - -# output size: (1, 2, 1, 3) -expect(node, inputs=[x, s, bias], outputs=[y], name="test_instancenorm_example") - -# input size: (2, 3, 4, 5) -x = np.random.randn(2, 3, 4, 5).astype(np.float32) -s = np.random.randn(3).astype(np.float32) -bias = np.random.randn(3).astype(np.float32) -epsilon = 1e-2 -y = _instancenorm_test_mode(x, s, bias, epsilon).astype(np.float32) - -node = onnx.helper.make_node( - "InstanceNormalization", - inputs=["x", "s", "bias"], - outputs=["y"], - epsilon=epsilon, -) - -# output size: (2, 3, 4, 5) -expect(node, inputs=[x, s, bias], outputs=[y], name="test_instancenorm_epsilon") -``` - -
- - -### **IsInf** - - Map infinity to true and other values to false. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -Other versions of this operator: 10 - -#### Attributes - -
-
detect_negative : int (default is 1)
-
(Optional) Whether to map negative infinity to true. Defaults to 1, so that negative infinity induces true. Set this attribute to 0 if negative infinity should be mapped to false.
-
detect_positive : int (default is 1)
-
(Optional) Whether to map positive infinity to true. Defaults to 1, so that positive infinity induces true. Set this attribute to 0 if positive infinity should be mapped to false.
-
- -#### Inputs - -
-
X (non-differentiable) : T1
-
input
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
output
-
- -#### Type Constraints - -
-
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
Constrain input types to float tensors.
-
T2 : tensor(bool)
-
Constrain output types to boolean tensors.
-
- - -#### Examples - -
-infinity - -```python -node = onnx.helper.make_node( - "IsInf", - inputs=["x"], - outputs=["y"], -) - -x = np.array([-1.2, np.nan, np.inf, 2.8, np.NINF, np.inf], dtype=np.float32) -y = np.isinf(x) -expect(node, inputs=[x], outputs=[y], name="test_isinf") -``` - -
- - -
-infinity_float16 - -```python -node = onnx.helper.make_node( - "IsInf", - inputs=["x"], - outputs=["y"], -) - -x = np.array([-1.2, np.nan, np.inf, 2.8, np.NINF, np.inf], dtype=np.float16) -y = np.isinf(x) -expect(node, inputs=[x], outputs=[y], name="test_isinf_float16") -``` - -
- - -
-negative_infinity_only +negative_infinity_only ```python node = onnx.helper.make_node( @@ -12644,9 +11502,9 @@ expect(node, inputs=[x], outputs=[y], name="test_isinf_positive") #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. +This version of the operator has been available since version 13 of the default ONNX operator set. -Other versions of this operator: 9, 13 +Other versions of this operator: 9 #### Inputs @@ -12665,7 +11523,7 @@ Other versions of this operator: 9, 9, - -
isnan @@ -12702,7 +11542,7 @@ node = onnx.helper.make_node( outputs=["y"], ) -x = np.array([-1.2, np.nan, np.inf, 2.8, np.NINF, np.inf], dtype=np.float32) +x = np.array([3.0, np.nan, 4.0, np.nan], dtype=np.float32) y = np.isnan(x) expect(node, inputs=[x], outputs=[y], name="test_isnan") ``` @@ -13182,7 +12022,7 @@ This version of the operator has been available since version 17 of the default
axis : int (default is -1)
-
The first normalization dimension. If rank(X) is r, axis' allowed range is [-r, r). Negative value means counting dimensions from the back.
+
The first normalization dimension. If rank(X) is r, axis' allowed range is [-r, r). Negative value means counting dimensions from the back.
epsilon : float (default is 1e-05)
The epsilon value to use to avoid division by zero.
stash_type : int (default is 1)
@@ -14560,12 +13400,9 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32).astype(np.float32) x_shape = np.shape(x) -pads = None -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", p=p) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0], "LPPOOL", p=p) expect(node, inputs=[x], outputs=[y], name="test_lppool_1d_default") ``` @@ -14591,14 +13428,13 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = (2, 2) strides = (1, 1) -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", p=p) +y = pool( + padded, x_shape, kernel_shape, strides, out_shape, (0, 0), "LPPOOL", p=p +) expect(node, inputs=[x], outputs=[y], name="test_lppool_2d_default") ``` @@ -14677,9 +13513,9 @@ x_shape = np.shape(x) kernel_shape = (3, 3) strides = (1, 1) pad_bottom = pad_top = pad_right = pad_left = 2 -pads = [pad_top, pad_left, pad_bottom, pad_right] -out_shape, pads = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides +pad_shape = [pad_top + pad_bottom, pad_left + pad_right] +out_shape = get_output_shape( + "VALID", np.add(x_shape[2:], pad_shape), kernel_shape, strides ) padded = np.pad( x, @@ -14687,7 +13523,9 @@ padded = np.pad( mode="constant", constant_values=0, ) -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", pads, p=p) +y = pool( + padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "LPPOOL", p=p +) expect(node, inputs=[x], outputs=[y], name="test_lppool_2d_pads") ``` @@ -14717,9 +13555,7 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (2, 2) strides = (1, 1) -out_shape = get_output_shape_auto_pad( - "SAME_LOWER", x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("SAME_LOWER", x_shape[2:], kernel_shape, strides) pad_shape = get_pad_shape( "SAME_LOWER", x_shape[2:], kernel_shape, strides, out_shape ) @@ -14733,8 +13569,9 @@ padded = np.pad( mode="constant", constant_values=0, ) -pads = [pad_top, pad_left, pad_bottom, pad_right] -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", pads, p=p) +y = pool( + padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "LPPOOL", p=p +) expect(node, inputs=[x], outputs=[y], name="test_lppool_2d_same_lower") ``` @@ -14764,9 +13601,7 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (2, 2) strides = (1, 1) -out_shape = get_output_shape_auto_pad( - "SAME_UPPER", x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("SAME_UPPER", x_shape[2:], kernel_shape, strides) pad_shape = get_pad_shape( "SAME_UPPER", x_shape[2:], kernel_shape, strides, out_shape ) @@ -14780,8 +13615,9 @@ padded = np.pad( mode="constant", constant_values=0, ) -pads = [pad_top, pad_left, pad_bottom, pad_right] -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", pads, p=p) +y = pool( + padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "LPPOOL", p=p +) expect(node, inputs=[x], outputs=[y], name="test_lppool_2d_same_upper") ``` @@ -14808,14 +13644,13 @@ node = 
onnx.helper.make_node( ) x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = (5, 5) strides = (3, 3) -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", p=p) +y = pool( + padded, x_shape, kernel_shape, strides, out_shape, (0, 0), "LPPOOL", p=p +) expect(node, inputs=[x], outputs=[y], name="test_lppool_2d_strides") ``` @@ -14841,14 +13676,13 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32, 32, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = [2, 2, 2] strides = [1, 1, 1] -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "LPPOOL", p=p) +y = pool( + padded, x_shape, kernel_shape, strides, out_shape, [0, 0, 0], "LPPOOL", p=p +) expect(node, inputs=[x], outputs=[y], name="test_lppool_3d_default") ``` @@ -15128,28 +13962,21 @@ for op_dtype in all_numeric_dtypes: the tensor according to kernel sizes, stride sizes, and pad lengths. max pooling consisting of computing the max on all values of a subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape is calculated differently - depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. - With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + data into the output tensor Y for further processing. The output spatial shape will be following: ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` or ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. + if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` - or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): - ``` - VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 - SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 - ``` And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -15226,14 +14053,11 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = [2] strides = [1] -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX") +y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0], "MAX") expect(node, inputs=[x], outputs=[y], name="test_maxpool_1d_default") ``` @@ -15293,14 +14117,11 @@ node = onnx.helper.make_node( ) x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) -pads = None kernel_shape = (2, 2) strides = (1, 1) -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX") +y = pool(padded, x_shape, kernel_shape, strides, out_shape, (0, 0), "MAX") expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_default") ``` @@ -15365,9 +14186,9 @@ x_shape = np.shape(x) kernel_shape = (3, 3) strides = (1, 1) pad_bottom = pad_top = pad_right = pad_left = 2 -pads = [pad_top, pad_left, pad_bottom, pad_right] -out_shape, pads = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides +pad_shape = [pad_top + pad_bottom, pad_left + pad_right] +out_shape = get_output_shape( + "VALID", np.add(x_shape[2:], pad_shape), kernel_shape, strides ) padded = np.pad( x, @@ -15375,8 +14196,7 @@ padded = np.pad( mode="constant", constant_values=np.nan, ) - -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "MAX") expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_pads") ``` @@ -15527,9 +14347,7 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (2, 2) strides = (1, 1) -out_shape = get_output_shape_auto_pad( - "SAME_LOWER", x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("SAME_LOWER", x_shape[2:], kernel_shape, strides) pad_shape = get_pad_shape( "SAME_LOWER", x_shape[2:], kernel_shape, strides, out_shape ) @@ -15543,8 +14361,7 @@ padded = np.pad( mode="constant", constant_values=np.nan, ) -pads = [pad_top, pad_left, pad_bottom, pad_right] -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, 
pad_shape, "MAX") expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_same_lower") ``` @@ -15572,9 +14389,7 @@ x = np.random.randn(1, 3, 32, 32).astype(np.float32) x_shape = np.shape(x) kernel_shape = (2, 2) strides = (1, 1) -out_shape = get_output_shape_auto_pad( - "SAME_UPPER", x_shape[2:], kernel_shape, strides -) +out_shape = get_output_shape("SAME_UPPER", x_shape[2:], kernel_shape, strides) pad_shape = get_pad_shape( "SAME_UPPER", x_shape[2:], kernel_shape, strides, out_shape ) @@ -15588,8 +14403,7 @@ padded = np.pad( mode="constant", constant_values=np.nan, ) -pads = [pad_top, pad_left, pad_bottom, pad_right] -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX", pads) +y = pool(padded, x_shape, kernel_shape, strides, out_shape, pad_shape, "MAX") expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_same_upper") ``` @@ -15609,290 +14423,91 @@ node = onnx.helper.make_node( "MaxPool", inputs=["x"], outputs=["y"], kernel_shape=[5, 5], strides=[3, 3] ) x = np.random.randn(1, 3, 32, 32).astype(np.float32) -x_shape = np.shape(x) -pads = None -kernel_shape = (5, 5) -strides = (3, 3) -out_shape, pads = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) -padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX") - -expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_strides") -``` - -
- - -
-maxpool_2d_uint8 - -```python -""" -input_shape: [1, 1, 5, 5] -output_shape: [1, 1, 5, 5] -pad_shape: [4, 4] -> [2, 2, 2, 2] by axis -""" -node = onnx.helper.make_node( - "MaxPool", - inputs=["x"], - outputs=["y"], - kernel_shape=[5, 5], - pads=[2, 2, 2, 2], -) -x = np.array( - [ - [ - [ - [1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15], - [16, 17, 18, 19, 20], - [21, 22, 23, 24, 25], - ] - ] - ] -).astype(np.uint8) -y = np.array( - [ - [ - [ - [13, 14, 15, 15, 15], - [18, 19, 20, 20, 20], - [23, 24, 25, 25, 25], - [23, 24, 25, 25, 25], - [23, 24, 25, 25, 25], - ] - ] - ] -).astype(np.uint8) - -expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_uint8") -``` - -
- - -
-maxpool_3d_default - -```python -""" -input_shape: [1, 3, 32, 32, 32] -output_shape: [1, 3, 31, 31, 31] -""" -node = onnx.helper.make_node( - "MaxPool", - inputs=["x"], - outputs=["y"], - kernel_shape=[2, 2, 2], -) -x = np.random.randn(1, 3, 32, 32, 32).astype(np.float32) -x_shape = np.shape(x) -pads = None -kernel_shape = [2, 2, 2] -strides = [1, 1, 1] -out_shape, _ = get_output_shape_explicit_padding( - pads, x_shape[2:], kernel_shape, strides -) +x_shape = np.shape(x) +kernel_shape = (5, 5) +strides = (3, 3) +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) padded = x -y = pool(padded, x_shape, kernel_shape, strides, out_shape, "MAX") +y = pool(padded, x_shape, kernel_shape, strides, out_shape, (0, 0), "MAX") -expect(node, inputs=[x], outputs=[y], name="test_maxpool_3d_default") +expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_strides") ```
-maxpool_3d_dilations +maxpool_2d_uint8 ```python """ -input_shape: [1, 1, 4, 4, 4] -output_shape: [1, 1, 2, 2, 2] +input_shape: [1, 1, 5, 5] +output_shape: [1, 1, 5, 5] +pad_shape: [4, 4] -> [2, 2, 2, 2] by axis """ node = onnx.helper.make_node( "MaxPool", inputs=["x"], outputs=["y"], - kernel_shape=[2, 2, 2], - strides=[1, 1, 1], - dilations=[2, 2, 2], + kernel_shape=[5, 5], + pads=[2, 2, 2, 2], ) x = np.array( [ [ [ - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], + [1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15], + [16, 17, 18, 19, 20], + [21, 22, 23, 24, 25], ] ] ] -).astype(np.float32) -y = np.array([[[[[11, 12], [15, 16]], [[11, 12], [15, 16]]]]]).astype( - np.float32 -) - -expect(node, inputs=[x], outputs=[y], name="test_maxpool_3d_dilations") -``` - -
- - -
-maxpool_3d_dilations_use_ref_impl - -```python -""" -input_shape: [1, 1, 4, 4, 4] -output_shape: [1, 1, 2, 2, 2] -""" -dilations = [2, 2, 2] -kernel_shape = [2, 2, 2] -strides = [1, 1, 1] -ceil_mode = False -node = onnx.helper.make_node( - "MaxPool", - inputs=["x"], - outputs=["y"], - kernel_shape=[2, 2, 2], - strides=[1, 1, 1], - dilations=dilations, -) -x = np.array( +).astype(np.uint8) +y = np.array( [ [ [ - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - ], + [13, 14, 15, 15, 15], + [18, 19, 20, 20, 20], + [23, 24, 25, 25, 25], + [23, 24, 25, 25, 25], + [23, 24, 25, 25, 25], ] ] ] -).astype(np.float32) - -x_shape = x.shape[2:] -out_shape, pads = get_output_shape_explicit_padding( - None, x_shape, kernel_shape, strides, dilations, ceil_mode=ceil_mode -) -padded = x -y = pool( - padded, - (1, 1, *x_shape), - kernel_shape, - strides, - out_shape, - "MAX", - pads, - dilations=dilations, -) +).astype(np.uint8) -expect( - node, inputs=[x], outputs=[y], name="test_maxpool_3d_dilations_use_ref_impl" -) +expect(node, inputs=[x], outputs=[y], name="test_maxpool_2d_uint8") ```
-maxpool_3d_dilations_use_ref_impl_large +maxpool_3d_default ```python -x_shape = (32, 32, 32) -dilations = (2, 2, 2) -kernel_shape = (5, 5, 5) -strides = (3, 3, 3) -ceil_mode = True - +""" +input_shape: [1, 3, 32, 32, 32] +output_shape: [1, 3, 31, 31, 31] +""" node = onnx.helper.make_node( "MaxPool", inputs=["x"], outputs=["y"], - kernel_shape=kernel_shape, - strides=strides, - dilations=dilations, - ceil_mode=ceil_mode, -) - -x = np.random.randn(1, 1, *x_shape).astype(np.float32) -out_shape, pads = get_output_shape_explicit_padding( - None, x_shape, kernel_shape, strides, dilations, ceil_mode=ceil_mode -) -padded = np.pad( - x, - ( - (0, 0), - (0, 0), - (pads[0], pads[3]), - (pads[1], pads[4]), - (pads[2], pads[5]), - ), - mode="constant", - constant_values=0, -) -y = pool( - padded, - (1, 1, *x_shape), - kernel_shape, - strides, - out_shape, - "MAX", - pads, - dilations=dilations, + kernel_shape=[2, 2, 2], ) +x = np.random.randn(1, 3, 32, 32, 32).astype(np.float32) +x_shape = np.shape(x) +kernel_shape = [2, 2, 2] +strides = [1, 1, 1] +out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) +padded = x +y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0, 0, 0], "MAX") -expect( - node, - inputs=[x], - outputs=[y], - name="test_maxpool_3d_dilations_use_ref_impl_large", -) +expect(node, inputs=[x], outputs=[y], name="test_maxpool_3d_default") ```
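The explicit-padding output-shape formula quoted above can be sanity-checked numerically. A small sketch follows; the helper name is illustrative and not part of ONNX:

```python
import math

def maxpool_out_dim(in_dim, kernel, stride, pad_sum, dilation=1, ceil_mode=False):
    # Output-size rule quoted above for one spatial axis: the effective kernel
    # spans (kernel - 1) * dilation + 1 input positions.
    eff_kernel = (kernel - 1) * dilation + 1
    v = (in_dim + pad_sum - eff_kernel) / stride + 1
    return math.ceil(v) if ceil_mode else math.floor(v)

# 32x32 input, 5x5 kernel, stride 3, no padding -> 10x10 output,
# matching the maxpool_2d_strides example above.
assert maxpool_out_dim(32, 5, 3, 0) == 10
```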


@@ -16056,7 +14671,7 @@ This version of the operator has been available since version 1 of the default O
   MaxUnpool essentially computes the partial inverse of the MaxPool op.
   The input information to this op is typically the output information from a MaxPool op. The first
   input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
-  from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
+  from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
   to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op.
   The third (optional) input is a tensor that specifies the output size of the unpooling operation.
@@ -16069,7 +14684,7 @@ This version of the operator has been available since version 1 of the default O
   known/predictable size.
 
   In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads,
-  which define the exact unpooling op. The attributes typically have the same values as the corresponding
+  which define the exact unpooling op. The attributes typically have the same values as the corresponding
   pooling op that the unpooling op is trying to invert.

#### Version

@@ -16278,7 +14893,7 @@ Other versions of this operator: 
9
indices (non-differentiable) : T1
Input tensor containing indices. Any entries in the 'indices' input tensor with values outside the range [-depth, depth-1] will result in one-hot representation with all 'off_value' values in the output tensor. In case 'indices' is of non-integer type, the values will be cast to int64 before use.
depth (non-differentiable) : T2
-
Scalar or Rank 1 tensor containing exactly one element, specifying the number of classes in one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [-depth, depth-1]. In case 'depth' is of non-integer type, it will be casted to int64 before use.
+
Scalar specifying the number of classes in the one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [-depth, depth-1]. In case 'depth' is of non-integer type, it will be cast to int64 before use.
values (non-differentiable) : T3
Rank 1 tensor containing exactly two elements, in the format [off_value, on_value], where 'on_value' is the value used for filling locations specified in 'indices' input tensor, and 'off_value' is the value used for filling locations other than those specified in 'indices' input tensor.
@@ -19126,7 +17741,7 @@ Other versions of this operator: 1, **ReduceL1** Computes the L1 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -20767,7 +19381,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -20860,36 +19474,6 @@ expect(
-
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceL1", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -reduced = np.array(np.zeros(reduced_shape, dtype=np.float32)) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_l1_empty_set", -) -``` - -
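Since ReduceL1 is described above as an L1 norm along `axes` with the `keepdims` rank rule, the NumPy equivalent is short. A hedged sketch of that equivalence:

```python
import numpy as np

data = np.array([[1.0, -2.0], [-3.0, 4.0]], dtype=np.float32)
axes = (1,)

# keepdims=1 keeps the reduced axis as size 1; keepdims=0 prunes it.
assert np.sum(np.abs(data), axis=axes, keepdims=True).shape == (2, 1)
assert np.sum(np.abs(data), axis=axes, keepdims=False).shape == (2,)
```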
- -
keepdims @@ -20983,13 +19567,12 @@ expect( ### **ReduceL2** Computes the L2 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -21026,7 +19609,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -21125,36 +19708,6 @@ expect(
-
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceL2", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -reduced = np.array(np.zeros(reduced_shape, dtype=np.float32)) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_l2_empty_set", -) -``` - -
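Likewise for ReduceL2, which composes square, sum, and square root; again a hedged NumPy illustration rather than the reference implementation:

```python
import numpy as np

data = np.array([[3.0, 4.0]], dtype=np.float32)
reduced = np.sqrt(np.sum(np.square(data), axis=(1,), keepdims=True))
assert reduced[0, 0] == 5.0  # the classic 3-4-5 check
```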
- -
keepdims @@ -21260,13 +19813,12 @@ expect( ### **ReduceLogSum** Computes the log sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -21303,43 +19855,12 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
#### Examples -
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceLogSum", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -zero = np.array(np.zeros(reduced_shape, dtype=np.float32)) -reduced = np.log(zero) # -inf - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_log_sum_empty_set", -) -``` - -
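ReduceLogSum composes ReduceSum and Log, as the description above implies. A minimal NumPy illustration:

```python
import numpy as np

data = np.array([[1.0, np.e - 1.0]], dtype=np.float64)
reduced = np.log(np.sum(data, axis=(1,), keepdims=True))
assert np.isclose(reduced[0, 0], 1.0)  # log(1 + (e - 1)) == log(e)
```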
- -
keepdims @@ -21428,13 +19949,12 @@ expect( ### **ReduceLogSumExp** Computes the log sum exponent of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -21471,7 +19991,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -21565,37 +20085,6 @@ expect(
-
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceLogSumExp", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -zero = np.array(np.zeros(reduced_shape, dtype=np.float32)) -reduced = np.log(zero) # -inf - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_log_sum_exp_empty_set", -) -``` - -
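ReduceLogSumExp is typically evaluated in max-shifted form so that large inputs do not overflow. A hedged sketch of that standard trick (not the ONNX reference implementation):

```python
import numpy as np

def reduce_log_sum_exp(data, axes, keepdims=True):
    # Max-shifted evaluation: exp() never sees values above 0.
    m = np.max(data, axis=axes, keepdims=True)
    out = m + np.log(np.sum(np.exp(data - m), axis=axes, keepdims=True))
    return out if keepdims else np.squeeze(out, axis=axes)

data = np.array([[1000.0, 1000.0]], dtype=np.float64)
assert np.isfinite(reduce_log_sum_exp(data, (1,))).all()  # naive log(sum(exp(x))) overflows here
```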
- -
keepdims @@ -21691,21 +20180,18 @@ expect( ### **ReduceMax** Computes the max of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. - + valid. - If the input data type is Boolean, the comparison should consider `False < True`. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. +This version of the operator has been available since version 18 of the default ONNX operator set. -Other versions of this operator: 1, 11, 12, 13, 18 +Other versions of this operator: 1, 11, 12, 13 #### Attributes @@ -21735,48 +20221,13 @@ Other versions of this operator: 1, -
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8), tensor(bool)
-
Constrain input and output types to numeric and Boolean tensors.
+
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8)
+
Constrain input and output types to high-precision and 8 bit numeric tensors.
#### Examples -
-bool_inputs - -```python -axes = np.array([1], dtype=np.int64) -keepdims = 1 - -node = onnx.helper.make_node( - "ReduceMax", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array( - [[True, True], [True, False], [False, True], [False, False]], -) -reduced = np.maximum.reduce(data, axis=tuple(axes), keepdims=bool(keepdims)) -# print(reduced) -# [[True], -# [True], -# [True], -# [False]] - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_max_bool_inputs", -) -``` - -
- -
default_axes_keepdims @@ -21799,7 +20250,6 @@ expect( inputs=[data], outputs=[reduced], name="test_reduce_max_default_axes_keepdim_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -21811,7 +20261,6 @@ expect( inputs=[data], outputs=[reduced], name="test_reduce_max_default_axes_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -21848,7 +20297,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_max_do_not_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -21860,7 +20308,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_max_do_not_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -21897,7 +20344,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_max_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -21909,7 +20355,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_max_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -21946,7 +20391,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_max_negative_axes_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -21958,7 +20402,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_max_negative_axes_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -21968,13 +20411,12 @@ expect( ### **ReduceMean** Computes the mean of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields undefined. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22011,7 +20453,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -22206,21 +20648,18 @@ expect( ###
**ReduceMin** Computes the min of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. - - - If the input data type is Boolean, the comparison should consider `False < True`. + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. +This version of the operator has been available since version 18 of the default ONNX operator set. -Other versions of this operator: 1, 11, 12, 13, 18 +Other versions of this operator: 1, 11, 12, 13 #### Attributes @@ -22250,48 +20689,13 @@ Other versions of this operator: 1, -
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8), tensor(bool)
-
Constrain input and output types to numeric and Boolean tensors.
+
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(int8)
+
Constrain input and output types to high-precision and 8 bit numeric tensors.
#### Examples -
-bool_inputs - -```python -axes = np.array([1], dtype=np.int64) -keepdims = 1 - -node = onnx.helper.make_node( - "ReduceMin", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array( - [[True, True], [True, False], [False, True], [False, False]], -) -reduced = np.minimum.reduce(data, axis=tuple(axes), keepdims=bool(keepdims)) -# print(reduced) -# [[ True], -# [False], -# [False], -# [False]] - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_min_bool_inputs", -) -``` - -
- -
default_axes_keepdims @@ -22317,7 +20721,6 @@ expect( inputs=[data], outputs=[reduced], name="test_reduce_min_default_axes_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -22329,7 +20732,6 @@ expect( inputs=[data], outputs=[reduced], name="test_reduce_min_default_axes_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -22366,7 +20768,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_min_do_not_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -22378,39 +20779,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_min_do_not_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], -) -``` - -
- - -
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceMin", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -one = np.array(np.ones(reduced_shape, dtype=np.float32)) -zero = np.array(np.zeros(reduced_shape, dtype=np.float32)) -reduced = one / zero # inf - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_min_empty_set", ) ``` @@ -22447,7 +20815,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_min_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -22459,7 +20826,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_min_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -22496,7 +20862,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_min_negative_axes_keepdims_example", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) np.random.seed(0) @@ -22508,7 +20873,6 @@ expect( inputs=[data, axes], outputs=[reduced], name="test_reduce_min_negative_axes_keepdims_random", - opset_imports=[onnx.helper.make_opsetid("", 18)], ) ``` @@ -22518,13 +20882,12 @@ expect( ### **ReduceProd** Computes the product of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 1. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22561,7 +20924,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -22652,36 +21015,6 @@ expect(
-
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceProd", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -reduced = np.array(np.ones(reduced_shape, dtype=np.float32)) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_prod_empty_set", -) -``` - -
- -
keepdims @@ -22775,13 +21108,12 @@ expect( ### **ReduceSum** Computes the sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. - + valid. - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -22818,7 +21150,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -22953,37 +21285,6 @@ expect(
-
-empty_set - -```python -"""Test case with the reduced-axis of size zero.""" -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceSum", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -reduced = np.array(np.zeros(reduced_shape, dtype=np.float32)) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_sum_empty_set", -) -``` - -
- -
keepdims @@ -23070,47 +21371,15 @@ expect(
-
-non_reduced_axis_zero - -```python -"""Test case with the non-reduced-axis of size zero.""" -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 0, 1] - -node = onnx.helper.make_node( - "ReduceSum", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([2], dtype=np.int64) -reduced = np.array([], dtype=np.float32).reshape(reduced_shape) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_sum_empty_set_non_reduced_axis_zero", -) -``` - -
- - ###
**ReduceSumSquare** Computes the sum square of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. Reduction over an empty set of values yields 0. + valid. - - The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` - to `False` instead of `True`. + The above behavior is similar to numpy, with the exception that numpy defaults keepdims to + False instead of True. #### Version @@ -23147,7 +21416,7 @@ Other versions of this operator: 1,
T : tensor(uint32), tensor(uint64), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-
Constrain input and output types to numeric tensors.
+
Constrain input and output types to high-precision numeric tensors.
@@ -23243,36 +21512,6 @@ expect(
-
-empty_set - -```python -shape = [2, 0, 4] -keepdims = 1 -reduced_shape = [2, 1, 4] - -node = onnx.helper.make_node( - "ReduceSumSquare", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array([], dtype=np.float32).reshape(shape) -axes = np.array([1], dtype=np.int64) -reduced = np.array(np.zeros(reduced_shape, dtype=np.float32)) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_sum_square_empty_set", -) -``` - -
- -
keepdims @@ -23304,176 +21543,61 @@ expect( name="test_reduce_sum_square_keepdims_example", ) -np.random.seed(0) -data = np.random.uniform(-10, 10, shape).astype(np.float32) -reduced = np.sum(np.square(data), axis=tuple(axes), keepdims=keepdims == 1) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_sum_square_keepdims_random", -) -``` - -
- - -
-negative_axes_keepdims - -```python -shape = [3, 2, 2] -axes = np.array([-2], dtype=np.int64) -keepdims = 1 - -node = onnx.helper.make_node( - "ReduceSumSquare", - inputs=["data", "axes"], - outputs=["reduced"], - keepdims=keepdims, -) - -data = np.array( - [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], dtype=np.float32 -) -reduced = np.sum(np.square(data), axis=tuple(axes), keepdims=keepdims == 1) -# print(reduced) -# [[[10., 20.s]] -# [[74., 100.]] -# [[202., 244.]]] - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_sum_square_negative_axes_keepdims_example", -) - -np.random.seed(0) -data = np.random.uniform(-10, 10, shape).astype(np.float32) -reduced = np.sum(np.square(data), axis=tuple(axes), keepdims=keepdims == 1) - -expect( - node, - inputs=[data, axes], - outputs=[reduced], - name="test_reduce_sum_square_negative_axes_keepdims_random", -) -``` - -
- - -### **RegexFullMatch** - - RegexFullMatch performs a full regex match on each element of the input tensor. If an element fully matches the regex pattern specified as an attribute, the corresponding element in the output is True and it is False otherwise. [RE2](https://github.com/google/re2/wiki/Syntax) regex syntax is used. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
pattern : string
-
Regex pattern to match on. This must be valid RE2 syntax.
-
- -#### Inputs - -
-
X (non-differentiable) : T1
-
Tensor with strings to match on.
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
Tensor of bools indicating if each input string fully matches the regex pattern specified.
-
- -#### Type Constraints - -
-
T1 : tensor(string)
-
Inputs must be UTF-8 strings
-
T2 : tensor(bool)
-
Outputs are bools and are True where there is a full regex match and False otherwise.
-
- - -#### Examples - -
-basic - -```python -node = onnx.helper.make_node( - "RegexFullMatch", - inputs=["X"], - outputs=["Y"], - pattern=r"www\.[\w.-]+\.\bcom\b", -) +np.random.seed(0) +data = np.random.uniform(-10, 10, shape).astype(np.float32) +reduced = np.sum(np.square(data), axis=tuple(axes), keepdims=keepdims == 1) -x = np.array(["www.google.com", "www.facebook.com", "www.bbc.co.uk"]).astype( - object +expect( + node, + inputs=[data, axes], + outputs=[reduced], + name="test_reduce_sum_square_keepdims_random", ) -result = np.array([True, True, False]) -expect(node, inputs=[x], outputs=[result], name="test_regex_full_match_basic") ```
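For intuition, Python's built-in `re.fullmatch` applies the same whole-string matching rule, keeping in mind that ONNX specifies the RE2 dialect rather than Python's; the pattern below is a simplified stand-in, not the one from the example above:

```python
import re

xs = ["www.google.com", "www.bbc.co.uk"]
# fullmatch succeeds only if the entire string matches the pattern.
print([re.fullmatch(r"www\.[\w.-]+\.com", x) is not None for x in xs])  # [True, False]
```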
-match_email_domain +negative_axes_keepdims ```python +shape = [3, 2, 2] +axes = np.array([-2], dtype=np.int64) +keepdims = 1 + node = onnx.helper.make_node( - "RegexFullMatch", - inputs=["X"], - outputs=["Y"], - pattern=r"(\W|^)[\w.\-]{0,25}@(yahoo|gmail)\.com(\W|$)", + "ReduceSumSquare", + inputs=["data", "axes"], + outputs=["reduced"], + keepdims=keepdims, ) -x = np.array( - [ - ["account@gmail.com", "account@hotmail.com"], - ["not email", "account2@yahoo.com"], - ] -).astype(object) -result = np.array([[True, False], [False, True]]) +data = np.array( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], dtype=np.float32 +) +reduced = np.sum(np.square(data), axis=tuple(axes), keepdims=keepdims == 1) +# print(reduced) +# [[[10., 20.s]] +# [[74., 100.]] +# [[202., 244.]]] + expect( node, - inputs=[x], - outputs=[result], - name="test_regex_full_match_email_domain", + inputs=[data, axes], + outputs=[reduced], + name="test_reduce_sum_square_negative_axes_keepdims_example", ) -``` - -
- -
-match_empty - -```python -node = onnx.helper.make_node( - "RegexFullMatch", - inputs=["X"], - outputs=["Y"], - pattern=r"(\W|^)[\w.\-]{0,25}@(yahoo|gmail)\.com(\W|$)", -) +np.random.seed(0) +data = np.random.uniform(-10, 10, shape).astype(np.float32) +reduced = np.sum(np.square(data), axis=tuple(axes), keepdims=keepdims == 1) -x = np.array([[], []]).astype(object) -result = np.array([[], []]).astype(bool) expect( node, - inputs=[x], - outputs=[result], - name="test_regex_full_match_empty", + inputs=[data, axes], + outputs=[reduced], + name="test_reduce_sum_square_negative_axes_keepdims_random", ) ``` @@ -26269,7 +24393,7 @@ expect( Round takes one input Tensor and rounds the values, element-wise, meaning it finds the nearest integer for each value. - In case of halves, the rule is to round them to the nearest even integer. + In case of halfs, the rule is to round them to the nearest even integer. If input x is integral, +0, -0, NaN, or infinite, x itself is returned. The output tensor has the same shape and type as the input. @@ -26450,7 +24574,7 @@ node = onnx.helper.make_node( a0 = 0.5 a1 = 0.5 window = a0 + a1 * np.cos( - 2 * np.pi * np.arange(0, length, 1, dtype=np.float32) / length + 2 * 3.1415 * np.arange(0, length, 1, dtype=np.float32) / length ) nstfts = 1 + (signal.shape[1] - window.shape[0]) // step @@ -26911,8 +25035,8 @@ expect( ``` When `reduction` is set to some reduction function `f`, the update corresponding to the [i][j] entry is performed as below: ``` - output[indices[i][j]][j] = f(output[indices[i][j]][j], updates[i][j]) if axis = 0, - output[i][indices[i][j]] = f(output[i][indices[i][j]], updates[i][j]) if axis = 1, + output[indices[i][j]][j] += f(output[indices[i][j]][j], updates[i][j]) if axis = 0, + output[i][indices[i][j]] += f(output[i][indices[i][j]], updates[i][j]) if axis = 1, ``` where the `f` is `+`, `*`, `max` or `min` as specified. @@ -28756,16 +26880,16 @@ expect(node, inputs=[x], outputs=[y], name="test_size") Slice uses the `starts`, `ends`, `axes` and `steps` inputs to select a sub-tensor of its input `data` tensor. - An effective `starts[i]`, `ends[i]`, and `steps[i]` must be computed for each `i` + An effective `start[i]`, `end[i]`, and `step[i]` must be computed for each `i` in `[0, ... r-1]` where `r = rank(input)` as follows: If `axes` are omitted, they are set to `[0, ..., r-1]`. If `steps` are omitted, they are set to `[1, ..., 1]` of length `len(starts)` - The effective values are initialized as `start[i] = 0`, `ends[i] = dims[i]` where - `dims` are the dimensions of `input` and `steps[i] = 1`. + The effective values are initialized as `start[i] = 0`, `end[i] = dims[i]` where + `dims` are the dimensions of `input` and `step[i] = `1. - All negative elements of `axes` are made non-negative by adding `r` to them, where + All negative elements of `axes` are made non-negatve by adding `r` to them, where `r =rank(input)`. All negative values in `starts[i]` and `ends[i]` have `dims[axes[i]]` added to them, @@ -28775,10 +26899,10 @@ expect(node, inputs=[x], outputs=[y], name="test_size") The clamping for the adjusted `ends[i]` depends on the sign of `steps[i]` and must accommodate copying 0 through `dims[axes[i]]` elements, so for positive stepping - `ends[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it + `end[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it is clamped to `[-1, dims[axes[i]]-1]`. - Finally, `steps[axes[i]] = steps[i]`. + Finally, `step[axes[i]] = steps[i]`. 
For slicing to the end of a dimension with unknown size, it is recommended to pass in `INT_MAX` when slicing forward and `INT_MIN` when slicing backward.
@@ -29217,7 +27341,7 @@ expect(node, inputs=[x], outputs=[y], name="test_softmax_default_axis")
     * shape(labels): (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk),
         with K >= 1 in case of K-dimensional loss.
 
-    The loss for one sample, l_i, can calculated as follows:
+    The loss for one sample, l_i, can be calculated as follows:
     ```
     l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of classes.
     ```
@@ -31324,7 +29448,7 @@ node = onnx.helper.make_node(
     keepdims=0,
 )
 
-expected_outputs = [[data[:, i] for i in range(data.shape[1])]]
+expected_outputs = [list(data[:, i] for i in range(data.shape[1]))]
 
 expect(
     node,
@@ -31530,103 +29654,6 @@ expect(node, inputs=[x, axes], outputs=[y], name="test_squeeze_negative_axes")
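Returning to the Slice semantics spelled out above, the effective start/end/step computation maps directly onto Python slicing. A hedged sketch under those rules (the helper name is illustrative, not part of any ONNX API):

```python
import numpy as np

def effective_slice(data, starts, ends, axes=None, steps=None):
    r = data.ndim
    # Negative axes are made non-negative by adding r, as described above.
    axes = list(range(r)) if axes is None else [a + r if a < 0 else a for a in axes]
    steps = [1] * len(starts) if steps is None else list(steps)
    sl = [slice(None)] * r
    for a, s, e, st in zip(axes, starts, ends, steps):
        sl[a] = slice(s, e, st)  # NumPy applies the same negative-index and clamping rules
    return data[tuple(sl)]

x = np.arange(12).reshape(3, 4)
assert (effective_slice(x, [0], [-1], axes=[1], steps=[2]) == x[:, 0:-1:2]).all()
```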
-### **StringConcat** - - StringConcat concatenates string tensors elementwise (with NumPy-style broadcasting support) - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Inputs - -
-
X (non-differentiable) : T
-
Tensor to prepend in concatenation
-
Y (non-differentiable) : T
-
Tensor to append in concatenation
-
- -#### Outputs - -
-
Z (non-differentiable) : T
-
Concatenated string tensor
-
- -#### Type Constraints - -
-
T : tensor(string)
-
Inputs and outputs must be UTF-8 strings
-
- - -#### Examples - -
-stringconcat - -```python -node = onnx.helper.make_node( - "StringConcat", - inputs=["x", "y"], - outputs=["result"], -) -x = np.array(["abc", "def"]).astype("object") -y = np.array([".com", ".net"]).astype("object") -result = np.array(["abc.com", "def.net"]).astype("object") - -expect(node, inputs=[x, y], outputs=[result], name="test_string_concat") - -x = np.array(["cat", "dog", "snake"]).astype("object") -y = np.array(["s"]).astype("object") -result = np.array(["cats", "dogs", "snakes"]).astype("object") - -expect( - node, - inputs=[x, y], - outputs=[result], - name="test_string_concat_broadcasting", -) - -x = np.array("cat").astype("object") -y = np.array("s").astype("object") -result = np.array("cats").astype("object") - -expect( - node, - inputs=[x, y], - outputs=[result], - name="test_string_concat_zero_dimensional", -) - -x = np.array(["abc", ""]).astype("object") -y = np.array(["", "abc"]).astype("object") -result = np.array(["abc", "abc"]).astype("object") - -expect( - node, - inputs=[x, y], - outputs=[result], - name="test_string_concat_empty_string", -) - -x = np.array(["的", "中"]).astype("object") -y = np.array(["的", "中"]).astype("object") -result = np.array(["的的", "中中"]).astype("object") - -expect( - node, - inputs=[x, y], - outputs=[result], - name="test_string_concat_utf8", -) -``` - -
- - ### **StringNormalizer** StringNormalization performs string operations for basic cleaning. @@ -31848,226 +29875,6 @@ expect(
-### **StringSplit** - - StringSplit splits a string tensor's elements into substrings based on a delimiter attribute and a maxsplit attribute. - - The first output of this operator is a tensor of strings representing the substrings from splitting each input string on the `delimiter` substring. This tensor has one additional rank compared to the input tensor in order to store the substrings for each input element (where the input tensor is not empty). Note that, in order to ensure the same number of elements are present in the final dimension, this tensor will pad empty strings as illustrated in the examples below. Consecutive delimiters are not grouped together and are deemed to delimit empty strings, except if the `delimiter` is unspecified or is the empty string (""). In the case where the `delimiter` is unspecified or the empty string, consecutive whitespace characters are regarded as a single separator and leading or trailing whitespace is removed in the output. - - The second output tensor represents the number of substrings generated. `maxsplit` can be used to limit the number of splits performed - after the `maxsplit`th split if the string is not fully split, the trailing suffix of input string after the final split point is also added. For elements where fewer splits are possible than specified in `maxsplit`, it has no effect. - -#### Version - -This version of the operator has been available since version 20 of the default ONNX operator set. - -#### Attributes - -
-
delimiter : string
-
Delimiter to split on. If left unset or set to the empty string (""), the input is split on consecutive whitespace.
-
maxsplit : int
-
Maximum number of splits (from left to right). If left unset (or if the number of possible splits are less than maxsplit), it will make as many splits as possible. Note that the maximum possible number of substrings returned with `maxsplit` specified is `maxsplit+1` since the remaining suffix after the `maxsplit`th split is included in the output.
-
- -#### Inputs - -
-
X (non-differentiable) : T1
-
Tensor of strings to split.
-
- -#### Outputs - -
-
Y (non-differentiable) : T2
-
Tensor of substrings representing the outcome of splitting the strings in the input on the delimiter. Note that to ensure the same number of elements are present in the final rank, this tensor will pad any necessary empty strings.
-
Z (non-differentiable) : T3
-
The number of substrings generated for each input element.
-
- -#### Type Constraints - -
-
T1 : tensor(string)
-
The input must be a UTF-8 string tensor
-
T2 : tensor(string)
-
Tensor of substrings.
-
T3 : tensor(int64)
-
The number of substrings generated.
-
- - -#### Examples - -
-basic - -```python -node = onnx.helper.make_node( - "StringSplit", - inputs=["x"], - outputs=["substrings", "length"], - delimiter=".", - maxsplit=None, -) - -x = np.array(["abc.com", "def.net"]).astype(object) - -substrings = np.array([["abc", "com"], ["def", "net"]]).astype(object) - -length = np.array([2, 2], dtype=np.int64) - -expect( - node, - inputs=[x], - outputs=[substrings, length], - name="test_string_split_basic", -) -``` - -
- - -
-consecutive_delimiters - -```python -node = onnx.helper.make_node( - "StringSplit", - inputs=["x"], - outputs=["substrings", "length"], - delimiter="-", - maxsplit=None, -) - -x = np.array(["o-n-n--x-", "o-n----nx"]).astype(object) - -substrings = np.array( - [["o", "n", "n", "", "x", ""], ["o", "n", "", "", "", "nx"]] -).astype(object) - -length = np.array([6, 6], dtype=np.int64) - -expect( - node, - inputs=[x], - outputs=[substrings, length], - name="test_string_split_consecutive_delimiters", -) -``` - -
- - -
-empty_string_delimiter - -```python -for delimiter, test_name in ( - ("", "test_string_split_empty_string_delimiter"), - (None, "test_string_split_no_delimiter"), -): - node = onnx.helper.make_node( - "StringSplit", - inputs=["x"], - outputs=["substrings", "length"], - delimiter=delimiter, - maxsplit=None, - ) - - x = np.array( - ["hello world !", " hello world !", " hello world ! "] - ).astype(object) - - substrings = np.array( - [ - ["hello", "world", "!"], - ["hello", "world", "!"], - ["hello", "world", "!"], - ] - ).astype(object) - - length = np.array([3, 3, 3], dtype=np.int64) - - expect( - node, - inputs=[x], - outputs=[substrings, length], - name=test_name, - ) -``` - -
- - -
-empty_string_split - -```python -node = onnx.helper.make_node( - "StringSplit", - inputs=["x"], - outputs=["substrings", "length"], - delimiter=None, - maxsplit=None, -) - -x = np.array([]).astype(object) - -substrings = np.array([]).astype(object).reshape(0, 0) - -length = np.array([], dtype=np.int64) - -expect( - node, - inputs=[x], - outputs=[substrings, length], - name="test_string_split_empty_tensor", - output_type_protos=[ - onnx.helper.make_tensor_type_proto(onnx.TensorProto.STRING, (0, None)), - None, - ], -) -``` - -
- - -
-maxsplit - -```python -node = onnx.helper.make_node( - "StringSplit", - inputs=["x"], - outputs=["substrings", "length"], - maxsplit=2, -) - -x = np.array( - [["hello world", "def.net"], ["o n n x", "the quick brown fox"]] -).astype(object) - -substrings = np.array( - [ - [["hello", "world", ""], ["def.net", "", ""]], - [["o", "n", "n x"], ["the", "quick", "brown fox"]], - ] -).astype(object) - -length = np.array([[2, 1], [3, 3]], np.int64) - -expect( - node, - inputs=[x], - outputs=[substrings, length], - name="test_string_split_maxsplit", -) -``` - -
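As a point of reference for the `maxsplit` convention used above, Python's built-in `str.split` behaves the same way:

```python
# At most maxsplit splits are made; the remaining suffix stays intact.
assert "a.b.c.d".split(".", 2) == ["a", "b", "c.d"]
# With no delimiter given, whitespace runs collapse and edges are trimmed,
# matching the unspecified-delimiter behavior described above.
assert "  hello   world ".split() == ["hello", "world"]
```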


### **Sub**

  Performs element-wise binary subtraction (with Numpy-style broadcasting support).
@@ -33713,8 +31520,8 @@ expect(node, inputs=[x, k], outputs=[y], name="test_triu_zero")
   This operator returns the unique values or sliced unique subtensors of the input tensor and three optional outputs.
   The first output tensor 'Y' contains all unique values or subtensors of the input.
-  The second optional output tensor 'indices' contains indices of 'Y' elements' first occurrence in 'X'.
-  The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'.
+  The second optional output tensor 'indices' contains indices of 'Y' elements' first occurrence in 'X'.
+  The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'.
   The fourth optional output tensor 'counts' contains the count of each element of 'Y' in the input.
   Outputs are either sorted in ascending order or optionally in the order of the first occurrence of the values in the input.
@@ -33832,7 +31639,7 @@ This version of the operator has been available since version 11 of the default
Y (non-differentiable) : T
A tensor of the same type as 'X' containing all the unique values or subtensors sliced along a provided 'axis' in 'X', either sorted or maintained in the same order they occur in input 'X'
indices (optional, non-differentiable) : tensor(int64)
-
A 1-D INT64 tensor containing indices of 'Y' elements' first occurrence in 'X'. When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. When 'axis' is not provided, it contains indices to values in the flattened input tensor.
+
A 1-D INT64 tensor containing indices of 'Y' elements' first occurrence in 'X'. When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. When 'axis' is not provided, it contains indices to values in the flattened input tensor.
inverse_indices (optional, non-differentiable) : tensor(int64)
A 1-D INT64 tensor containing, for elements of 'X', its corresponding indices in 'Y'. When 'axis' is provided, it contains indices to subtensors in output 'Y' on the 'axis'. When 'axis' is not provided, it contains indices to values in output 'Y'.
counts (optional, non-differentiable) : tensor(int64)
@@ -33867,9 +31674,9 @@ y, indices, inverse_indices, counts = np.unique(x, True, True, True) # prepare index mapping from sorted to unsorted argsorted_indices = np.argsort(indices) -inverse_indices_map = dict( - zip(argsorted_indices, np.arange(len(argsorted_indices))) -) +inverse_indices_map = { + i: si for i, si in zip(argsorted_indices, np.arange(len(argsorted_indices))) +} indices = indices[argsorted_indices] y = np.take(x, indices, axis=0) diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc index eac0e15496a..0825ceb7d31 100644 --- a/onnx/defs/nn/defs.cc +++ b/onnx/defs/nn/defs.cc @@ -2516,7 +2516,9 @@ static const char* LayerNormalization_ver17_doc = R"DOC( Let `d[i]` indicate the i-th dimension of `X`. If `X`'s shape is `[d[0], ..., d[axis-1], d[axis], ..., d[rank-1]]`, the shape of `Mean` and `InvStdDev` is `[d[0], ..., d[axis-1], 1, ..., 1]`. - `Y` and `X` have the same shape. + `Y` and `X` have the same shape. This operator supports unidirectional broadcasting + (tensors `Scale` and `B` should be unidirectional broadcastable to tensor `X`); + for more details please check [the doc](Broadcasting.md). )DOC"; bool BuildContextDependentFunctionBodyLayerNormalization(