diff --git a/environment.yml b/environment.yml
index 27d7c52a..75242cf9 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,6 +12,7 @@ dependencies:
   - numpydoc
   - onnx>=1.13.0
   - onnxruntime>=1.13.1
+  - pandoc  # ONNX docstrings in opset generation
   - pip
   - pre-commit
   - pytest>=6
diff --git a/src/generate.py b/src/generate.py
index 133f6767..3bf8fbe2 100644
--- a/src/generate.py
+++ b/src/generate.py
@@ -91,8 +91,8 @@
     ("If", "else_branch", ("Callable[[], Iterable[Var]]", "AttrGraph")),
 ]
 
-with importlib.resources.path("spox", ".") as path:
-    _TEMPLATE_DIR = path.parent / "templates"
+with importlib.resources.path("spox", ".") as _resource_path:
+    _TEMPLATE_DIR = _resource_path.parent / "templates"
 
 
 @dataclass
@@ -235,20 +235,74 @@ def get_constructor_return(schema: onnx.defs.OpSchema) -> str:
     return "Var"
 
 
-def format_github_markdown(doc: str) -> str:
+_PANDOC_SEP = "\U0001f6a7"  # U+1F6A7 CONSTRUCTION SIGN
+_PANDOC_GFM_TO_RST_CACHE: Dict[str, str] = {}  # stripped GFM snippet -> converted RST
+
+
+def _pandoc_gfm_to_rst_run(*args: str) -> Tuple[str, ...]:
+    """Convert GFM snippets to RST with one pandoc call and cache each result."""
+    if not args:
+        return ()
+    import pandoc  # imported lazily: only reached when there is something to convert
+    # Join the snippets around a sentinel paragraph so the output can be split back.
+    sep = f"\n\n{_PANDOC_SEP}{_PANDOC_SEP}\n\n"
+    acc = sep.join([_PANDOC_SEP] + list(args) + [_PANDOC_SEP])
+    acc_results = pandoc.write(pandoc.read(acc, format="gfm"), format="rst")
+    _, *results, _ = acc_results.split(sep)
+    if len(results) != len(args) or any(_PANDOC_SEP in r for r in results):
+        raise ValueError(
+            f"Pandoc separator character '{_PANDOC_SEP}' found in a result (bad convert)."
+        )
+    for arg, result in zip(args, results):
+        _PANDOC_GFM_TO_RST_CACHE[arg] = result + "\n"
+    return tuple(results)
+
+
+def _pandoc_gfm_to_rst(*args: str) -> Tuple[str, ...]:
+    """Convert GFM snippets to RST, invoking pandoc only for uncached ones.
+
+    Each argument is stripped first; empty snippets map to ``""``. Results
+    come from ``_PANDOC_GFM_TO_RST_CACHE`` and are returned in argument order.
+
+    Raises ``ValueError`` if an argument contains ``_PANDOC_SEP`` or if a
+    snippet is still missing from the cache after the pandoc run.
+    """
+    args = tuple(arg.strip() for arg in args)
+    if any(_PANDOC_SEP in arg for arg in args):
+        raise ValueError(
+            f"Pandoc separator character '{_PANDOC_SEP}' cannot appear in any of the arguments."
+        )
+    # dict.fromkeys drops repeated snippets (keeping order) so each is sent once.
+    pending = [
+        arg for arg in dict.fromkeys(args) if arg and arg not in _PANDOC_GFM_TO_RST_CACHE
+    ]
+    _pandoc_gfm_to_rst_run(*pending)  # stores its results in the cache
+    try:
+        return tuple(_PANDOC_GFM_TO_RST_CACHE[arg] if arg else "" for arg in args)
+    except KeyError:
+        raise ValueError("Missing processed pandoc result.") from None
+
+
+def pandoc_gfm_to_rst(doc: str) -> str:
+    """Convert a single GitHub-flavoured Markdown snippet to RST."""
+    return _pandoc_gfm_to_rst(doc)[0]
+
+
+def format_github_markdown(doc: str, *, to_batch: Optional[List[str]] = None) -> str:
     """Jinja filter. Makes some attempt at fixing "Markdown" into RST."""
-    lines = [line.replace("\t", " " * 4).rstrip() for line in doc.splitlines()]
-    lines = [line for line in lines if line.rstrip()]
-    space_lcm = 0
-    while lines and all(line[: space_lcm + 1].isspace() for line in lines):
-        space_lcm += 1
-    lines = [line[space_lcm:] for line in lines]
-    doc = "\n".join(lines).strip()
-    doc = doc.replace("<br>", "\n\n")
-    doc = re.sub(r"<i>(.*)</i>", r"`\1`", doc)
-    doc = re.sub(r"<b>(.*)</b>", r"**\1**", doc)
-    doc = re.sub(r"\[(.+)\]\((.+)\)", r"\1 (\2)", doc)
-    return doc
+    # Sometimes Tensor<T> is used in the docs (~17 instances at 1.13)
+    # and is treated as invalid HTML tags by pandoc.
+    doc = doc.replace("<T>", "&lt;T&gt;")
+    # A raw <br> is lost on conversion and fuses sentences ("map.One and ...");
+    # turn it into a paragraph break first, as the previous implementation did.
+    doc = doc.replace("<br>", "\n\n")
+    # Point relative *.md links to onnx/docs (lazy text, escaped dot: one match per link).
+    rel = "https://github.com/onnx/onnx/blob/main/docs"
+    doc = re.sub(r"\[(.*?)\]\((\w+\.md)\)", rf"[\1]({rel}/\2)", doc)
+    if to_batch is not None:
+        to_batch.append(doc)
+        return doc  # batch mode: only queued, not converted here
+    return pandoc_gfm_to_rst(doc).rstrip()
 
 
 def is_variadic(param: onnx.defs.OpSchema.FormalParameter) -> bool:
@@ -317,6 +371,22 @@ def write_schemas_code(
 
     built_schemas: Set[onnx.defs.OpSchema] = set()
 
+    pandoc_batch: List[str] = []
+    for schema in schemas:
+        if schema in inherited_schemas:
+            continue
+        todo = [schema.doc] + [
+            p.description
+            for p in (
+                list(schema.inputs)
+                + list(schema.outputs)
+                + list(schema.attributes.values())
+            )
+        ]
+        for doc in todo:
+            format_github_markdown(doc, to_batch=pandoc_batch)
+    _pandoc_gfm_to_rst(*pandoc_batch)
+
     # Operator classes
     for schema in sorted(schemas, key=lambda s: s.name):
         if schema in inherited_schemas:
@@ -548,6 +618,7 @@ def main(
 
 
 if __name__ == "__main__":
+    gen_all_docstrings = True
     ai_onnx_v17_schemas, ai_onnx_v17_module = main(
         "ai.onnx",
         17,
@@ -558,6 +629,7 @@ def main(
         subgraphs_solutions=V16_SUBGRAPH_SOLUTIONS,
         attr_type_overrides=DEFAULT_ATTR_TYPE_OVERRIDES,
         allow_extra_constructor_arguments=["Split"],
+        gen_docstrings=gen_all_docstrings,
     )
     ai_onnx_ml_v3_schemas, ai_onnx_ml_v3_module = main(
         "ai.onnx.ml",
@@ -575,4 +647,5 @@ def main(
             "TreeEnsembleClassifier": "treeensembleclassifier3",
             "TreeEnsembleRegressor": "treeensembleregressor3",
         },
+        gen_docstrings=gen_all_docstrings,
     )
diff --git a/src/spox/opset/ai/onnx/ml/v3.py b/src/spox/opset/ai/onnx/ml/v3.py
index 46dc54ad..6fbf64e1 100644
--- a/src/spox/opset/ai/onnx/ml/v3.py
+++ b/src/spox/opset/ai/onnx/ml/v3.py
@@ -615,10 +615,8 @@ def array_feature_extractor(
     Y: Var,
 ) -> Var:
     r"""
-    Select elements of the input tensor based on the indices passed.
-
-
-        The indices are applied to the last axes of the tensor.
+    Select elements of the input tensor based on the indices passed. The
+    indices are applied to the last axes of the tensor.
 
     Parameters
     ==========
@@ -657,7 +655,8 @@ def binarizer(
     threshold: float = 0.0,
 ) -> Var:
     r"""
-    Maps the values of the input tensor to either 0 or 1, element-wise, based on the outcome of a comparison against a threshold value.
+    Maps the values of the input tensor to either 0 or 1, element-wise,
+    based on the outcome of a comparison against a threshold value.
 
     Parameters
     ==========
@@ -699,13 +698,10 @@ def cast_map(
     max_map: int = 1,
 ) -> Var:
     r"""
-    Converts a map to a tensor.
-
-    The map key must be an int64 and the values will be ordered
-        in ascending order based on this key.
-
-    The operator supports dense packing or sparse packing.
-        If using sparse packing, the key cannot exceed the max_map-1 value.
+    Converts a map to a tensor.The map key must be an int64 and the values
+    will be ordered in ascending order based on this key.The operator
+    supports dense packing or sparse packing. If using sparse packing, the
+    key cannot exceed the max_map-1 value.
 
     Parameters
     ==========
@@ -714,21 +710,24 @@ def cast_map(
         The input map that is to be cast to a tensor
     cast_to
         Attribute.
-        A string indicating the desired element type of the output tensor, one of 'TO_FLOAT', 'TO_STRING', 'TO_INT64'.
+        A string indicating the desired element type of the output tensor, one
+        of 'TO_FLOAT', 'TO_STRING', 'TO_INT64'.
     map_form
         Attribute.
-        Indicates whether to only output as many values as are in the input (dense), or position the input based on using the key of the map as the index of the output (sparse).
-
-        One of 'DENSE', 'SPARSE'.
+        Indicates whether to only output as many values as are in the input
+        (dense), or position the input based on using the key of the map as the
+        index of the output (sparse).One of 'DENSE', 'SPARSE'.
     max_map
         Attribute.
-        If the value of map_form is 'SPARSE,' this attribute indicates the total length of the output tensor.
+        If the value of map_form is 'SPARSE,' this attribute indicates the total
+        length of the output tensor.
 
     Returns
     =======
     Y : Var
         Type T2.
-        A tensor representing the same data as the input map, ordered by their keys
+        A tensor representing the same data as the input map, ordered by their
+        keys
 
     Notes
     =====
@@ -759,20 +758,14 @@ def category_mapper(
     default_string: str = "_Unused",
 ) -> Var:
     r"""
-    Converts strings to integers and vice versa.
-
-
-        Two sequences of equal length are used to map between integers and strings,
-        with strings and integers at the same index detailing the mapping.
-
-
-        Each operator converts either integers to strings or strings to integers, depending
-        on which default value attribute is provided. Only one default value attribute
-        should be defined.
-
-
-        If the string default value is set, it will convert integers to strings.
-        If the int default value is set, it will convert strings to integers.
+    Converts strings to integers and vice versa. Two sequences of equal
+    length are used to map between integers and strings, with strings and
+    integers at the same index detailing the mapping. Each operator converts
+    either integers to strings or strings to integers, depending on which
+    default value attribute is provided. Only one default value attribute
+    should be defined. If the string default value is set, it will convert
+    integers to strings. If the int default value is set, it will convert
+    strings to integers.
 
     Parameters
     ==========
@@ -781,26 +774,27 @@ def category_mapper(
         Input data
     cats_int64s
         Attribute.
-        The integers of the map. This sequence must be the same length as the 'cats_strings' sequence.
+        The integers of the map. This sequence must be the same length as the
+        'cats_strings' sequence.
     cats_strings
         Attribute.
-        The strings of the map. This sequence must be the same length as the 'cats_int64s' sequence
+        The strings of the map. This sequence must be the same length as the
+        'cats_int64s' sequence
     default_int64
         Attribute.
-        An integer to use when an input string value is not found in the map.
-
-        One and only one of the 'default_*' attributes must be defined.
+        An integer to use when an input string value is not found in the map.One
+        and only one of the 'default_*' attributes must be defined.
     default_string
         Attribute.
-        A string to use when an input integer value is not found in the map.
-
-        One and only one of the 'default_*' attributes must be defined.
+        A string to use when an input integer value is not found in the map.One
+        and only one of the 'default_*' attributes must be defined.
 
     Returns
     =======
     Y : Var
         Type T2.
-        Output data. If strings are input, the output values are integers, and vice versa.
+        Output data. If strings are input, the output values are integers, and
+        vice versa.
 
     Notes
     =====
@@ -830,23 +824,20 @@ def dict_vectorizer(
     string_vocabulary: Optional[Iterable[str]] = None,
 ) -> Var:
     r"""
-    Uses an index mapping to convert a dictionary to an array.
-
-
-        Given a dictionary, each key is looked up in the vocabulary attribute corresponding to
-        the key type. The index into the vocabulary array at which the key is found is then
-        used to index the output 1-D tensor 'Y' and insert into it the value found in the dictionary 'X'.
-
-
-        The key type of the input map must correspond to the element type of the defined vocabulary attribute.
-        Therefore, the output array will be equal in length to the index mapping vector parameter.
-        All keys in the input dictionary must be present in the index mapping vector.
-        For each item in the input dictionary, insert its value in the output array.
-        Any keys not present in the input dictionary, will be zero in the output array.
-
-
-        For example: if the ``string_vocabulary`` parameter is set to ``["a", "c", "b", "z"]``,
-        then an input of ``{"a": 4, "c": 8}`` will produce an output of ``[4, 8, 0, 0]``.
+    Uses an index mapping to convert a dictionary to an array. Given a
+    dictionary, each key is looked up in the vocabulary attribute
+    corresponding to the key type. The index into the vocabulary array at
+    which the key is found is then used to index the output 1-D tensor 'Y'
+    and insert into it the value found in the dictionary 'X'. The key type
+    of the input map must correspond to the element type of the defined
+    vocabulary attribute. Therefore, the output array will be equal in
+    length to the index mapping vector parameter. All keys in the input
+    dictionary must be present in the index mapping vector. For each item in
+    the input dictionary, insert its value in the output array. Any keys not
+    present in the input dictionary, will be zero in the output array. For
+    example: if the ``string_vocabulary`` parameter is set to
+    ``["a", "c", "b", "z"]``, then an input of ``{"a": 4, "c": 8}`` will
+    produce an output of ``[4, 8, 0, 0]``.
 
     Parameters
     ==========
@@ -855,14 +846,12 @@ def dict_vectorizer(
         A dictionary.
     int64_vocabulary
         Attribute.
-        An integer vocabulary array.
-
-        One and only one of the vocabularies must be defined.
+        An integer vocabulary array.One and only one of the vocabularies must be
+        defined.
     string_vocabulary
         Attribute.
-        A string vocabulary array.
-
-        One and only one of the vocabularies must be defined.
+        A string vocabulary array.One and only one of the vocabularies must be
+        defined.
 
     Returns
     =======
@@ -899,14 +888,11 @@ def feature_vectorizer(
     inputdimensions: Optional[Iterable[int]] = None,
 ) -> Var:
     r"""
-    Concatenates input tensors into one continuous output.
-
-
-        All input shapes are 2-D and are concatenated along the second dimention. 1-D tensors are treated as [1,C].
-        Inputs are copied to the output maintaining the order of the input arguments.
-
-
-        All inputs must be integers or floats, while the output will be all floating point values.
+    Concatenates input tensors into one continuous output. All input shapes
+    are 2-D and are concatenated along the second dimention. 1-D tensors are
+    treated as [1,C]. Inputs are copied to the output maintaining the order
+    of the input arguments. All inputs must be integers or floats, while the
+    output will be all floating point values.
 
     Parameters
     ==========
@@ -951,22 +937,20 @@ def imputer(
     replaced_value_int64: int = 0,
 ) -> Var:
     r"""
-    Replaces inputs that equal one value with another, leaving all other elements alone.
-
-
-        This operator is typically used to replace missing values in situations where they have a canonical
-        representation, such as -1, 0, NaN, or some extreme value.
-
-
-        One and only one of imputed_value_floats or imputed_value_int64s should be defined -- floats if the input tensor
-        holds floats, integers if the input tensor holds integers. The imputed values must all fit within the
-        width of the tensor element type. One and only one of the replaced_value_float or replaced_value_int64 should be defined,
-        which one depends on whether floats or integers are being processed.
-
-
-        The imputed_value attribute length can be 1 element, or it can have one element per input feature.
-
-    In other words, if the input tensor has the shape [*,F], then the length of the attribute array may be 1 or F. If it is 1, then it is broadcast along the last dimension and applied to each feature.
+    Replaces inputs that equal one value with another, leaving all other
+    elements alone. This operator is typically used to replace missing
+    values in situations where they have a canonical representation, such as
+    -1, 0, NaN, or some extreme value. One and only one of
+    imputed_value_floats or imputed_value_int64s should be defined -- floats
+    if the input tensor holds floats, integers if the input tensor holds
+    integers. The imputed values must all fit within the width of the tensor
+    element type. One and only one of the replaced_value_float or
+    replaced_value_int64 should be defined, which one depends on whether
+    floats or integers are being processed. The imputed_value attribute
+    length can be 1 element, or it can have one element per input feature.In
+    other words, if the input tensor has the shape [*,F], then the length of
+    the attribute array may be 1 or F. If it is 1, then it is broadcast
+    along the last dimension and applied to each feature.
 
     Parameters
     ==========
@@ -1030,33 +1014,24 @@ def label_encoder(
     values_strings: Optional[Iterable[str]] = None,
 ) -> Var:
     r"""
-    Maps each element in the input tensor to another value.
-
-
-        The mapping is determined by the two parallel attributes, 'keys_*' and
-        'values_*' attribute. The i-th value in the specified 'keys_*' attribute
-        would be mapped to the i-th value in the specified 'values_*' attribute. It
-        implies that input's element type and the element type of the specified
-        'keys_*' should be identical while the output type is identical to the
-        specified 'values_*' attribute. If an input element can not be found in the
-        specified 'keys_*' attribute, the 'default_*' that matches the specified
-        'values_*' attribute may be used as its output value.
-
-
-        Let's consider an example which maps a string tensor to an integer tensor.
-        Assume and 'keys_strings' is ["Amy", "Sally"], 'values_int64s' is [5, 6],
-        and 'default_int64' is '-1'.  The input ["Dori", "Amy", "Amy", "Sally",
-        "Sally"] would be mapped to [-1, 5, 5, 6, 6].
-
-
-        Since this operator is an one-to-one mapping, its input and output shapes
-        are the same. Notice that only one of 'keys_*'/'values_*' can be set.
-
-
-        For key look-up, bit-wise comparison is used so even a float NaN can be
-        mapped to a value in 'values_*' attribute.
-
-
+    Maps each element in the input tensor to another value. The mapping is
+    determined by the two parallel attributes, 'keys\_\ *' and 'values\_*'
+    attribute. The i-th value in the specified 'keys\_\ *' attribute would
+    be mapped to the i-th value in the specified 'values\_*' attribute. It
+    implies that input's element type and the element type of the specified
+    'keys\_\ *' should be identical while the output type is identical to
+    the specified 'values\_*' attribute. If an input element can not be
+    found in the specified 'keys\_\ *' attribute, the 'default\_*' that
+    matches the specified 'values\_\ *' attribute may be used as its output
+    value. Let's consider an example which maps a string tensor to an
+    integer tensor. Assume and 'keys_strings' is ["Amy", "Sally"],
+    'values_int64s' is [5, 6], and 'default_int64' is '-1'. The input
+    ["Dori", "Amy", "Amy", "Sally", "Sally"] would be mapped to [-1, 5, 5,
+    6, 6]. Since this operator is an one-to-one mapping, its input and
+    output shapes are the same. Notice that only one of
+    'keys\_*'/'values\_\ *' can be set. For key look-up, bit-wise comparison
+    is used so even a float NaN can be mapped to a value in 'values\_*'
+    attribute.
 
     Parameters
     ==========
@@ -1147,10 +1122,12 @@ def linear_classifier(
         Data to be classified.
     classlabels_ints
         Attribute.
-        Class labels when using integer labels. One and only one 'classlabels' attribute must be defined.
+        Class labels when using integer labels. One and only one 'classlabels'
+        attribute must be defined.
     classlabels_strings
         Attribute.
-        Class labels when using string labels. One and only one 'classlabels' attribute must be defined.
+        Class labels when using string labels. One and only one 'classlabels'
+        attribute must be defined.
     coefficients
         Attribute.
         A collection of weights of the model(s).
@@ -1162,9 +1139,8 @@ def linear_classifier(
         Indicates whether to do OvR or multinomial (0=OvR is the default).
     post_transform
         Attribute.
-        Indicates the transform to apply to the scores vector.
-
-        One of 'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
+        Indicates the transform to apply to the scores vector.One of 'NONE,'
+        'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
 
     Returns
     =======
@@ -1211,18 +1187,13 @@ def linear_regressor(
     targets: int = 1,
 ) -> Var:
     r"""
-    Generalized linear regression evaluation.
-
-
-        If targets is set to 1 (default) then univariate regression is performed.
-
-
-        If targets is set to M then M sets of coefficients must be passed in as a sequence
-        and M results will be output for each input n in N.
-
-
-        The coefficients array is of length n, and the coefficients for each target are contiguous.
-        Intercepts are optional but if provided must match the number of targets.
+    Generalized linear regression evaluation. If targets is set to 1
+    (default) then univariate regression is performed. If targets is set to
+    M then M sets of coefficients must be passed in as a sequence and M
+    results will be output for each input n in N. The coefficients array is
+    of length n, and the coefficients for each target are contiguous.
+    Intercepts are optional but if provided must match the number of
+    targets.
 
     Parameters
     ==========
@@ -1237,9 +1208,8 @@ def linear_regressor(
         Weights of the intercepts, if used.
     post_transform
         Attribute.
-        Indicates the transform to apply to the regression output vector.
-
-        One of 'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
+        Indicates the transform to apply to the regression output vector.One of
+        'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
     targets
         Attribute.
         The total number of regression targets, 1 if not defined.
@@ -1276,28 +1246,13 @@ def normalizer(
     norm: str = "MAX",
 ) -> Var:
     r"""
-    Normalize the input.  There are three normalization modes, which have the corresponding formulas,
-        defined using element-wise infix operators '/' and '^' and tensor-wide functions 'max' and 'sum':
-
-
-
-
-
-        Max: Y = X / max(X)
-
-
-        L1:  Y = X / sum(X)
-
-
-        L2:  Y = sqrt(X^2 / sum(X^2)}
-
-
-        In all modes, if the divisor is zero, Y == X.
-
-
-
-        For batches, that is, [N,C] tensors, normalization is done along the C axis. In other words, each row
-        of the batch is normalized independently.
+    Normalize the input. There are three normalization modes, which have the
+    corresponding formulas, defined using element-wise infix operators '/'
+    and '^' and tensor-wide functions 'max' and 'sum': Max: Y = X / max(X)
+    L1: Y = X / sum(X) L2: Y = sqrt(X^2 / sum(X^2)} In all modes, if the
+    divisor is zero, Y == X. For batches, that is, [N,C] tensors,
+    normalization is done along the C axis. In other words, each row of the
+    batch is normalized independently.
 
     Parameters
     ==========
@@ -1339,20 +1294,15 @@ def one_hot_encoder(
     zeros: int = 1,
 ) -> Var:
     r"""
-    Replace each input element with an array of ones and zeros, where a single
-        one is placed at the index of the category that was passed in. The total category count
-        will determine the size of the extra dimension of the output array Y.
-
-
-        For example, if we pass a tensor with a single value of 4, and a category count of 8,
-        the output will be a tensor with ``[0,0,0,0,1,0,0,0]``.
-
-
-        This operator assumes every input feature is from the same set of categories.
-
-
-        If the input is a tensor of float, int32, or double, the data will be cast
-        to integers and the cats_int64s category list will be used for the lookups.
+    Replace each input element with an array of ones and zeros, where a
+    single one is placed at the index of the category that was passed in.
+    The total category count will determine the size of the extra dimension
+    of the output array Y. For example, if we pass a tensor with a single
+    value of 4, and a category count of 8, the output will be a tensor with
+    ``[0,0,0,0,1,0,0,0]``. This operator assumes every input feature is from
+    the same set of categories. If the input is a tensor of float, int32, or
+    double, the data will be cast to integers and the cats_int64s category
+    list will be used for the lookups.
 
     Parameters
     ==========
@@ -1361,17 +1311,16 @@ def one_hot_encoder(
         Data to be encoded.
     cats_int64s
         Attribute.
-        List of categories, ints.
-
-        One and only one of the 'cats_*' attributes must be defined.
+        List of categories, ints.One and only one of the 'cats_*' attributes
+        must be defined.
     cats_strings
         Attribute.
-        List of categories, strings.
-
-        One and only one of the 'cats_*' attributes must be defined.
+        List of categories, strings.One and only one of the 'cats_*' attributes
+        must be defined.
     zeros
         Attribute.
-        If true and category is not present, will return all zeros; if false and a category if not found, the operator will fail.
+        If true and category is not present, will return all zeros; if false and
+        a category if not found, the operator will fail.
 
     Returns
     =======
@@ -1423,36 +1372,34 @@ def svmclassifier(
         Data to be classified.
     classlabels_ints
         Attribute.
-        Class labels if using integer labels.
-
-        One and only one of the 'classlabels_*' attributes must be defined.
+        Class labels if using integer labels.One and only one of the
+        'classlabels_*' attributes must be defined.
     classlabels_strings
         Attribute.
-        Class labels if using string labels.
-
-        One and only one of the 'classlabels_*' attributes must be defined.
+        Class labels if using string labels.One and only one of the
+        'classlabels_*' attributes must be defined.
     coefficients
         Attribute.
 
     kernel_params
         Attribute.
-        List of 3 elements containing gamma, coef0, and degree, in that order. Zero if unused for the kernel.
+        List of 3 elements containing gamma, coef0, and degree, in that order.
+        Zero if unused for the kernel.
     kernel_type
         Attribute.
         The kernel type, one of 'LINEAR,' 'POLY,' 'RBF,' 'SIGMOID'.
     post_transform
         Attribute.
-        Indicates the transform to apply to the score.
-
-        One of 'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
+        Indicates the transform to apply to the score. One of 'NONE,' 'SOFTMAX,'
+        'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
     prob_a
         Attribute.
         First set of probability coefficients.
     prob_b
         Attribute.
-        Second set of probability coefficients. This array must be same size as prob_a.
-
-        If these are provided then output Z are probability estimates, otherwise they are raw scores.
+        Second set of probability coefficients. This array must be same size as
+        prob_a.If these are provided then output Z are probability estimates,
+        otherwise they are raw scores.
     rho
         Attribute.
 
@@ -1470,7 +1417,9 @@ def svmclassifier(
         Classification outputs (one class per example).
     Z : Var
         Type tensor(float).
-        Class scores (one per class per example), if prob_a and prob_b are provided they are probabilities for each class, otherwise they are raw scores.
+        Class scores (one per class per example), if prob_a and prob_b are
+        provided they are probabilities for each class, otherwise they are raw
+        scores.
 
     Notes
     =====
@@ -1523,7 +1472,8 @@ def svmregressor(
     support_vectors: Optional[Iterable[float]] = None,
 ) -> Var:
     r"""
-    Support Vector Machine regression prediction and one-class SVM anomaly detection.
+    Support Vector Machine regression prediction and one-class SVM anomaly
+    detection.
 
     Parameters
     ==========
@@ -1535,7 +1485,8 @@ def svmregressor(
         Support vector coefficients.
     kernel_params
         Attribute.
-        List of 3 elements containing gamma, coef0, and degree, in that order. Zero if unused for the kernel.
+        List of 3 elements containing gamma, coef0, and degree, in that order.
+        Zero if unused for the kernel.
     kernel_type
         Attribute.
         The kernel type, one of 'LINEAR,' 'POLY,' 'RBF,' 'SIGMOID'.
@@ -1547,9 +1498,8 @@ def svmregressor(
         Flag indicating whether the regression is a one-class SVM or not.
     post_transform
         Attribute.
-        Indicates the transform to apply to the score.
-
-        One of 'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT.'
+        Indicates the transform to apply to the score. One of 'NONE,' 'SOFTMAX,'
+        'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT.'
     rho
         Attribute.
 
@@ -1598,7 +1548,8 @@ def scaler(
     scale: Optional[Iterable[float]] = None,
 ) -> Var:
     r"""
-    Rescale input data, for example to standardize features by removing the mean and scaling to unit variance.
+    Rescale input data, for example to standardize features by removing the
+    mean and scaling to unit variance.
 
     Parameters
     ==========
@@ -1607,16 +1558,14 @@ def scaler(
         Data to be scaled.
     offset
         Attribute.
-        First, offset by this.
-
-        Can be length of features in an [N,F] tensor or length 1, in which case it applies to all features, regardless of dimension count.
+        First, offset by this.Can be length of features in an [N,F] tensor or
+        length 1, in which case it applies to all features, regardless of
+        dimension count.
     scale
         Attribute.
-        Second, multiply by this.
-
-        Can be length of features in an [N,F] tensor or length 1, in which case it applies to all features, regardless of dimension count.
-
-        Must be same length as 'offset'
+        Second, multiply by this.Can be length of features in an [N,F] tensor or
+        length 1, in which case it applies to all features, regardless of
+        dimension count.Must be same length as 'offset'
 
     Returns
     =======
@@ -1669,22 +1618,15 @@ def tree_ensemble_classifier(
 ) -> Tuple[Var, Var]:
     r"""
     Tree Ensemble classifier. Returns the top class for each of N inputs.
-
-
-        The attributes named 'nodes_X' form a sequence of tuples, associated by
-        index into the sequences, which must all be of equal length. These tuples
-        define the nodes.
-
-
-        Similarly, all fields prefixed with 'class_' are tuples of votes at the leaves.
-        A leaf may have multiple votes, where each vote is weighted by
-        the associated class_weights index.
-
-
-        One and only one of classlabels_strings or classlabels_int64s
-        will be defined. The class_ids are indices into this list.
-        All fields ending with `_as_tensor` can be used instead of the
-        same parameter without the suffix if the element type is double and not float.
+    The attributes named 'nodes_X' form a sequence of tuples, associated by
+    index into the sequences, which must all be of equal length. These
+    tuples define the nodes. Similarly, all fields prefixed with 'class_'
+    are tuples of votes at the leaves. A leaf may have multiple votes, where
+    each vote is weighted by the associated class_weights index. One and
+    only one of classlabels_strings or classlabels_int64s will be defined.
+    The class_ids are indices into this list. All fields ending with
+    \_as_tensor can be used instead of the same parameter without the suffix
+    if the element type is double and not float.
 
     Parameters
     ==========
@@ -1693,10 +1635,12 @@ def tree_ensemble_classifier(
         Input of shape [N,F]
     base_values
         Attribute.
-        Base values for classification, added to final class score; the size must be the same as the classes or can be left unassigned (assumed 0)
+        Base values for classification, added to final class score; the size
+        must be the same as the classes or can be left unassigned (assumed 0)
     base_values_as_tensor
         Attribute.
-        Base values for classification, added to final class score; the size must be the same as the classes or can be left unassigned (assumed 0)
+        Base values for classification, added to final class score; the size
+        must be the same as the classes or can be left unassigned (assumed 0)
     class_ids
         Attribute.
         The index of the class list that each weight is for.
@@ -1714,14 +1658,12 @@ def tree_ensemble_classifier(
         The weight for the class in class_id.
     classlabels_int64s
         Attribute.
-        Class labels if using integer labels.
-
-        One and only one of the 'classlabels_*' attributes must be defined.
+        Class labels if using integer labels. One and only one of the
+        'classlabels_*' attributes must be defined.
     classlabels_strings
         Attribute.
-        Class labels if using string labels.
-
-        One and only one of the 'classlabels_*' attributes must be defined.
+        Class labels if using string labels. One and only one of the
+        'classlabels_*' attributes must be defined.
     nodes_falsenodeids
         Attribute.
         Child node if expression is false.
@@ -1736,17 +1678,19 @@ def tree_ensemble_classifier(
         Popularity of each node, used for performance and may be omitted.
     nodes_missing_value_tracks_true
         Attribute.
-        For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
-
-        This attribute may be left undefined, and the defalt value is false (0) for all nodes.
+        For each node, define what to do in the presence of a missing value: if
+        a value is missing (NaN), use the 'true' or 'false' branch based on the
+        value in this array. This attribute may be left undefined, and the default
+        value is false (0) for all nodes.
     nodes_modes
         Attribute.
-        The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
-
-        One of 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
+        The node kind, that is, the comparison to make at the node. There is no
+        comparison to make at a leaf node. One of 'BRANCH_LEQ', 'BRANCH_LT',
+        'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
     nodes_nodeids
         Attribute.
-        Node id for each node. Ids may restart at zero for each tree, but it not required to.
+        Node id for each node. Ids may restart at zero for each tree, but it is not
+        required to.
     nodes_treeids
         Attribute.
         Tree id for each node.
@@ -1761,9 +1705,8 @@ def tree_ensemble_classifier(
         Thresholds to do the splitting on for each node.
     post_transform
         Attribute.
-        Indicates the transform to apply to the score.
-
-         One of 'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT.'
+        Indicates the transform to apply to the score. One of 'NONE,' 'SOFTMAX,'
+        'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT.'
 
     Returns
     =======
@@ -1862,28 +1805,17 @@ def tree_ensemble_regressor(
     target_weights_as_tensor: Optional[np.ndarray] = None,
 ) -> Var:
     r"""
-    Tree Ensemble regressor.  Returns the regressed values for each input in N.
-
-
-        All args with nodes_ are fields of a tuple of tree nodes, and
-        it is assumed they are the same length, and an index i will decode the
-        tuple across these inputs.  Each node id can appear only once
-        for each tree id.
-
-
-        All fields prefixed with target_ are tuples of votes at the leaves.
-
-
-        A leaf may have multiple votes, where each vote is weighted by
-        the associated target_weights index.
-
-
-        All fields ending with `_as_tensor` can be used instead of the
-        same parameter without the suffix if the element type is double and not float.
-        All trees must have their node ids start at 0 and increment by 1.
-
-
-        Mode enum is BRANCH_LEQ, BRANCH_LT, BRANCH_GTE, BRANCH_GT, BRANCH_EQ, BRANCH_NEQ, LEAF
+    Tree Ensemble regressor. Returns the regressed values for each input in
+    N. All args with nodes\_ are fields of a tuple of tree nodes, and it is
+    assumed they are the same length, and an index i will decode the tuple
+    across these inputs. Each node id can appear only once for each tree id.
+    All fields prefixed with target\_ are tuples of votes at the leaves. A
+    leaf may have multiple votes, where each vote is weighted by the
+    associated target_weights index. All fields ending with \_as_tensor can
+    be used instead of the same parameter without the suffix if the element
+    type is double and not float. All trees must have their node ids start
+    at 0 and increment by 1. Mode enum is BRANCH_LEQ, BRANCH_LT, BRANCH_GTE,
+    BRANCH_GT, BRANCH_EQ, BRANCH_NEQ, LEAF
 
     Parameters
     ==========
@@ -1892,15 +1824,16 @@ def tree_ensemble_regressor(
         Input of shape [N,F]
     aggregate_function
         Attribute.
-        Defines how to aggregate leaf values within a target.
-
-        One of 'AVERAGE,' 'SUM,' 'MIN,' 'MAX.'
+        Defines how to aggregate leaf values within a target. One of 'AVERAGE,'
+        'SUM,' 'MIN,' 'MAX.'
     base_values
         Attribute.
-        Base values for classification, added to final class score; the size must be the same as the classes or can be left unassigned (assumed 0)
+        Base values for classification, added to final class score; the size
+        must be the same as the classes or can be left unassigned (assumed 0)
     base_values_as_tensor
         Attribute.
-        Base values for classification, added to final class score; the size must be the same as the classes or can be left unassigned (assumed 0)
+        Base values for classification, added to final class score; the size
+        must be the same as the classes or can be left unassigned (assumed 0)
     n_targets
         Attribute.
         The total number of targets.
@@ -1918,17 +1851,19 @@ def tree_ensemble_regressor(
         Popularity of each node, used for performance and may be omitted.
     nodes_missing_value_tracks_true
         Attribute.
-        For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this array.
-
-        This attribute may be left undefined and the defalt value is false (0) for all nodes.
+        For each node, define what to do in the presence of a NaN: use the
+        'true' (if the attribute value is 1) or 'false' (if the attribute value
+        is 0) branch based on the value in this array. This attribute may be left
+        undefined and the default value is false (0) for all nodes.
     nodes_modes
         Attribute.
-        The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
-
-        One of 'BRANCH_LEQ', 'BRANCH_LT', 'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
+        The node kind, that is, the comparison to make at the node. There is no
+        comparison to make at a leaf node. One of 'BRANCH_LEQ', 'BRANCH_LT',
+        'BRANCH_GTE', 'BRANCH_GT', 'BRANCH_EQ', 'BRANCH_NEQ', 'LEAF'
     nodes_nodeids
         Attribute.
-        Node id for each node. Node ids must restart at zero for each tree and increase sequentially.
+        Node id for each node. Node ids must restart at zero for each tree and
+        increase sequentially.
     nodes_treeids
         Attribute.
         Tree id for each node.
@@ -1943,9 +1878,8 @@ def tree_ensemble_regressor(
         Thresholds to do the splitting on for each node.
     post_transform
         Attribute.
-        Indicates the transform to apply to the score.
-
-        One of 'NONE,' 'SOFTMAX,' 'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
+        Indicates the transform to apply to the score. One of 'NONE,' 'SOFTMAX,'
+        'LOGISTIC,' 'SOFTMAX_ZERO,' or 'PROBIT'
     target_ids
         Attribute.
         The index of the target that each weight is for
@@ -2036,16 +1970,11 @@ def zip_map(
     classlabels_strings: Optional[Iterable[str]] = None,
 ) -> Var:
     r"""
-    Creates a map from the input and the attributes.
-
-
-        The values are provided by the input tensor, while the keys are specified by the attributes.
-        Must provide keys in either classlabels_strings or classlabels_int64s (but not both).
-
-
-        The columns of the tensor correspond one-by-one to the keys specified by the attributes. There must be as many columns as keys.
-
-
+    Creates a map from the input and the attributes. The values are provided
+    by the input tensor, while the keys are specified by the attributes.
+    Must provide keys in either classlabels_strings or classlabels_int64s
+    (but not both). The columns of the tensor correspond one-by-one to the
+    keys specified by the attributes. There must be as many columns as keys.
 
     Parameters
     ==========
@@ -2054,14 +1983,12 @@ def zip_map(
         The input values
     classlabels_int64s
         Attribute.
-        The keys when using int keys.
-
-        One and only one of the 'classlabels_*' attributes must be defined.
+        The keys when using int keys. One and only one of the 'classlabels_*'
+        attributes must be defined.
     classlabels_strings
         Attribute.
-        The keys when using string keys.
-
-        One and only one of the 'classlabels_*' attributes must be defined.
+        The keys when using string keys. One and only one of the 'classlabels_*'
+        attributes must be defined.
 
     Returns
     =======
diff --git a/src/spox/opset/ai/onnx/v17.py b/src/spox/opset/ai/onnx/v17.py
index 66e4ab1a..3847df7e 100644
--- a/src/spox/opset/ai/onnx/v17.py
+++ b/src/spox/opset/ai/onnx/v17.py
@@ -3647,8 +3647,8 @@ def abs(
 ) -> Var:
     r"""
     Absolute takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the absolute is, y = abs(x), is applied to
-    the tensor elementwise.
+    (Tensor<T>) where the absolute is, y = abs(x), is applied to the tensor
+    elementwise.
 
     Parameters
     ==========
@@ -3681,7 +3681,8 @@ def acos(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the arccosine (inverse of cosine) of the given input tensor, element-wise.
+    Calculates the arccosine (inverse of cosine) of the given input tensor,
+    element-wise.
 
     Parameters
     ==========
@@ -3714,7 +3715,8 @@ def acosh(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the hyperbolic arccosine of the given input tensor element-wise.
+    Calculates the hyperbolic arccosine of the given input tensor
+    element-wise.
 
     Parameters
     ==========
@@ -3726,7 +3728,8 @@ def acosh(
     =======
     output : Var
         Type T.
-        The hyperbolic arccosine values of the input tensor computed element-wise
+        The hyperbolic arccosine values of the input tensor computed
+        element-wise
 
     Notes
     =====
@@ -3748,9 +3751,15 @@ def add(
     B: Var,
 ) -> Var:
     r"""
-    Performs element-wise binary addition (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
-    (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
+    Performs element-wise binary addition (with Numpy-style broadcasting
+    support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
+
+    (Opset 14 change): Extend supported types to include uint8, int8,
+    uint16, and int16.
 
     Parameters
     ==========
@@ -3788,9 +3797,13 @@ def and_(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `and` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulted from performing the ``and`` logical
+    operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -3832,13 +3845,14 @@ def arg_max(
     select_last_index: int = 0,
 ) -> Var:
     r"""
-    Computes the indices of the max elements of the input tensor's element along the
-    provided axis. The resulting tensor has the same rank as the input if keepdims equals 1.
-    If keepdims equals 0, then the resulting tensor has the reduced dimension pruned.
-    If select_last_index is True (default False), the index of the last occurrence of the max
-    is selected if the max appears more than once in the input. Otherwise the index of the
-    first occurrence is selected.
-    The type of the output tensor is integer.
+    Computes the indices of the max elements of the input tensor's element
+    along the provided axis. The resulting tensor has the same rank as the
+    input if keepdims equals 1. If keepdims equals 0, then the resulting
+    tensor has the reduced dimension pruned. If select_last_index is True
+    (default False), the index of the last occurrence of the max is selected
+    if the max appears more than once in the input. Otherwise the index of
+    the first occurrence is selected. The type of the output tensor is
+    integer.
 
     Parameters
     ==========
@@ -3847,13 +3861,16 @@ def arg_max(
         An input tensor.
     axis
         Attribute.
-        The axis in which to compute the arg indices. Accepted range is [-r, r-1] where r = rank(data).
+        The axis in which to compute the arg indices. Accepted range is [-r,
+        r-1] where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
     select_last_index
         Attribute.
-        Whether to select the last index or the first index if the {name} appears in multiple indices, default is False (first index).
+        Whether to select the last index or the first index if the {name}
+        appears in multiple indices, default is False (first index).
 
     Returns
     =======
@@ -3888,13 +3905,14 @@ def arg_min(
     select_last_index: int = 0,
 ) -> Var:
     r"""
-    Computes the indices of the min elements of the input tensor's element along the
-    provided axis. The resulting tensor has the same rank as the input if keepdims equals 1.
-    If keepdims equals 0, then the resulting tensor has the reduced dimension pruned.
-    If select_last_index is True (default False), the index of the last occurrence of the min
-    is selected if the min appears more than once in the input. Otherwise the index of the
-    first occurrence is selected.
-    The type of the output tensor is integer.
+    Computes the indices of the min elements of the input tensor's element
+    along the provided axis. The resulting tensor has the same rank as the
+    input if keepdims equals 1. If keepdims equals 0, then the resulting
+    tensor has the reduced dimension pruned. If select_last_index is True
+    (default False), the index of the last occurrence of the min is selected
+    if the min appears more than once in the input. Otherwise the index of
+    the first occurrence is selected. The type of the output tensor is
+    integer.
 
     Parameters
     ==========
@@ -3903,13 +3921,16 @@ def arg_min(
         An input tensor.
     axis
         Attribute.
-        The axis in which to compute the arg indices. Accepted range is [-r, r-1] where r = rank(data).
+        The axis in which to compute the arg indices. Accepted range is [-r,
+        r-1] where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
     select_last_index
         Attribute.
-        Whether to select the last index or the first index if the {name} appears in multiple indices, default is False (first index).
+        Whether to select the last index or the first index if the {name}
+        appears in multiple indices, default is False (first index).
 
     Returns
     =======
@@ -3940,7 +3961,8 @@ def asin(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the arcsine (inverse of sine) of the given input tensor, element-wise.
+    Calculates the arcsine (inverse of sine) of the given input tensor,
+    element-wise.
 
     Parameters
     ==========
@@ -3973,7 +3995,8 @@ def asinh(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the hyperbolic arcsine of the given input tensor element-wise.
+    Calculates the hyperbolic arcsine of the given input tensor
+    element-wise.
 
     Parameters
     ==========
@@ -4006,7 +4029,8 @@ def atan(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the arctangent (inverse of tangent) of the given input tensor, element-wise.
+    Calculates the arctangent (inverse of tangent) of the given input
+    tensor, element-wise.
 
     Parameters
     ==========
@@ -4039,7 +4063,8 @@ def atanh(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the hyperbolic arctangent of the given input tensor element-wise.
+    Calculates the hyperbolic arctangent of the given input tensor
+    element-wise.
 
     Parameters
     ==========
@@ -4051,7 +4076,8 @@ def atanh(
     =======
     output : Var
         Type T.
-        The hyperbolic arctangent values of the input tensor computed element-wise
+        The hyperbolic arctangent values of the input tensor computed
+        element-wise
 
     Notes
     =====
@@ -4079,62 +4105,101 @@ def average_pool(
     strides: Optional[Iterable[int]] = None,
 ) -> Var:
     r"""
-    AveragePool consumes an input tensor X and applies average pooling across
-     the tensor according to kernel sizes, stride sizes, and pad lengths.
-     average pooling consisting of computing the average on all values of a
-     subset of the input tensor according to the kernel size and downsampling the
-     data into the output tensor Y for further processing. The output spatial shape will be following:
-     ```
-     output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
-     ```
-     or
-     ```
-     output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
-     ```
-     if ceil_mode is enabled
-     ```
-     * pad_shape[i] is sum of pads along axis i
-     ```
-     `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
-     ```
-     VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
-     SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
-     ```
-     And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
-     ```
-     pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
-     ```
-     The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
+    AveragePool consumes an input tensor X and applies average pooling
+    across the tensor according to kernel sizes, stride sizes, and pad
+    lengths. average pooling consisting of computing the average on all
+    values of a subset of the input tensor according to the kernel size and
+    downsampling the data into the output tensor Y for further processing.
+    The output spatial shape will be following:
+
+    ::
+
+       output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+
+    or
+
+    ::
+
+       output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+
+    if ceil_mode is enabled
+
+    ::
+
+       * pad_shape[i] is sum of pads along axis i
+
+    ``auto_pad`` is a DEPRECATED attribute. If you are using them currently,
+    the output spatial shape will be following:
+
+    ::
+
+       VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
+       SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
+
+    And pad shape will be following if ``SAME_UPPER`` or ``SAME_LOWER``:
+
+    ::
+
+       pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
+
+    The output of each pooling window is divided by the number of elements
+    (exclude pad when attribute count_include_pad is zero).
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size. Optionally, if dimension denotation is in
+        effect, the operation expects the input data tensor to arrive with the
+        dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE,
+        DATA_FEATURE ...].
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = ceil(input_shape[i] / strides[i])`` for each axis
+        ``i``. The padding is split between the two sides equally or almost
+        equally (depending on whether it is even or odd). In case the padding is
+        an odd number, the extra padding is added at the end for SAME_UPPER and
+        at the beginning for SAME_LOWER.
     ceil_mode
         Attribute.
         Whether to use ceil or floor (default) to compute the output shape.
     count_include_pad
         Attribute.
-        Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.
+        Whether include pad pixels when calculating values for the edges.
+        Default is 0, doesn't count include pad.
     kernel_shape
         Attribute.
         The size of the kernel along each axis.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The value represent the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follow [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels
+        added at the beginning of axis ``i`` and xi_end, the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor from average or max pooling across the input tensor. Dimensions will vary based on various kernel, stride, and pad sizes. Floor value of the dimension is used
+        Output data tensor from average or max pooling across the input tensor.
+        Dimensions will vary based on various kernel, stride, and pad sizes.
+        Floor value of the dimension is used
 
     Notes
     =====
@@ -4173,40 +4238,62 @@ def batch_normalization(
     Carries out batch normalization as described in the paper
     https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
     There are five required inputs 'X', 'scale', 'B', 'input_mean' and
-    'input_var'.
-    Note that 'input_mean' and 'input_var' are expected to be the estimated
-    statistics in inference mode (training_mode=False, default),
-    and the running statistics in training mode (training_mode=True).
-    There are multiple cases for the number of outputs, which we list below:
-    Output case #1: Y, running_mean, running_var (training_mode=True)
-    Output case #2: Y (training_mode=False)
-    When training_mode=False, extra outputs are invalid.
-    The outputs are updated as follows when training_mode=True:
-    ```
-    running_mean = input_mean * momentum + current_mean * (1 - momentum)
-    running_var = input_var * momentum + current_var * (1 - momentum)
-    Y = (X - current_mean) / sqrt(current_var + epsilon) * scale + B
-    where:
-    current_mean = ReduceMean(X, axis=all_except_channel_index)
-    current_var =  ReduceVar(X, axis=all_except_channel_index)
-    Notice that ReduceVar refers to the population variance, and it equals to
-    sum(sqrd(x_i - x_avg)) / N
-    where N is the population size (this formula does not use sample size N - 1).
-    ```
-    The computation of ReduceMean and ReduceVar uses float to avoid overflow for float16 inputs.
+    'input_var'. Note that 'input_mean' and 'input_var' are expected to be
+    the estimated statistics in inference mode (training_mode=False,
+    default), and the running statistics in training mode
+    (training_mode=True). There are multiple cases for the number of
+    outputs, which we list below:
+
+    Output case #1: Y, running_mean, running_var (training_mode=True) Output
+    case #2: Y (training_mode=False)
+
+    When training_mode=False, extra outputs are invalid. The outputs are
+    updated as follows when training_mode=True:
+
+    ::
+
+       running_mean = input_mean * momentum + current_mean * (1 - momentum)
+       running_var = input_var * momentum + current_var * (1 - momentum)
+
+       Y = (X - current_mean) / sqrt(current_var + epsilon) * scale + B
+
+       where:
+
+       current_mean = ReduceMean(X, axis=all_except_channel_index)
+       current_var =  ReduceVar(X, axis=all_except_channel_index)
+
+       Notice that ReduceVar refers to the population variance, and it equals to
+       sum(sqrd(x_i - x_avg)) / N
+       where N is the population size (this formula does not use sample size N - 1).
+
+    The computation of ReduceMean and ReduceVar uses float to avoid overflow
+    for float16 inputs.
+
     When training_mode=False:
-    ```
-    Y = (X - input_mean) / sqrt(input_var + epsilon) * scale + B
-    ```
+
+    ::
+
+       Y = (X - input_mean) / sqrt(input_var + epsilon) * scale + B
+
     For previous (depreciated) non-spatial cases, implementors are suggested
-    to flatten the input shape to (N x C * D1 * D2 * ... * Dn) before a BatchNormalization Op.
-    This operator has **optional** inputs/outputs. See the doc (IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
+    to flatten the input shape to (N x C \* D1 \* D2 \* ... \* Dn) before a
+    BatchNormalization Op. This operator has **optional** inputs/outputs.
+    See `the doc <https://github.com/onnx/onnx/blob/main/docs/IR.md>`__ for
+    more details about the representation of optional arguments. An empty
+    string may be used in the place of an actual argument's name to indicate
+    a missing argument. Trailing optional arguments (those not followed by
+    an argument that is present) may also be simply omitted.
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size, C is the number of channels. Statistics are computed for every channel of C over N and D1 to Dn dimensions. For image data, input dimensions become (N x C x H x W). The op also accepts single dimension input of size N in which case C is assumed to be 1
+        Input data tensor from the previous operator; dimensions are in the form
+        of (N x C x D1 x D2 ... Dn), where N is the batch size, C is the number
+        of channels. Statistics are computed for every channel of C over N and
+        D1 to Dn dimensions. For image data, input dimensions become (N x C x H
+        x W). The op also accepts single dimension input of size N in which case
+        C is assumed to be 1
     scale
         Type T1.
         Scale tensor of shape (C).
@@ -4224,10 +4311,12 @@ def batch_normalization(
         The epsilon value to use to avoid division by zero.
     momentum
         Attribute.
-        Factor used in computing the running mean and variance.e.g., running_mean = running_mean * momentum + mean * (1 - momentum).
+        Factor used in computing the running mean and variance, e.g.,
+        running_mean = running_mean \* momentum + mean \* (1 - momentum).
     training_mode
         Attribute.
-        If set to true, it indicates BatchNormalization is being used for training, and outputs 1, 2, 3, and 4 would be populated.
+        If set to true, it indicates BatchNormalization is being used for
+        training, and outputs 1, 2, 3, and 4 would be populated.
 
     Returns
     =======
@@ -4239,7 +4328,9 @@ def batch_normalization(
         The running mean after the BatchNormalization operator.
     running_var : Var
         Type T2.
-        The running variance after the BatchNormalization operator. This op uses the population size (N) for calculating variance, and not the sample size N-1.
+        The running variance after the BatchNormalization operator. This op uses
+        the population size (N) for calculating variance, and not the sample
+        size N-1.
 
     Notes
     =====
@@ -4273,11 +4364,14 @@ def bernoulli(
     seed: Optional[float] = None,
 ) -> Var:
     r"""
-    Draws binary random numbers (0 or 1) from a Bernoulli distribution. The input tensor should be a tensor
-    containing probabilities p (a value in the range [0,1]) to be used for drawing the binary random number,
-    where an output of 1 is produced with probability p and an output of 0 is produced with probability (1-p).
-    This operator is non-deterministic and may not produce the same values in different
-    implementations (even if a seed is specified).
+    Draws binary random numbers (0 or 1) from a Bernoulli distribution. The
+    input tensor should be a tensor containing probabilities p (a value in
+    the range [0,1]) to be used for drawing the binary random number, where
+    an output of 1 is produced with probability p and an output of 0 is
+    produced with probability (1-p).
+
+    This operator is non-deterministic and may not produce the same values
+    in different implementations (even if a seed is specified).
 
     Parameters
     ==========
@@ -4286,16 +4380,19 @@ def bernoulli(
         All values in input have to be in the range:[0, 1].
     dtype
         Attribute.
-        The data type for the elements of the output tensor. if not specified, we will use the data type of the input tensor.
+        The data type for the elements of the output tensor. If not specified,
+        we will use the data type of the input tensor.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
 
     Returns
     =======
     output : Var
         Type T2.
-        The returned output tensor only has values 0 or 1, same shape as input tensor.
+        The returned output tensor only has values 0 or 1, same shape as input
+        tensor.
 
     Notes
     =====
@@ -4323,17 +4420,22 @@ def bit_shift(
     direction: str,
 ) -> Var:
     r"""
-    Bitwise shift operator performs element-wise operation. For each input element, if the
-     attribute "direction" is "RIGHT", this operator moves its binary representation toward
-     the right side so that the input value is effectively decreased. If the attribute "direction"
-     is "LEFT", bits of binary representation moves toward the left side, which results the
-     increase of its actual value. The input X is the tensor to be shifted and another input
-     Y specifies the amounts of shifting. For example, if "direction" is "Right", X is [1, 4],
-     and S is [1, 1], the corresponding output Z would be [0, 2]. If "direction" is "LEFT" with
-     X=[1, 2] and S=[1, 2], the corresponding output Y would be [2, 8].
-     Because this operator supports Numpy-style broadcasting, X's and Y's shapes are
-     not necessarily identical.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Bitwise shift operator performs element-wise operation. For each input
+    element, if the attribute "direction" is "RIGHT", this operator moves
+    its binary representation toward the right side so that the input value
+    is effectively decreased. If the attribute "direction" is "LEFT", bits
+    of binary representation move toward the left side, which results in the
+    increase of its actual value. The input X is the tensor to be shifted
+    and another input Y specifies the amounts of shifting. For example, if
+    "direction" is "Right", X is [1, 4], and S is [1, 1], the corresponding
+    output Z would be [0, 2]. If "direction" is "LEFT" with X=[1, 2] and
+    S=[1, 2], the corresponding output Y would be [2, 8].
+
+    Because this operator supports Numpy-style broadcasting, X's and Y's
+    shapes are not necessarily identical. This operator supports
+    **multidirectional (i.e., Numpy-style) broadcasting**; for more details
+    please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -4345,7 +4447,8 @@ def bit_shift(
         Second operand, amounts of shift.
     direction
         Attribute.
-        Direction of moving bits. It can be either "RIGHT" (for right shift) or "LEFT" (for left shift).
+        Direction of moving bits. It can be either "RIGHT" (for right shift) or
+        "LEFT" (for left shift).
 
     Returns
     =======
@@ -4378,7 +4481,8 @@ def blackman_window(
     periodic: int = 1,
 ) -> Var:
     r"""
-    Generates a Blackman window as described in the paper https://ieeexplore.ieee.org/document/1455106.
+    Generates a Blackman window as described in the paper
+    https://ieeexplore.ieee.org/document/1455106.
 
     Parameters
     ==========
@@ -4387,10 +4491,15 @@ def blackman_window(
         A scalar value indicating the length of the window.
     output_datatype
         Attribute.
-        The data type of the output tensor. Strictly must be one of the values from DataType enum in TensorProto whose values correspond to T2. The default value is 1 = FLOAT.
+        The data type of the output tensor. Strictly must be one of the values
+        from DataType enum in TensorProto whose values correspond to T2. The
+        default value is 1 = FLOAT.
     periodic
         Attribute.
-        If 1, returns a window to be used as periodic function. If 0, return a symmetric window. When 'periodic' is specified, hann computes a window of length size + 1 and returns the first size points. The default value is 1.
+        If 1, returns a window to be used as periodic function. If 0, return a
+        symmetric window. When 'periodic' is specified, hann computes a window
+        of length size + 1 and returns the first size points. The default value
+        is 1.
 
     Returns
     =======
@@ -4424,36 +4533,55 @@ def cast(
 ) -> Var:
     r"""
     The operator casts the elements of a given input tensor to a data type
-    specified by the 'to' argument and returns an output tensor of the same size in
-    the converted type. The 'to' argument must be one of the data types specified
-    in the 'DataType' enum field in the TensorProto message.
-    Casting from string tensor in plain (e.g., "3.14" and "1000") and scientific numeric representations
-    (e.g., "1e-5" and "1E8") to float types is supported. For example, converting string "100.5" to an integer may
-    result 100. There are some string literals reserved for special floating-point values;
-    "+INF" (and "INF"), "-INF", and "NaN" are positive infinity, negative infinity, and not-a-number, respectively.
-    Any string which can exactly match "+INF" in a case-insensitive way would be mapped to positive infinite. Similarly,
-    this case-insensitive rule is applied to "INF" and "NaN". When casting from numeric tensors
-    to string tensors, plain floating-point representation (such as "314.15926") would be used.
-    Converting non-numerical-literal string such as "Hello World!" is an undefined behavior. Cases
-    of converting string representing floating-point arithmetic value, such as "2.718", to INT is an undefined behavior.
-    Conversion from a numerical type to any numerical type is always allowed.
-    User must be aware of precision loss and value change caused by range difference between two types.
-    For example, a 64-bit float 3.1415926459 may be round to a 32-bit float 3.141592. Similarly, converting
-    an integer 36 to Boolean may produce 1 because we truncate bits which can't be stored in the targeted type.
-    In more detail, the conversion among numerical types should follow these rules:
-    * Casting from floating point to:
-      * floating point: +/- infinity if OOR (out of range).
-      * fixed point: undefined if OOR.
-      * bool: +/- 0.0 to False; all else to True.
-    * Casting from fixed point to:
-      * floating point: +/- infinity if OOR. (+ infinity in the case of uint)
-      * fixed point: when OOR, discard higher bits and reinterpret (with respect to two's complement representation for
-    signed types). For example, 200 (int16) -> -56 (int8).
-      * bool: zero to False; nonzero to True.
-    * Casting from bool to:
-      * floating point: `{1.0, 0.0}`.
-      * fixed point: `{1, 0}`.
-      * bool: no change.
+    specified by the 'to' argument and returns an output tensor of the same
+    size in the converted type. The 'to' argument must be one of the data
+    types specified in the 'DataType' enum field in the TensorProto message.
+
+    Casting from string tensor in plain (e.g., "3.14" and "1000") and
+    scientific numeric representations (e.g., "1e-5" and "1E8") to float
+    types is supported. For example, converting string "100.5" to an integer
+    may result in 100. There are some string literals reserved for special
+    floating-point values; "+INF" (and "INF"), "-INF", and "NaN" are
+    positive infinity, negative infinity, and not-a-number, respectively.
+    Any string which can exactly match "+INF" in a case-insensitive way
+    would be mapped to positive infinity. Similarly, this case-insensitive
+    rule is applied to "INF" and "NaN". When casting from numeric tensors to
+    string tensors, plain floating-point representation (such as
+    "314.15926") would be used. Converting non-numerical-literal string such
+    as "Hello World!" is an undefined behavior. Cases of converting string
+    representing floating-point arithmetic value, such as "2.718", to INT is
+    an undefined behavior.
+
+    Conversion from a numerical type to any numerical type is always
+    allowed. User must be aware of precision loss and value change caused by
+    range difference between two types. For example, a 64-bit float
+    3.1415926459 may be rounded to a 32-bit float 3.141592. Similarly,
+    converting an integer 36 to Boolean may produce 1 because we truncate
+    bits which can't be stored in the targeted type.
+
+    In more detail, the conversion among numerical types should follow these
+    rules:
+
+    -  Casting from floating point to:
+
+       -  floating point: +/- infinity if OOR (out of range).
+       -  fixed point: undefined if OOR.
+       -  bool: +/- 0.0 to False; all else to True.
+
+    -  Casting from fixed point to:
+
+       -  floating point: +/- infinity if OOR. (+ infinity in the case of
+          uint)
+       -  fixed point: when OOR, discard higher bits and reinterpret (with
+          respect to two's complement representation for signed types). For
+          example, 200 (int16) -> -56 (int8).
+       -  bool: zero to False; nonzero to True.
+
+    -  Casting from bool to:
+
+       -  floating point: ``{1.0, 0.0}``.
+       -  fixed point: ``{1, 0}``.
+       -  bool: no change.
 
     Parameters
     ==========
@@ -4462,13 +4590,15 @@ def cast(
         Input tensor to be cast.
     to
         Attribute.
-        The data type to which the elements of the input tensor are cast. Strictly must be one of the types from DataType enum in TensorProto
+        The data type to which the elements of the input tensor are cast.
+        Strictly must be one of the types from DataType enum in TensorProto
 
     Returns
     =======
     output : Var
         Type T2.
-        Output tensor with the same shape as input with type specified by the 'to' argument
+        Output tensor with the same shape as input with type specified by the
+        'to' argument
 
     Notes
     =====
@@ -4493,8 +4623,8 @@ def cast_like(
     target_type: Var,
 ) -> Var:
     r"""
-    The operator casts the elements of a given input tensor (the first input) to
-    the same data type as the elements of the second input tensor.
+    The operator casts the elements of a given input tensor (the first
+    input) to the same data type as the elements of the second input tensor.
     See documentation of the Cast operator for further details.
 
     Parameters
@@ -4504,13 +4634,15 @@ def cast_like(
         Input tensor to be cast.
     target_type
         Type T2.
-        The (first) input tensor will be cast to produce a tensor of the same type as this (second input) tensor.
+        The (first) input tensor will be cast to produce a tensor of the same
+        type as this (second input) tensor.
 
     Returns
     =======
     output : Var
         Type T2.
-        Output tensor produced by casting the first input tensor to have the same type as the second input tensor.
+        Output tensor produced by casting the first input tensor to have the
+        same type as the second input tensor.
 
     Notes
     =====
@@ -4534,8 +4666,8 @@ def ceil(
 ) -> Var:
     r"""
     Ceil takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the ceil is, y = ceil(x), is applied to
-    the tensor elementwise.
+    (Tensor<T>) where the ceil is, y = ceil(x), is applied to the tensor
+    elementwise.
 
     Parameters
     ==========
@@ -4570,12 +4702,12 @@ def celu(
     alpha: float = 1.0,
 ) -> Var:
     r"""
-    Continuously Differentiable Exponential Linear Units:
-    Perform the linear unit element-wise on the input tensor X
-    using formula:
-    ```
-    max(0,x) + min(0,alpha*(exp(x/alpha)-1))
-    ```
+    Continuously Differentiable Exponential Linear Units: Perform the linear
+    unit element-wise on the input tensor X using formula:
+
+    ::
+
+       max(0,x) + min(0,alpha*(exp(x/alpha)-1))
 
     Parameters
     ==========
@@ -4584,7 +4716,8 @@ def celu(
         Input tensor
     alpha
         Attribute.
-        The Alpha value in Celu formula which control the shape of the unit. The default value is 1.0.
+        The Alpha value in Celu formula which controls the shape of the unit. The
+        default value is 1.0.
 
     Returns
     =======
@@ -4626,10 +4759,12 @@ def clip(
         Input tensor whose elements to be clipped
     min
         Type T.
-        Minimum value, under which element is replaced by min. It must be a scalar(tensor of empty shape).
+        Minimum value, under which element is replaced by min. It must be a
+        scalar (tensor of empty shape).
     max
         Type T.
-        Maximum value, above which element is replaced by max. It must be a scalar(tensor of empty shape).
+        Maximum value, above which element is replaced by max. It must be a
+        scalar (tensor of empty shape).
 
     Returns
     =======
@@ -4661,9 +4796,11 @@ def compress(
     axis: Optional[int] = None,
 ) -> Var:
     r"""
-    Selects slices from an input tensor along a given axis where condition evaluates to True for each axis index.
-        In case axis is not provided, input is flattened before elements are selected.
-        Compress behaves like numpy.compress: https://docs.scipy.org/doc/numpy/reference/generated/numpy.compress.html
+    Selects slices from an input tensor along a given axis where condition
+    evaluates to True for each axis index. In case axis is not provided,
+    input is flattened before elements are selected. Compress behaves like
+    numpy.compress:
+    https://docs.scipy.org/doc/numpy/reference/generated/numpy.compress.html
 
     Parameters
     ==========
@@ -4672,16 +4809,23 @@ def compress(
         Tensor of rank r >= 1.
     condition
         Type T1.
-        Rank 1 tensor of booleans to indicate which slices or data elements to be selected. Its length can be less than the input length along the axis or the flattened input size if axis is not specified. In such cases data slices or elements exceeding the condition length are discarded.
+        Rank 1 tensor of booleans to indicate which slices or data elements to
+        be selected. Its length can be less than the input length along the axis
+        or the flattened input size if axis is not specified. In such cases data
+        slices or elements exceeding the condition length are discarded.
     axis
         Attribute.
-        (Optional) Axis along which to take slices. If not specified, input is flattened before elements being selected. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+        (Optional) Axis along which to take slices. If not specified, input is
+        flattened before elements being selected. Negative value means counting
+        dimensions from the back. Accepted range is [-r, r-1] where r =
+        rank(input).
 
     Returns
     =======
     output : Var
         Type T.
-        Tensor of rank r if axis is specified. Otherwise output is a Tensor of rank 1.
+        Tensor of rank r if axis is specified. Otherwise output is a Tensor of
+        rank 1.
 
     Notes
     =====
@@ -4708,7 +4852,9 @@ def concat(
     axis: int,
 ) -> Var:
     r"""
-    Concatenate a list of tensors into a single tensor. All input tensors must have the same shape, except for the dimension size of the axis to concatenate on.
+    Concatenate a list of tensors into a single tensor. All input tensors
+    must have the same shape, except for the dimension size of the axis to
+    concatenate on.
 
     Parameters
     ==========
@@ -4717,7 +4863,8 @@ def concat(
         List of tensors for concatenation
     axis
         Attribute.
-        Which axis to concat on. A negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(inputs)..
+        Which axis to concat on. A negative value means counting dimensions from
+        the back. Accepted range is [-r, r-1] where r = rank(inputs).
 
     Returns
     =======
@@ -4749,10 +4896,11 @@ def concat_from_sequence(
     new_axis: int = 0,
 ) -> Var:
     r"""
-    Concatenate a sequence of tensors into a single tensor.
-    All input tensors must have the same shape, except for the dimension size of the axis to concatenate on.
-    By default 'new_axis' is 0, the behavior is similar to numpy.concatenate.
-    When 'new_axis' is 1, the behavior is similar to numpy.stack.
+    Concatenate a sequence of tensors into a single tensor. All input
+    tensors must have the same shape, except for the dimension size of the
+    axis to concatenate on. By default 'new_axis' is 0, the behavior is
+    similar to numpy.concatenate. When 'new_axis' is 1, the behavior is
+    similar to numpy.stack.
 
     Parameters
     ==========
@@ -4761,10 +4909,13 @@ def concat_from_sequence(
         Sequence of tensors for concatenation
     axis
         Attribute.
-        Which axis to concat on. Accepted range in `[-r, r - 1]`, where `r` is the rank of input tensors. When `new_axis` is 1, accepted range is `[-r - 1, r]`.
+        Which axis to concat on. Accepted range in ``[-r, r - 1]``, where ``r``
+        is the rank of input tensors. When ``new_axis`` is 1, accepted range is
+        ``[-r - 1, r]``.
     new_axis
         Attribute.
-        Insert and concatenate on a new axis or not, default 0 means do not insert new axis.
+        Insert and concatenate on a new axis or not, default 0 means do not
+        insert new axis.
 
     Returns
     =======
@@ -4802,8 +4953,8 @@ def constant(
     value_strings: Optional[Iterable[str]] = None,
 ) -> Var:
     r"""
-    This operator produces a constant tensor. Exactly one of the provided attributes, either value, sparse_value,
-    or value_* must be specified.
+    This operator produces a constant tensor. Exactly one of the provided
+    attributes, either value, sparse_value, or value_\* must be specified.
 
     Parameters
     ==========
@@ -4827,7 +4978,8 @@ def constant(
         The values for the elements for the 1D, int64, output tensor.
     value_string
         Attribute.
-        The value for the sole element for the scalar, UTF-8 string, output tensor.
+        The value for the sole element for the scalar, UTF-8 string, output
+        tensor.
     value_strings
         Attribute.
         The values for the elements for the 1D, UTF-8 string, output tensor.
@@ -4871,16 +5023,22 @@ def constant_of_shape(
     ==========
     input
         Type T1.
-        1D tensor. The shape of the expected output tensor. If empty tensor is given, the output would be a scalar. All values must be >= 0.
+        1D tensor. The shape of the expected output tensor. If empty tensor is
+        given, the output would be a scalar. All values must be >= 0.
     value
         Attribute.
-        (Optional) The value of the output elements.Should be a one-element tensor. If not specified, it defaults to a tensor of value 0 and datatype float32
+        (Optional) The value of the output elements. Should be a one-element
+        tensor. If not specified, it defaults to a tensor of value 0 and
+        datatype float32
 
     Returns
     =======
     output : Var
         Type T2.
-        Output tensor of shape specified by 'input'.If attribute 'value' is specified, the value and datatype of the output tensor is taken from 'value'.If attribute 'value' is not specified, the value in the output defaults to 0, and the datatype defaults to float32.
+        Output tensor of shape specified by 'input'. If attribute 'value' is
+        specified, the value and datatype of the output tensor are taken from
+        'value'. If attribute 'value' is not specified, the value in the output
+        defaults to 0, and the datatype defaults to float32.
 
     Notes
     =====
@@ -4920,37 +5078,75 @@ def conv(
     ==========
     X
         Type T.
-        Input data tensor from previous layer; has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that this is for the 2D image. Otherwise the size is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in effect, the operation expects input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor from previous layer; has size (N x C x H x W), where N
+        is the batch size, C is the number of channels, and H and W are the
+        height and width. Note that this is for the 2D image. Otherwise the size
+        is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in
+        effect, the operation expects input data tensor to arrive with the
+        dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE,
+        DATA_FEATURE ...].
     W
         Type T.
-        The weight tensor that will be used in the convolutions; has size (M x C/group x kH x kW), where C is the number of channels, and kH and kW are the height and width of the kernel, and M is the number of feature maps. For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel. Optionally, if dimension denotation is in effect, the operation expects the weight tensor to arrive with the dimension denotation of [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. Assuming zero based indices for the shape array, X.shape[1] == (W.shape[1] * group) == C and W.shape[0] mod G == 0. Or in other words FILTER_IN_CHANNEL multiplied by the number of groups should be equal to DATA_CHANNEL and the number of feature maps M should be a multiple of the number of groups G.
+        The weight tensor that will be used in the convolutions; has size (M x
+        C/group x kH x kW), where C is the number of channels, and kH and kW are
+        the height and width of the kernel, and M is the number of feature maps.
+        For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x
+        k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel.
+        Optionally, if dimension denotation is in effect, the operation expects
+        the weight tensor to arrive with the dimension denotation of
+        [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL
+        ...]. Assuming zero based indices for the shape array, X.shape[1] ==
+        (W.shape[1] \* group) == C and W.shape[0] mod G == 0. Or in other words
+        FILTER_IN_CHANNEL multiplied by the number of groups should be equal to
+        DATA_CHANNEL and the number of feature maps M should be a multiple of
+        the number of groups G.
     B
         Type T.
         Optional 1D bias to be added to the convolution, has size of M.
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = ceil(input_shape[i] / strides[i])`` for each axis
+        ``i``. The padding is split between the two sides equally or almost
+        equally (depending on whether it is even or odd). In case the padding is
+        an odd number, the extra padding is added at the end for SAME_UPPER and
+        at the beginning for SAME_LOWER.
     dilations
         Attribute.
-        dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.
+        dilation value along each spatial axis of the filter. If not present,
+        the dilation defaults to 1 along each spatial axis.
     group
         Attribute.
         number of groups input channels and output channels are divided into.
     kernel_shape
         Attribute.
-        The shape of the convolution kernel. If not present, should be inferred from input W.
+        The shape of the convolution kernel. If not present, should be inferred
+        from input W.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The value represents the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follows [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels
+        added at the beginning of axis ``i`` and xi_end is the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults is 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor that contains the result of the convolution. The output dimensions are functions of the kernel size, stride size, and pad lengths.
+        Output data tensor that contains the result of the convolution. The
+        output dimensions are functions of the kernel size, stride size, and pad
+        lengths.
 
     Notes
     =====
@@ -4990,47 +5186,90 @@ def conv_integer(
     strides: Optional[Iterable[int]] = None,
 ) -> Var:
     r"""
-    The integer convolution operator consumes an input tensor, its zero-point, a filter, and its zero-point,
-    and computes the output. The production MUST never overflow. The accumulation may overflow if and only if in 32 bits.
+    The integer convolution operator consumes an input tensor, its
+    zero-point, a filter, and its zero-point, and computes the output. The
+    production MUST never overflow. The accumulation may overflow if and
+    only if in 32 bits.
 
     Parameters
     ==========
     x
         Type T1.
-        Input data tensor from previous layer; has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that this is for the 2D image. Otherwise the size is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in effect, the operation expects input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor from previous layer; has size (N x C x H x W), where N
+        is the batch size, C is the number of channels, and H and W are the
+        height and width. Note that this is for the 2D image. Otherwise the size
+        is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in
+        effect, the operation expects input data tensor to arrive with the
+        dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE,
+        DATA_FEATURE ...].
     w
         Type T2.
-        The weight tensor that will be used in the convolutions; has size (M x C/group x kH x kW), where C is the number of channels, and kH and kW are the height and width of the kernel, and M is the number of feature maps. For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel. Optionally, if dimension denotation is in effect, the operation expects the weight tensor to arrive with the dimension denotation of [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. X.shape[1] == (W.shape[1] * group) == C (assuming zero based indices for the shape array). Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL.
+        The weight tensor that will be used in the convolutions; has size (M x
+        C/group x kH x kW), where C is the number of channels, and kH and kW are
+        the height and width of the kernel, and M is the number of feature maps.
+        For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x
+        k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel.
+        Optionally, if dimension denotation is in effect, the operation expects
+        the weight tensor to arrive with the dimension denotation of
+        [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL
+        ...]. X.shape[1] == (W.shape[1] \* group) == C (assuming zero based
+        indices for the shape array). Or in other words FILTER_IN_CHANNEL should
+        be equal to DATA_CHANNEL.
     x_zero_point
         Type T1.
-        Zero point tensor for input 'x'. It's optional and default value is 0. It's a scalar, which means a per-tensor/layer quantization.
+        Zero point tensor for input 'x'. It's optional and default value is 0.
+        It's a scalar, which means a per-tensor/layer quantization.
     w_zero_point
         Type T2.
-        Zero point tensor for input 'w'. It's optional and default value is 0.  It could be a scalar or a 1-D tensor, which means a per-tensor/layer or per output channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of output channels (M)
+        Zero point tensor for input 'w'. It's optional and default value is 0.
+        It could be a scalar or a 1-D tensor, which means a per-tensor/layer or
+        per output channel quantization. If it's a 1-D tensor, its number of
+        elements should be equal to the number of output channels (M)
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = ceil(input_shape[i] / strides[i])`` for each axis
+        ``i``. The padding is split between the two sides equally or almost
+        equally (depending on whether it is even or odd). In case the padding is
+        an odd number, the extra padding is added at the end for SAME_UPPER and
+        at the beginning for SAME_LOWER.
     dilations
         Attribute.
-        dilation value along each spatial axis of the filter. If not present, the dilation defaults to 1 along each axis.
+        dilation value along each spatial axis of the filter. If not present,
+        the dilation defaults to 1 along each axis.
     group
         Attribute.
-        number of groups input channels and output channels are divided into. default is 1.
+        number of groups input channels and output channels are divided into.
+        default is 1.
     kernel_shape
         Attribute.
-        The shape of the convolution kernel. If not present, should be inferred from input 'w'.
+        The shape of the convolution kernel. If not present, should be inferred
+        from input 'w'.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0.The value represent the number of pixels added to the beginning and end part of the corresponding axis.`pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number ofpixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`.This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaultsto 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The value represent the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follow [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels
+        added at the beginning of axis ``i`` and xi_end, the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each axis.
 
     Returns
     =======
     y : Var
         Type T3.
-        Output data tensor that contains the result of the convolution. The output dimensions are functions of the kernel size, stride size, and pad lengths.
+        Output data tensor that contains the result of the convolution. The
+        output dimensions are functions of the kernel size, stride size, and pad
+        lengths.
 
     Notes
     =====
@@ -5074,56 +5313,109 @@ def conv_transpose(
     strides: Optional[Iterable[int]] = None,
 ) -> Var:
     r"""
-    The convolution transpose operator consumes an input tensor and a filter,
-    and computes the output.
-    If the pads parameter is provided the shape of the output is calculated via the following equation:
-      output_shape[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - pads[start_i] - pads[end_i]
-    output_shape can also be explicitly specified in which case pads values are auto generated using these equations:
-      total_padding[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]
-      If (auto_pads == SAME_UPPER): pads[start_i] = total_padding[i]/2; pads[end_i] = total_padding[i] - (total_padding[i]/2)
-      Else: pads[start_i] = total_padding[i] - (total_padding[i]/2); pads[end_i] = (total_padding[i]/2).
+    The convolution transpose operator consumes an input tensor and a
+    filter, and computes the output.
+
+    If the pads parameter is provided the shape of the output is calculated
+    via the following equation:
+
+    output_shape[i] = stride[i] \* (input_size[i] - 1) + output_padding[i] +
+    ((kernel_shape[i] - 1) \* dilations[i] + 1) - pads[start_i] -
+    pads[end_i]
+
+    output_shape can also be explicitly specified in which case pads values
+    are auto generated using these equations:
+
+    | total_padding[i] = stride[i] \* (input_size[i] - 1) + output_padding[i]
+      + ((kernel_shape[i] - 1) \* dilations[i] + 1) - output_shape[i]
+    | If (auto_pads == SAME_UPPER): pads[start_i] = total_padding[i]/2;
+      pads[end_i] = total_padding[i] - (total_padding[i]/2)
+    | Else: pads[start_i] = total_padding[i] - (total_padding[i]/2);
+      pads[end_i] = (total_padding[i]/2).
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from previous layer; has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that this is for the 2D image. Otherwise the size is (N x C x D1 x D2 ... x Dn)
+        Input data tensor from previous layer; has size (N x C x H x W), where N
+        is the batch size, C is the number of channels, and H and W are the
+        height and width. Note that this is for the 2D image. Otherwise the size
+        is (N x C x D1 x D2 ... x Dn)
     W
         Type T.
-        The weight tensor that will be used in the convolutions; has size (C x M/group x kH x kW), where C is the number of channels, and kH and kW are the height and width of the kernel, and M is the number of feature maps. For more than 2 dimensions, the weight shape will be (C x M/group x k1 x k2 x ... x kn), where (k1 x k2 x ... x kn) is the dimension of the kernel. The number of channels in the output should be equal to W.shape[1] * group (assuming zero based indices of the shape array)
+        The weight tensor that will be used in the convolutions; has size (C x
+        M/group x kH x kW), where C is the number of channels, and kH and kW are
+        the height and width of the kernel, and M is the number of feature maps.
+        For more than 2 dimensions, the weight shape will be (C x M/group x k1 x
+        k2 x ... x kn), where (k1 x k2 x ... x kn) is the dimension of the
+        kernel. The number of channels in the output should be equal to
+        W.shape[1] \* group (assuming zero based indices of the shape array)
     B
         Type T.
         Optional 1D bias to be added to the convolution, has size of M.
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = input_shape[i] * strides[i]` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = input_shape[i] * strides[i]`` for each axis ``i``.
+        The padding is split between the two sides equally or almost equally
+        (depending on whether it is even or odd). In case the padding is an odd
+        number, the extra padding is added at the end for SAME_UPPER and at the
+        beginning for SAME_LOWER.
     dilations
         Attribute.
-        dilation value along each spatial axis of the filter. If not present, the dilation defaults to 1 along each spatial axis.
+        dilation value along each spatial axis of the filter. If not present,
+        the dilation defaults to 1 along each spatial axis.
     group
         Attribute.
         number of groups input channels and output channels are divided into.
     kernel_shape
         Attribute.
-        The shape of the convolution kernel. If not present, should be inferred from input W.
+        The shape of the convolution kernel. If not present, should be inferred
+        from input W.
     output_padding
         Attribute.
-        Additional elements added to the side with higher coordinate indices in the output. Each padding value in "output_padding" must be less than the corresponding stride/dilation dimension. By default, this attribute is a zero vector. Note that this attribute doesn't directly affect the computed output values. It only controls the selection of the computed values, so changing this attribute only adds or removes output elements. If "output_shape" is explicitly provided, "output_padding" does not contribute additional size to "output_shape" but participates in the computation of the needed padding amount. This is also called adjs or adjustment in some frameworks.
+        Additional elements added to the side with higher coordinate indices in
+        the output. Each padding value in "output_padding" must be less than the
+        corresponding stride/dilation dimension. By default, this attribute is a
+        zero vector. Note that this attribute doesn't directly affect the
+        computed output values. It only controls the selection of the computed
+        values, so changing this attribute only adds or removes output elements.
+        If "output_shape" is explicitly provided, "output_padding" does not
+        contribute additional size to "output_shape" but participates in the
+        computation of the needed padding amount. This is also called adjs or
+        adjustment in some frameworks.
     output_shape
         Attribute.
-        The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified pads values are ignored. See doc for details for equations to generate pads
+        The shape of the output can be explicitly set which will cause pads
+        values to be auto generated. If output_shape is specified pads values
+        are ignored. See doc for details for equations to generate pads
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The value represent the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follow [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels
+        added at the beginning of axis ``i`` and xi_end, the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor that contains the result of the convolution. The output dimensions are functions of the kernel size, stride size, pad lengths and group count. The number of channels in the output should be equal to W.shape[1] * group (assuming zero based indices of the shape array)
+        Output data tensor that contains the result of the convolution. The
+        output dimensions are functions of the kernel size, stride size, pad
+        lengths and group count. The number of channels in the output should be
+        equal to W.shape[1] \* group (assuming zero based indices of the shape
+        array)
 
     Notes
     =====
@@ -5227,24 +5519,28 @@ def cum_sum(
     reverse: int = 0,
 ) -> Var:
     r"""
-    Performs cumulative sum of the input elements along the given axis.
-    By default, it will do the sum inclusively meaning the first element is copied as is.
-    Through an `exclusive` attribute, this behavior can change to exclude the first element.
-    It can also perform summation in the opposite direction of the axis. For that, set `reverse` attribute to 1.
+    Performs cumulative sum of the input elements along the given axis. By
+    default, it will do the sum inclusively meaning the first element is
+    copied as is. Through an ``exclusive`` attribute, this behavior can
+    change to exclude the first element. It can also perform summation in
+    the opposite direction of the axis. For that, set ``reverse`` attribute
+    to 1.
+
     Example:
-    ```
-    input_x = [1, 2, 3]
-    axis=0
-    output = [1, 3, 6]
-    exclusive=1
-    output = [0, 1, 3]
-    exclusive=0
-    reverse=1
-    output = [6, 5, 3]
-    exclusive=1
-    reverse=1
-    output = [5, 3, 0]
-    ```
+
+    ::
+
+       input_x = [1, 2, 3]
+       axis=0
+       output = [1, 3, 6]
+       exclusive=1
+       output = [0, 1, 3]
+       exclusive=0
+       reverse=1
+       output = [6, 5, 3]
+       exclusive=1
+       reverse=1
+       output = [5, 3, 0]
 
     Parameters
     ==========
@@ -5253,10 +5549,14 @@ def cum_sum(
         An input tensor that is to be processed.
     axis
         Type T2.
-        A 0-D tensor. Must be in the range [-rank(x), rank(x)-1]. Negative value means counting dimensions from the back.
+        A 0-D tensor. Must be in the range [-rank(x), rank(x)-1]. Negative value
+        means counting dimensions from the back.
     exclusive
         Attribute.
-        If set to 1 will return exclusive sum in which the top element is not included. In other terms, if set to 1, the j-th output element would be the sum of the first (j-1) elements. Otherwise, it would be the sum of the first j elements.
+        If set to 1 will return exclusive sum in which the top element is not
+        included. In other terms, if set to 1, the j-th output element would be
+        the sum of the first (j-1) elements. Otherwise, it would be the sum of
+        the first j elements.
     reverse
         Attribute.
         If set to 1 will perform the sums in reverse direction.
@@ -5265,7 +5565,8 @@ def cum_sum(
     =======
     y : Var
         Type T.
-        Output tensor of the same type as 'x' with cumulative sums of the x's elements
+        Output tensor of the same type as 'x' with cumulative sums of the x's
+        elements
 
     Notes
     =====
@@ -5302,25 +5603,51 @@ def dft(
     ==========
     input
         Type T1.
-        For real input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]. For complex input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. The first dimension is the batch dimension. The following N dimentions correspond to the signal's dimensions. The final dimension represents the real and imaginary parts of the value in that order.
+        For real input, the following shape is expected:
+        [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]. For complex
+        input, the following shape is expected:
+        [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. The first
+        dimension is the batch dimension. The following N dimensions correspond
+        to the signal's dimensions. The final dimension represents the real and
+        imaginary parts of the value in that order.
     dft_length
         Type T2.
-        The length of the signal.If greater than the axis dimension, the signal will be zero-padded up to dft_length. If less than the axis dimension, only the first dft_length values will be used as the signal. It's an optional value.
+        The length of the signal. If greater than the axis dimension, the signal
+        will be zero-padded up to dft_length. If less than the axis dimension,
+        only the first dft_length values will be used as the signal. It's an
+        optional value.
     axis
         Attribute.
-        The axis on which to perform the DFT. By default this value is set to 1, which corresponds to the first dimension after the batch index.
+        The axis on which to perform the DFT. By default this value is set to 1,
+        which corresponds to the first dimension after the batch index.
     inverse
         Attribute.
-        Whether to perform the inverse discrete fourier transform. By default this value is set to 0, which corresponds to false.
+        Whether to perform the inverse discrete fourier transform. By default
+        this value is set to 0, which corresponds to false.
     onesided
         Attribute.
-        If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m,w]=X[m,n_fft-w]*. Note if the input or window tensors are complex, then onesided output is not possible. Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT). When invoked with real or complex valued input, the default value is 0. Values can be 0 or 1.
+        If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) +
+        1] are returned because the real-to-complex Fourier transform satisfies
+        the conjugate symmetry, i.e., X[m, w] = X[m,w]=X[m,n_fft-w]*. Note if
+        the input or window tensors are complex, then onesided output is not
+        possible. Enabling onesided with real inputs performs a Real-valued fast
+        Fourier transform (RFFT). When invoked with real or complex valued
+        input, the default value is 0. Values can be 0 or 1.
 
     Returns
     =======
     output : Var
         Type T1.
-        The Fourier Transform of the input vector.If onesided is 0, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. If axis=0 and onesided is 1, the following shape is expected: [batch_idx][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]. If axis=1 and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][floor(signal_dim2/2)+1]...[signal_dimN][2]. If axis=N-1 and onesided is 1, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]. The signal_dim at the specified axis is equal to the dft_length.
+        The Fourier Transform of the input vector. If onesided is 0, the
+        following shape is expected:
+        [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. If axis=0 and
+        onesided is 1, the following shape is expected:
+        [batch_idx][floor(signal_dim1/2)+1][signal_dim2]...[signal_dimN][2]. If
+        axis=1 and onesided is 1, the following shape is expected:
+        [batch_idx][signal_dim1][floor(signal_dim2/2)+1]...[signal_dimN][2]. If
+        axis=N-1 and onesided is 1, the following shape is expected:
+        [batch_idx][signal_dim1][signal_dim2]...[floor(signal_dimN/2)+1][2]. The
+        signal_dim at the specified axis is equal to the dft_length.
 
     Notes
     =====
@@ -5350,40 +5677,59 @@ def depth_to_space(
     mode: str = "DCR",
 ) -> Var:
     r"""
-    DepthToSpace rearranges (permutes) data from depth into blocks of spatial data.
-    This is the reverse transformation of SpaceToDepth. More specifically, this op outputs a copy of
-    the input tensor where values from the depth dimension are moved in spatial blocks to the height
-    and width dimensions. By default, `mode` = `DCR`.
-    In the DCR mode, elements along the depth dimension from the input tensor are rearranged in the
-    following order: depth, column, and then row. The output y is computed from the input x as below:
+    DepthToSpace rearranges (permutes) data from depth into blocks of
+    spatial data. This is the reverse transformation of SpaceToDepth. More
+    specifically, this op outputs a copy of the input tensor where values
+    from the depth dimension are moved in spatial blocks to the height and
+    width dimensions. By default, ``mode`` = ``DCR``. In the DCR mode,
+    elements along the depth dimension from the input tensor are rearranged
+    in the following order: depth, column, and then row. The output y is
+    computed from the input x as below:
+
     b, c, h, w = x.shape
-    tmp = np.reshape(x, [b, blocksize, blocksize, c // (blocksize**2), h, w])
+
+    tmp = np.reshape(x, [b, blocksize, blocksize, c // (blocksize**2), h,
+    w])
+
     tmp = np.transpose(tmp, [0, 3, 4, 1, 5, 2])
-    y = np.reshape(tmp, [b, c // (blocksize**2), h * blocksize, w * blocksize])
-    In the CRD mode, elements along the depth dimension from the input tensor are rearranged in the
-    following order: column, row, and the depth. The output y is computed from the input x as below:
+
+    y = np.reshape(tmp, [b, c // (blocksize**2), h \* blocksize, w \*
+    blocksize])
+
+    In the CRD mode, elements along the depth dimension from the input
+    tensor are rearranged in the following order: column, row, and the
+    depth. The output y is computed from the input x as below:
+
     b, c, h, w = x.shape
-    tmp = np.reshape(x, [b, c // (blocksize ** 2), blocksize, blocksize, h, w])
+
+    tmp = np.reshape(x, [b, c // (blocksize \*\* 2), blocksize, blocksize,
+    h, w])
+
     tmp = np.transpose(tmp, [0, 1, 4, 2, 5, 3])
-    y = np.reshape(tmp, [b, c // (blocksize ** 2), h * blocksize, w * blocksize])
+
+    y = np.reshape(tmp, [b, c // (blocksize \*\* 2), h \* blocksize, w \*
+    blocksize])
 
     Parameters
     ==========
     input
         Type T.
-        Input tensor of [N,C,H,W], where N is the batch axis, C is the channel or depth, H is the height and W is the width.
+        Input tensor of [N,C,H,W], where N is the batch axis, C is the channel
+        or depth, H is the height and W is the width.
     blocksize
         Attribute.
         Blocks of [blocksize, blocksize] are moved.
     mode
         Attribute.
-        DCR (default) for depth-column-row order re-arrangement. Use CRD for column-row-depth order.
+        DCR (default) for depth-column-row order re-arrangement. Use CRD for
+        column-row-depth order.
 
     Returns
     =======
     output : Var
         Type T.
-        Output tensor of [N, C/(blocksize * blocksize), H * blocksize, W * blocksize].
+        Output tensor of [N, C/(blocksize \* blocksize), H \* blocksize, W \*
+        blocksize].
 
     Notes
     =====
@@ -5411,11 +5757,14 @@ def dequantize_linear(
     axis: int = 1,
 ) -> Var:
     r"""
-    The linear dequantization operator. It consumes a quantized tensor, a scale, and a zero point to compute the full precision tensor.
-    The dequantization formula is y = (x - x_zero_point) * x_scale. 'x_scale' and 'x_zero_point' must have same shape, and can be either a scalar
-    for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization.
-    'x_zero_point' and 'x' must have same type. 'x' and 'y' must have same shape. In the case of dequantizing int32,
-    there's no zero point (zero point is supposed to be 0).
+    The linear dequantization operator. It consumes a quantized tensor, a
+    scale, and a zero point to compute the full precision tensor. The
+    dequantization formula is y = (x - x_zero_point) \* x_scale. 'x_scale'
+    and 'x_zero_point' must have same shape, and can be either a scalar for
+    per-tensor / per layer quantization, or a 1-D tensor for per-axis
+    quantization. 'x_zero_point' and 'x' must have same type. 'x' and 'y'
+    must have same shape. In the case of dequantizing int32, there's no zero
+    point (zero point is supposed to be 0).
 
     Parameters
     ==========
@@ -5424,13 +5773,18 @@ def dequantize_linear(
         N-D quantized input tensor to be de-quantized.
     x_scale
         Type tensor(float).
-        Scale for input 'x'. It can be a scalar, which means a per-tensor/layer dequantization, or a 1-D tensor for per-axis dequantization.
+        Scale for input 'x'. It can be a scalar, which means a per-tensor/layer
+        dequantization, or a 1-D tensor for per-axis dequantization.
     x_zero_point
         Type T.
-        Zero point for input 'x'. Shape must match x_scale. It's optional. Zero point is 0 when it's not specified.
+        Zero point for input 'x'. Shape must match x_scale. It's optional. Zero
+        point is 0 when it's not specified.
     axis
         Attribute.
-        (Optional) The axis of the dequantizing dimension of the input tensor. Ignored for per-tensor quantization. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+        (Optional) The axis of the dequantizing dimension of the input tensor.
+        Ignored for per-tensor quantization. Negative value means counting
+        dimensions from the back. Accepted range is [-r, r-1] where r =
+        rank(input).
 
     Returns
     =======
@@ -5461,11 +5815,12 @@ def det(
     X: Var,
 ) -> Var:
     r"""
-    Det calculates determinant of a square matrix or batches of square matrices.
-    Det takes one input tensor of shape `[*, M, M]`, where `*` is zero or more batch dimensions,
-    and the inner-most 2 dimensions form square matrices.
-    The output is a tensor of shape `[*]`, containing the determinants of all input submatrices.
-    e.g., When the input is 2-D, the output is a scalar(shape is empty: `[]`).
+    Det calculates determinant of a square matrix or batches of square
+    matrices. Det takes one input tensor of shape ``[*, M, M]``, where ``*``
+    is zero or more batch dimensions, and the inner-most 2 dimensions form
+    square matrices. The output is a tensor of shape ``[*]``, containing the
+    determinants of all input submatrices. e.g., When the input is 2-D, the
+    output is a scalar(shape is empty: ``[]``).
 
     Parameters
     ==========
@@ -5499,9 +5854,15 @@ def div(
     B: Var,
 ) -> Var:
     r"""
-    Performs element-wise binary division (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
-    (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
+    Performs element-wise binary division (with Numpy-style broadcasting
+    support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
+
+    (Opset 14 change): Extend supported types to include uint8, int8,
+    uint16, and int16.
 
     Parameters
     ==========
@@ -5542,18 +5903,31 @@ def dropout(
     seed: Optional[int] = None,
 ) -> Tuple[Var, Var]:
     r"""
-    Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs,
-    output (floating-point tensor) and mask (optional `Tensor<bool>`). If `training_mode` is true then the output Y will be a random dropout;
-    Note that this Dropout scales the masked input data by the following equation, so to convert the trained model into inference mode,
-    the user can simply not pass `training_mode` input or set it to false.
-    ```
-    output = scale * data * mask,
-    ```
+    Dropout takes an input floating-point tensor, an optional input ratio
+    (floating-point scalar) and an optional input training_mode (boolean
+    scalar). It produces two tensor outputs, output (floating-point tensor)
+    and mask (optional ``Tensor<bool>``). If ``training_mode`` is true then
+    the output Y will be a random dropout; Note that this Dropout scales the
+    masked input data by the following equation, so to convert the trained
+    model into inference mode, the user can simply not pass
+    ``training_mode`` input or set it to false.
+
+    ::
+
+       output = scale * data * mask,
+
     where
-    ```
-    scale = 1. / (1. - ratio).
-    ```
-    This operator has **optional** inputs/outputs. See the doc (IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
+
+    ::
+
+       scale = 1. / (1. - ratio).
+
+    This operator has **optional** inputs/outputs. See `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/IR.md>`__ for more
+    details about the representation of optional arguments. An empty string
+    may be used in the place of an actual argument's name to indicate a
+    missing argument. Trailing optional arguments (those not followed by an
+    argument that is present) may also be simply omitted.
 
     Parameters
     ==========
@@ -5562,13 +5936,22 @@ def dropout(
         The input data as Tensor.
     ratio
         Type T1.
-        The ratio of random dropout, with value in [0, 1). If this input was not set, or if it was set to 0, the output would be a simple copy of the input. If it's non-zero, output will be a random dropout of the scaled input, which is typically the case during training. It is an optional value, if not specified it will default to 0.5.
+        The ratio of random dropout, with value in [0, 1). If this input was not
+        set, or if it was set to 0, the output would be a simple copy of the
+        input. If it's non-zero, output will be a random dropout of the scaled
+        input, which is typically the case during training. It is an optional
+        value, if not specified it will default to 0.5.
     training_mode
         Type T2.
-        If set to true then it indicates dropout is being used for training. It is an optional value hence unless specified explicitly, it is false. If it is false, ratio is ignored and the operation mimics inference mode where nothing will be dropped from the input data and if mask is requested as output it will contain all ones.
+        If set to true then it indicates dropout is being used for training. It
+        is an optional value hence unless specified explicitly, it is false. If
+        it is false, ratio is ignored and the operation mimics inference mode
+        where nothing will be dropped from the input data and if mask is
+        requested as output it will contain all ones.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
 
     Returns
     =======
@@ -5604,28 +5987,33 @@ def dynamic_quantize_linear(
     x: Var,
 ) -> Tuple[Var, Var, Var]:
     r"""
-    A Function to fuse calculation for Scale, Zero Point and FP32->8Bit convertion of FP32 Input data.
-    Outputs Scale, ZeroPoint and Quantized Input for a given FP32 Input.
-    Scale is calculated as:
-    ```
-     y_scale = (max(x) - min(x))/(qmax - qmin)
-     * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8
-     * data range is adjusted to include 0.
-    ```
+    A Function to fuse calculation for Scale, Zero Point and FP32->8Bit
+    conversion of FP32 Input data. Outputs Scale, ZeroPoint and Quantized
+    Input for a given FP32 Input. Scale is calculated as:
+
+    ::
+
+        y_scale = (max(x) - min(x))/(qmax - qmin)
+        * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8
+        * data range is adjusted to include 0.
+
     Zero point is calculated as:
-    ```
-    intermediate_zero_point = qmin - min(x)/y_scale
-    y_zero_point = cast(round(saturate(itermediate_zero_point)))
-    * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8
-    * for saturation, it saturates to [0, 255] if it's uint8, or [-127, 127] if it's int8. Right now only uint8 is supported.
-    * rounding to nearest ties to even.
-    ```
+
+    ::
+
+       intermediate_zero_point = qmin - min(x)/y_scale
+       y_zero_point = cast(round(saturate(intermediate_zero_point)))
+       * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8
+       * for saturation, it saturates to [0, 255] if it's uint8, or [-127, 127] if it's int8. Right now only uint8 is supported.
+       * rounding to nearest ties to even.
+
     Data quantization formula is:
-    ```
-    y = saturate (round (x / y_scale) + y_zero_point)
-    * for saturation, it saturates to [0, 255] if it's uint8, or [-127, 127] if it's int8. Right now only uint8 is supported.
-    * rounding to nearest ties to even.
-    ```
+
+    ::
+
+       y = saturate (round (x / y_scale) + y_zero_point)
+       * for saturation, it saturates to [0, 255] if it's uint8, or [-127, 127] if it's int8. Right now only uint8 is supported.
+       * rounding to nearest ties to even.
 
     Parameters
     ==========
@@ -5640,10 +6028,12 @@ def dynamic_quantize_linear(
         Quantized output tensor
     y_scale : Var
         Type tensor(float).
-        Output scale. It's a scalar, which means a per-tensor/layer quantization.
+        Output scale. It's a scalar, which means a per-tensor/layer
+        quantization.
     y_zero_point : Var
         Type T2.
-        Output zero point. It's a scalar, which means a per-tensor/layer quantization.
+        Output zero point. It's a scalar, which means a per-tensor/layer
+        quantization.
 
     Notes
     =====
@@ -5667,23 +6057,37 @@ def einsum(
     equation: str,
 ) -> Var:
     r"""
-    An einsum of the form ```term1, term2 -> output-term``` produces an output tensor using the following equation
-    ```output[output-term] = reduce-sum( input1[term1] * input2[term] )```
-    where the reduce-sum performs a summation over all the indices occurring in the input terms (term1, term2)
-    that do not occur in the output-term.
-    The Einsum operator evaluates algebraic tensor operations on a sequence of tensors, using the Einstein summation
-    convention. The equation string contains a comma-separated sequence of lower case letters. Each term corresponds to
-    an operand tensor, and the characters within the terms correspond to operands dimensions.
-    This sequence may be followed by "->" to separate the left and right hand side of the equation.
-    If the equation contains "->" followed by the right-hand side, the explicit (not classical) form of the Einstein
-    summation is performed, and the right-hand side indices indicate output tensor dimensions. In other cases,
-    output indices are (implicitly) set to the alphabetically sorted sequence of indices appearing exactly once in the
-    equation.
-    When a dimension character is repeated in the left-hand side, it represents summation along the dimension.
-    The equation may contain ellipsis ("...") to enable broadcasting. Ellipsis must indicate a fixed number of dimensions.
-    Specifically, every occurrence of ellipsis in the equation must represent the same number of dimensions.
-    The right-hand side may contain exactly one ellipsis. In implicit mode, the ellipsis dimensions are set to the
-    beginning of the output. The equation string may contain space (U+0020) character.
+    An einsum of the form ``term1, term2 -> output-term`` produces an output
+    tensor using the following equation
+
+    ``output[output-term] = reduce-sum( input1[term1] * input2[term] )``
+
+    where the reduce-sum performs a summation over all the indices occurring
+    in the input terms (term1, term2) that do not occur in the output-term.
+
+    The Einsum operator evaluates algebraic tensor operations on a sequence
+    of tensors, using the Einstein summation convention. The equation string
+    contains a comma-separated sequence of lower case letters. Each term
+    corresponds to an operand tensor, and the characters within the terms
+    correspond to operands dimensions.
+
+    This sequence may be followed by "->" to separate the left and right
+    hand side of the equation. If the equation contains "->" followed by the
+    right-hand side, the explicit (not classical) form of the Einstein
+    summation is performed, and the right-hand side indices indicate output
+    tensor dimensions. In other cases, output indices are (implicitly) set
+    to the alphabetically sorted sequence of indices appearing exactly once
+    in the equation.
+
+    When a dimension character is repeated in the left-hand side, it
+    represents summation along the dimension.
+
+    The equation may contain ellipsis ("...") to enable broadcasting.
+    Ellipsis must indicate a fixed number of dimensions. Specifically, every
+    occurrence of ellipsis in the equation must represent the same number of
+    dimensions. The right-hand side may contain exactly one ellipsis. In
+    implicit mode, the ellipsis dimensions are set to the beginning of the
+    output. The equation string may contain space (U+0020) character.
 
     Parameters
     ==========
@@ -5724,8 +6128,9 @@ def elu(
 ) -> Var:
     r"""
     Elu takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the function `f(x) = alpha * (exp(x) - 1.) for x <
-    0`, `f(x) = x for x >= 0`., is applied to the tensor elementwise.
+    (Tensor<T>) where the function
+    ``f(x) = alpha * (exp(x) - 1.) for x < 0``, ``f(x) = x for x >= 0``., is
+    applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -5764,9 +6169,13 @@ def equal(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `equal` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulted from performing the ``equal`` logical
+    operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -5816,7 +6225,8 @@ def erf(
     =======
     output : Var
         Type T.
-        The error function of the input tensor computed element-wise. It has the same shape and type of the input.
+        The error function of the input tensor computed element-wise. It has the
+        same shape and type of the input.
 
     Notes
     =====
@@ -5871,14 +6281,15 @@ def expand(
     shape: Var,
 ) -> Var:
     r"""
-    Broadcast the input tensor following the given shape and the broadcast rule.
-    The broadcast rule is similar to numpy.array(input) * numpy.ones(shape):
-    Dimensions are right alignment;
-    Two corresponding dimensions must have the same value, or one of them is equal to 1.
-    Also, this operator is similar to numpy.broadcast_to(input, shape),
-    but the major difference is numpy.broadcast_to() does not allow shape to be smaller than input.size().
-    It is possible that the output.shape is not equal to shape, when some dimensions in shape is equal to 1,
-    or the shape.ndim < input.shape.ndim.
+    Broadcast the input tensor following the given shape and the broadcast
+    rule. The broadcast rule is similar to numpy.array(input) \*
+    numpy.ones(shape): Dimensions are right alignment; Two corresponding
+    dimensions must have the same value, or one of them is equal to 1. Also,
+    this operator is similar to numpy.broadcast_to(input, shape), but the
+    major difference is numpy.broadcast_to() does not allow shape to be
+    smaller than input.size(). It is possible that the output.shape is not
+    equal to shape, when some dimensions in shape is equal to 1, or the
+    shape.ndim < input.shape.ndim.
 
     Parameters
     ==========
@@ -5887,7 +6298,8 @@ def expand(
         Input tensor
     shape
         Type tensor(int64).
-        A 1-D tensor indicates the shape you want to expand to, following the broadcast rule
+        A 1-D tensor indicates the shape you want to expand to, following the
+        broadcast rule
 
     Returns
     =======
@@ -5918,13 +6330,15 @@ def eye_like(
     k: int = 0,
 ) -> Var:
     r"""
-    Generate a 2D tensor (matrix) with ones on the diagonal and zeros everywhere else. Only 2D
-    tensors are supported, i.e. input T1 must be of rank 2. The shape of the output tensor is the
-    same as the input tensor. The data type can be specified by the 'dtype' argument. If
-    'dtype' is not specified, then the type of input tensor is used. By default, the main diagonal
-    is populated with ones, but attribute 'k' can be used to populate upper or lower diagonals.
-    The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the
-    TensorProto message and be valid as an output type.
+    Generate a 2D tensor (matrix) with ones on the diagonal and zeros
+    everywhere else. Only 2D tensors are supported, i.e. input T1 must be of
+    rank 2. The shape of the output tensor is the same as the input tensor.
+    The data type can be specified by the 'dtype' argument. If 'dtype' is
+    not specified, then the type of input tensor is used. By default, the
+    main diagonal is populated with ones, but attribute 'k' can be used to
+    populate upper or lower diagonals. The 'dtype' argument must be one of
+    the data types specified in the 'DataType' enum field in the TensorProto
+    message and be valid as an output type.
 
     Parameters
     ==========
@@ -5933,10 +6347,15 @@ def eye_like(
         2D input tensor to copy shape, and optionally, type information from.
     dtype
         Attribute.
-        (Optional) The data type for the elements of the output tensor. If not specified,the data type of the input tensor T1 is used. If input tensor T1 is also notspecified, then type defaults to 'float'.
+        (Optional) The data type for the elements of the output tensor. If not
+        specified, the data type of the input tensor T1 is used. If input tensor
+        T1 is also not specified, then type defaults to 'float'.
     k
         Attribute.
-        (Optional) Index of the diagonal to be populated with ones. Default is 0. If T2 is the output, this op sets T2[i, i+k] = 1. k = 0 populates the main diagonal, k > 0 populates an upper diagonal,  and k < 0 populates a lower diagonal.
+        (Optional) Index of the diagonal to be populated with ones. Default is
+        0. If T2 is the output, this op sets T2[i, i+k] = 1. k = 0 populates the
+        main diagonal, k > 0 populates an upper diagonal, and k < 0 populates a
+        lower diagonal.
 
     Returns
     =======
@@ -5970,8 +6389,8 @@ def flatten(
 ) -> Var:
     r"""
     Flattens the input tensor into a 2D matrix. If input tensor has shape
-    (d_0, d_1, ... d_n) then the output will have shape
-    (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn).
+    (d_0, d_1, ... d_n) then the output will have shape (d_0 X d_1 ...
+    d_(axis-1), d_axis X d_(axis+1) ... X dn).
 
     Parameters
     ==========
@@ -5980,13 +6399,20 @@ def flatten(
         A tensor of rank >= axis.
     axis
         Attribute.
-        Indicate up to which input dimensions (exclusive) should be flattened to the outer dimension of the output. The value for axis must be in the range [-r, r], where r is the rank of the input tensor. Negative value means counting dimensions from the back. When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input tensor is (d_0, d_1, ... d_n).
+        Indicate up to which input dimensions (exclusive) should be flattened to
+        the outer dimension of the output. The value for axis must be in the
+        range [-r, r], where r is the rank of the input tensor. Negative value
+        means counting dimensions from the back. When axis = 0, the shape of the
+        output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input
+        tensor is (d_0, d_1, ... d_n).
 
     Returns
     =======
     output : Var
         Type T.
-        A 2D tensor with the contents of the input tensor, with input dimensions up to axis flattened to the outer dimension of the output and remaining input dimensions flattened into the inner dimension of the output.
+        A 2D tensor with the contents of the input tensor, with input dimensions
+        up to axis flattened to the outer dimension of the output and remaining
+        input dimensions flattened into the inner dimension of the output.
 
     Notes
     =====
@@ -6010,8 +6436,8 @@ def floor(
 ) -> Var:
     r"""
     Floor takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the floor is, y = floor(x), is applied to
-    the tensor elementwise.
+    (Tensor<T>) where the floor is, y = floor(x), is applied to the tensor
+    elementwise.
 
     Parameters
     ==========
@@ -6058,98 +6484,178 @@ def gru(
     linear_before_reset: int = 0,
 ) -> Tuple[Var, Var]:
     r"""
-    Computes an one-layer GRU. This operator is usually supported via some custom
-    implementation such as CuDNN.
+    Computes an one-layer GRU. This operator is usually supported via some
+    custom implementation such as CuDNN.
+
     Notations:
-    `X` - input tensor
-    `z` - update gate
-    `r` - reset gate
-    `h` - hidden gate
-    `t` - time step (t-1 means previous time step)
-    `W[zrh]` - W parameter weight matrix for update, reset, and hidden gates
-    `R[zrh]` - R recurrence weight matrix for update, reset, and hidden gates
-    `Wb[zrh]` - W bias vectors for update, reset, and hidden gates
-    `Rb[zrh]` - R bias vectors for update, reset, and hidden gates
-    `WB[zrh]` - W parameter weight matrix for backward update, reset, and hidden gates
-    `RB[zrh]` - R recurrence weight matrix for backward update, reset, and hidden gates
-    `WBb[zrh]` - W bias vectors for backward update, reset, and hidden gates
-    `RBb[zrh]` - R bias vectors for backward update, reset, and hidden gates
-    `H` - Hidden state
-    `num_directions` - 2 if direction == bidirectional else 1
+
+    ``X`` - input tensor
+
+    ``z`` - update gate
+
+    ``r`` - reset gate
+
+    ``h`` - hidden gate
+
+    ``t`` - time step (t-1 means previous time step)
+
+    ``W[zrh]`` - W parameter weight matrix for update, reset, and hidden
+    gates
+
+    ``R[zrh]`` - R recurrence weight matrix for update, reset, and hidden
+    gates
+
+    ``Wb[zrh]`` - W bias vectors for update, reset, and hidden gates
+
+    ``Rb[zrh]`` - R bias vectors for update, reset, and hidden gates
+
+    ``WB[zrh]`` - W parameter weight matrix for backward update, reset, and
+    hidden gates
+
+    ``RB[zrh]`` - R recurrence weight matrix for backward update, reset, and
+    hidden gates
+
+    ``WBb[zrh]`` - W bias vectors for backward update, reset, and hidden
+    gates
+
+    ``RBb[zrh]`` - R bias vectors for backward update, reset, and hidden
+    gates
+
+    ``H`` - Hidden state
+
+    ``num_directions`` - 2 if direction == bidirectional else 1
+
     Activation functions:
-      Relu(x)                - max(0, x)
-      Tanh(x)                - (1 - e^{-2x})/(1 + e^{-2x})
-      Sigmoid(x)             - 1/(1 + e^{-x})
-      (NOTE: Below are optional)
-      Affine(x)              - alpha*x + beta
-      LeakyRelu(x)           - x if x >= 0 else alpha * x
-      ThresholdedRelu(x)     - x if x >= alpha else 0
-      ScaledTanh(x)          - alpha*Tanh(beta*x)
-      HardSigmoid(x)         - min(max(alpha*x + beta, 0), 1)
-      Elu(x)                 - x if x >= 0 else alpha*(e^x - 1)
-      Softsign(x)            - x/(1 + |x|)
-      Softplus(x)            - log(1 + e^x)
+
+    Relu(x) - max(0, x)
+
+    Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x})
+
+    Sigmoid(x) - 1/(1 + e^{-x})
+
+    (NOTE: Below are optional)
+
+    Affine(x) - alpha*x + beta
+
+    LeakyRelu(x) - x if x >= 0 else alpha \* x
+
+    ThresholdedRelu(x) - x if x >= alpha else 0
+
+    ScaledTanh(x) - alpha\*Tanh(beta\*x)
+
+    HardSigmoid(x) - min(max(alpha*x + beta, 0), 1)
+
+    Elu(x) - x if x >= 0 else alpha*(e^x - 1)
+
+    Softsign(x) - x/(1 + \|x|)
+
+    Softplus(x) - log(1 + e^x)
+
     Equations (Default: f=Sigmoid, g=Tanh):
-      - zt = f(Xt*(Wz^T) + Ht-1*(Rz^T) + Wbz + Rbz)
-      - rt = f(Xt*(Wr^T) + Ht-1*(Rr^T) + Wbr + Rbr)
-      - ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh) # default, when linear_before_reset = 0
-      - ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh) # when linear_before_reset != 0
-      - Ht = (1 - zt) (.) ht + zt (.) Ht-1
-    This operator has **optional** inputs/outputs. See the doc (IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
+
+    -  zt = f(Xt*(Wz^T) + Ht-1*(Rz^T) + Wbz + Rbz)
+
+    -  rt = f(Xt*(Wr^T) + Ht-1*(Rr^T) + Wbr + Rbr)
+
+    -  ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh) # default, when
+       linear_before_reset = 0
+
+    -  ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh) # when
+       linear_before_reset != 0
+
+    -  Ht = (1 - zt) (.) ht + zt (.) Ht-1
+
+    This operator has **optional** inputs/outputs. See `the doc
+    <https://github.com/onnx/onnx/blob/main/docs/IR.md>`__ for more details about the
+    representation of optional arguments. An empty string may be used in the place of an
+    actual argument's name to indicate a missing argument. Trailing optional arguments
+    (those not followed by an argument that is present) may also be simply omitted.
 
     Parameters
     ==========
     X
         Type T.
-        The input sequences packed (and potentially padded) into one 3-D tensor with the shape of `[seq_length, batch_size, input_size]`.
+        The input sequences packed (and potentially padded) into one 3-D tensor
+        with the shape of ``[seq_length, batch_size, input_size]``.
     W
         Type T.
-        The weight tensor for the gates. Concatenation of `W[zrh]` and `WB[zrh]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 3*hidden_size, input_size]`.
+        The weight tensor for the gates. Concatenation of ``W[zrh]`` and
+        ``WB[zrh]`` (if bidirectional) along dimension 0. This tensor has shape
+        ``[num_directions, 3*hidden_size, input_size]``.
     R
         Type T.
-        The recurrence weight tensor. Concatenation of `R[zrh]` and `RB[zrh]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 3*hidden_size, hidden_size]`.
+        The recurrence weight tensor. Concatenation of ``R[zrh]`` and
+        ``RB[zrh]`` (if bidirectional) along dimension 0. This tensor has shape
+        ``[num_directions, 3*hidden_size, hidden_size]``.
     B
         Type T.
-        The bias tensor for the gates. Concatenation of `[Wb[zrh], Rb[zrh]]` and `[WBb[zrh], RBb[zrh]]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 6*hidden_size]`. Optional: If not specified - assumed to be 0
+        The bias tensor for the gates. Concatenation of ``[Wb[zrh], Rb[zrh]]``
+        and ``[WBb[zrh], RBb[zrh]]`` (if bidirectional) along dimension 0. This
+        tensor has shape ``[num_directions, 6*hidden_size]``. Optional: If not
+        specified - assumed to be 0
     sequence_lens
         Type T1.
-        Optional tensor specifying lengths of the sequences in a batch. If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
+        Optional tensor specifying lengths of the sequences in a batch. If not
+        specified - assumed all sequences in the batch to have length
+        ``seq_length``. It has shape ``[batch_size]``.
     initial_h
         Type T.
-        Optional initial value of the hidden. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+        Optional initial value of the hidden. If not specified - assumed to be
+        0. It has shape ``[num_directions, batch_size, hidden_size]``.
     activation_alpha
         Attribute.
-        Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.For example with LeakyRelu, the default alpha is 0.01.
+        Optional scaling values used by some activation functions. The values
+        are consumed in the order of activation functions, for example (f, g, h)
+        in LSTM. Default values are the same as of corresponding ONNX
+        operators. For example with LeakyRelu, the default alpha is 0.01.
     activation_beta
         Attribute.
-        Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.
+        Optional scaling values used by some activation functions. The values
+        are consumed in the order of activation functions, for example (f, g, h)
+        in LSTM. Default values are the same as of corresponding ONNX operators.
     activations
         Attribute.
-        A list of 2 (or 4 if bidirectional) activation functions for update, reset, and hidden gates. The activation functions must be one of the activation functions specified above. Optional: See the equations for default if not specified.
+        A list of 2 (or 4 if bidirectional) activation functions for update,
+        reset, and hidden gates. The activation functions must be one of the
+        activation functions specified above. Optional: See the equations for
+        default if not specified.
     clip
         Attribute.
-        Cell clip threshold. Clipping bounds the elements of a tensor in the range of [-threshold, +threshold] and is applied to the input of activations. No clip if not specified.
+        Cell clip threshold. Clipping bounds the elements of a tensor in the
+        range of [-threshold, +threshold] and is applied to the input of
+        activations. No clip if not specified.
     direction
         Attribute.
-        Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward (default), reverse, or bidirectional.
+        Specify if the RNN is forward, reverse, or bidirectional. Must be one of
+        forward (default), reverse, or bidirectional.
     hidden_size
         Attribute.
         Number of neurons in the hidden layer
     layout
         Attribute.
-        The shape format of inputs X, initial_h and outputs Y, Y_h. If 0, the following shapes are expected: X.shape = [seq_length, batch_size, input_size], Y.shape = [seq_length, num_directions, batch_size, hidden_size], initial_h.shape = Y_h.shape = [num_directions, batch_size, hidden_size]. If 1, the following shapes are expected: X.shape = [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length, num_directions, hidden_size], initial_h.shape = Y_h.shape = [batch_size, num_directions, hidden_size].
+        The shape format of inputs X, initial_h and outputs Y, Y_h. If 0, the
+        following shapes are expected: X.shape = [seq_length, batch_size,
+        input_size], Y.shape = [seq_length, num_directions, batch_size,
+        hidden_size], initial_h.shape = Y_h.shape = [num_directions, batch_size,
+        hidden_size]. If 1, the following shapes are expected: X.shape =
+        [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length,
+        num_directions, hidden_size], initial_h.shape = Y_h.shape = [batch_size,
+        num_directions, hidden_size].
     linear_before_reset
         Attribute.
-        When computing the output of the hidden gate, apply the linear transformation before multiplying by the output of the reset gate.
+        When computing the output of the hidden gate, apply the linear
+        transformation before multiplying by the output of the reset gate.
 
     Returns
     =======
     Y : Var
         Type T.
-        A tensor that concats all the intermediate output values of the hidden. It has shape `[seq_length, num_directions, batch_size, hidden_size]`.
+        A tensor that concats all the intermediate output values of the hidden.
+        It has shape ``[seq_length, num_directions, batch_size, hidden_size]``.
     Y_h : Var
         Type T.
-        The last output value of the hidden. It has shape `[num_directions, batch_size, hidden_size]`.
+        The last output value of the hidden. It has shape
+        ``[num_directions, batch_size, hidden_size]``.
 
     Notes
     =====
@@ -6192,56 +6698,59 @@ def gather(
     axis: int = 0,
 ) -> Var:
     r"""
-    Given `data` tensor of rank r >= 1, and `indices` tensor of rank q, gather
-    entries of the axis dimension of `data` (by default outer-most one as axis=0) indexed by `indices`, and concatenates
-    them in an output tensor of rank q + (r - 1).
+    Given ``data`` tensor of rank r >= 1, and ``indices`` tensor of rank q,
+    gather entries of the axis dimension of ``data`` (by default outer-most
+    one as axis=0) indexed by ``indices``, and concatenates them in an
+    output tensor of rank q + (r - 1).
+
     axis = 0 :
-    Let
-    k = indices[i_{0}, ..., i_{q-1}]
-    Then
-    output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2}] = input[k , j_{0}, ..., j_{r-2}]
-    ```
-      data = [
-          [1.0, 1.2],
-          [2.3, 3.4],
-          [4.5, 5.7],
-      ]
-      indices = [
-          [0, 1],
-          [1, 2],
-      ]
-      output = [
-          [
-              [1.0, 1.2],
-              [2.3, 3.4],
-          ],
-          [
-              [2.3, 3.4],
-              [4.5, 5.7],
-          ],
-      ]
-    ```
+
+    Let ``k = indices[i_{0}, ..., i_{q-1}]``. Then
+    ``output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2}] = input[k, j_{0}, ..., j_{r-2}]``.
+
+    ::
+
+         data = [
+             [1.0, 1.2],
+             [2.3, 3.4],
+             [4.5, 5.7],
+         ]
+         indices = [
+             [0, 1],
+             [1, 2],
+         ]
+         output = [
+             [
+                 [1.0, 1.2],
+                 [2.3, 3.4],
+             ],
+             [
+                 [2.3, 3.4],
+                 [4.5, 5.7],
+             ],
+         ]
+
     axis = 1 :
-    Let
-    k = indices[i_{0}, ..., i_{q-1}]
-    Then
-    output[j_{0}, i_{0}, ..., i_{q-1}, j_{1}, ..., j_{r-2}] = input[j_{0}, k, j_{1}, ..., j_{r-2}]
-    ```
-      data = [
-          [1.0, 1.2, 1.9],
-          [2.3, 3.4, 3.9],
-          [4.5, 5.7, 5.9],
-      ]
-      indices = [
-          [0, 2],
-      ]
-      axis = 1,
-      output = [
-              [[1.0, 1.9]],
-              [[2.3, 3.9]],
-              [[4.5, 5.9]],
-      ]
-    ```
+
+    Let ``k = indices[i_{0}, ..., i_{q-1}]``. Then
+    ``output[j_{0}, i_{0}, ..., i_{q-1}, j_{1}, ..., j_{r-2}] = input[j_{0}, k, j_{1}, ..., j_{r-2}]``.
+
+    ::
+
+         data = [
+             [1.0, 1.2, 1.9],
+             [2.3, 3.4, 3.9],
+             [4.5, 5.7, 5.9],
+         ]
+         indices = [
+             [0, 2],
+         ]
+         axis = 1,
+         output = [
+                 [[1.0, 1.9]],
+                 [[2.3, 3.9]],
+                 [[4.5, 5.9]],
+         ]
 
     Parameters
     ==========
@@ -6250,10 +6759,13 @@ def gather(
         Tensor of rank r >= 1.
     indices
         Type Tind.
-        Tensor of int32/int64 indices, of any rank q. All index values are expected to be within bounds [-s, s-1] along axis of size s. It is an error if any of the index values are out of bounds.
+        Tensor of int32/int64 indices, of any rank q. All index values are
+        expected to be within bounds [-s, s-1] along axis of size s. It is an
+        error if any of the index values are out of bounds.
     axis
         Attribute.
-        Which axis to gather on. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data).
+        Which axis to gather on. Negative value means counting dimensions from
+        the back. Accepted range is [-r, r-1] where r = rank(data).
 
     Returns
     =======
@@ -6287,54 +6799,63 @@ def gather_elements(
     axis: int = 0,
 ) -> Var:
     r"""
-    GatherElements takes two inputs `data` and `indices` of the same rank r >= 1
-    and an optional attribute `axis` that identifies an axis of `data`
-    (by default, the outer-most axis, that is axis 0). It is an indexing operation
-    that produces its output by indexing into the input data tensor at index
-    positions determined by elements of the `indices` tensor.
-    Its output shape is the same as the shape of `indices` and consists of one value
-    (gathered from the `data`) for each element in `indices`.
+    GatherElements takes two inputs ``data`` and ``indices`` of the same
+    rank r >= 1 and an optional attribute ``axis`` that identifies an axis
+    of ``data`` (by default, the outer-most axis, that is axis 0). It is an
+    indexing operation that produces its output by indexing into the input
+    data tensor at index positions determined by elements of the ``indices``
+    tensor. Its output shape is the same as the shape of ``indices`` and
+    consists of one value (gathered from the ``data``) for each element in
+    ``indices``.
+
     For instance, in the 3-D case (r = 3), the output produced is determined
     by the following equations:
-    ```
-      out[i][j][k] = input[index[i][j][k]][j][k] if axis = 0,
-      out[i][j][k] = input[i][index[i][j][k]][k] if axis = 1,
-      out[i][j][k] = input[i][j][index[i][j][k]] if axis = 2,
-    ```
-    This operator is also the inverse of ScatterElements. It is similar to Torch's gather operation.
+
+    ::
+
+         out[i][j][k] = input[index[i][j][k]][j][k] if axis = 0,
+         out[i][j][k] = input[i][index[i][j][k]][k] if axis = 1,
+         out[i][j][k] = input[i][j][index[i][j][k]] if axis = 2,
+
+    This operator is also the inverse of ScatterElements. It is similar to
+    Torch's gather operation.
+
     Example 1:
-    ```
-      data = [
-          [1, 2],
-          [3, 4],
-      ]
-      indices = [
-          [0, 0],
-          [1, 0],
-      ]
-      axis = 1
-      output = [
-          [1, 1],
-          [4, 3],
-      ]
-    ```
+
+    ::
+
+         data = [
+             [1, 2],
+             [3, 4],
+         ]
+         indices = [
+             [0, 0],
+             [1, 0],
+         ]
+         axis = 1
+         output = [
+             [1, 1],
+             [4, 3],
+         ]
+
     Example 2:
-    ```
-      data = [
-          [1, 2, 3],
-          [4, 5, 6],
-          [7, 8, 9],
-      ]
-      indices = [
-          [1, 2, 0],
-          [2, 0, 0],
-      ]
-      axis = 0
-      output = [
-          [4, 8, 3],
-          [7, 2, 3],
-      ]
-    ```
+
+    ::
+
+         data = [
+             [1, 2, 3],
+             [4, 5, 6],
+             [7, 8, 9],
+         ]
+         indices = [
+             [1, 2, 0],
+             [2, 0, 0],
+         ]
+         axis = 0
+         output = [
+             [4, 8, 3],
+             [7, 2, 3],
+         ]
 
     Parameters
     ==========
@@ -6343,10 +6864,13 @@ def gather_elements(
         Tensor of rank r >= 1.
     indices
         Type Tind.
-        Tensor of int32/int64 indices, with the same rank r as the input. All index values are expected to be within bounds [-s, s-1] along axis of size s. It is an error if any of the index values are out of bounds.
+        Tensor of int32/int64 indices, with the same rank r as the input. All
+        index values are expected to be within bounds [-s, s-1] along axis of
+        size s. It is an error if any of the index values are out of bounds.
     axis
         Attribute.
-        Which axis to gather on. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data).
+        Which axis to gather on. Negative value means counting dimensions from
+        the back. Accepted range is [-r, r-1] where r = rank(data).
 
     Returns
     =======
@@ -6380,56 +6904,114 @@ def gather_nd(
     batch_dims: int = 0,
 ) -> Var:
     r"""
-    Given `data` tensor of rank `r` >= 1, `indices` tensor of rank `q` >= 1, and `batch_dims` integer `b`, this operator gathers
-    slices of `data` into an output tensor of rank `q + r - indices_shape[-1] - 1 - b`.
-    `indices` is an q-dimensional integer tensor, best thought of as a `(q-1)`-dimensional tensor of index-tuples into `data`,
-    where each element defines a slice of `data`
-    `batch_dims` (denoted as `b`) is an integer indicating the number of batch dimensions, i.e the leading `b` number of dimensions of
-    `data` tensor and `indices` are representing the batches, and the gather starts from the `b+1` dimension.
+    Given ``data`` tensor of rank ``r`` >= 1, ``indices`` tensor of rank
+    ``q`` >= 1, and ``batch_dims`` integer ``b``, this operator gathers
+    slices of ``data`` into an output tensor of rank
+    ``q + r - indices_shape[-1] - 1 - b``.
+
+    ``indices`` is a q-dimensional integer tensor, best thought of as a
+    ``(q-1)``-dimensional tensor of index-tuples into ``data``, where each
+    element defines a slice of ``data``.
+
+    ``batch_dims`` (denoted as ``b``) is an integer indicating the number of
+    batch dimensions, i.e. the leading ``b`` number of dimensions of ``data``
+    tensor and ``indices`` are representing the batches, and the gather
+    starts from the ``b+1`` dimension.
+
     Some salient points about the inputs' rank and shape:
-    1) r >= 1 and q >= 1 are to be honored. There is no dependency condition to be met between ranks `r` and `q`
-    2) The first `b` dimensions of the shape of `indices` tensor and `data` tensor must be equal.
+
+    1) r >= 1 and q >= 1 are to be honored. There is no dependency condition
+       to be met between ranks ``r`` and ``q``
+
+    2) The first ``b`` dimensions of the shape of ``indices`` tensor and
+       ``data`` tensor must be equal.
+
     3) b < min(q, r) is to be honored.
-    4) The `indices_shape[-1]` should have a value between 1 (inclusive) and rank `r-b` (inclusive)
-    5) All values in `indices` are expected to be within bounds [-s, s-1] along axis of size `s` (i.e.) `-data_shape[i] <= indices[...,i] <= data_shape[i] - 1`.
-       It is an error if any of the index values are out of bounds.
+
+    4) The ``indices_shape[-1]`` should have a value between 1 (inclusive)
+       and rank ``r-b`` (inclusive)
+
+    5) All values in ``indices`` are expected to be within bounds [-s, s-1]
+       along axis of size ``s`` (i.e.)
+       ``-data_shape[i] <= indices[...,i] <= data_shape[i] - 1``. It is an
+       error if any of the index values are out of bounds.
+
     The output is computed as follows:
-    The output tensor is obtained by mapping each index-tuple in the `indices` tensor to the corresponding slice of the input `data`.
-    1) If `indices_shape[-1] > r-b` => error condition
-    2) If `indices_shape[-1] == r-b`, since the rank of `indices` is `q`, `indices` can be thought of as `N` `(q-b-1)`-dimensional tensors
-       containing 1-D tensors of dimension `r-b`, where `N` is an integer equals to the product of 1 and all the elements in the batch dimensions
-       of the indices_shape. Let us think of each such `r-b` ranked tensor as `indices_slice`. Each *scalar value* corresponding to `data[0:b-1,indices_slice]`
-       is filled into the corresponding location of the `(q-b-1)`-dimensional tensor to form the `output` tensor (Example 1 below)
-    3) If `indices_shape[-1] < r-b`, since the rank of `indices` is `q`, `indices` can be thought of as `N` `(q-b-1)`-dimensional tensor
-       containing 1-D tensors of dimension `< r-b`. Let us think of each such tensors as `indices_slice`. Each *tensor slice* corresponding
-       to `data[0:b-1, indices_slice , :]` is filled into the corresponding location of the `(q-b-1)`-dimensional tensor
-       to form the `output` tensor (Examples 2, 3, 4 and 5 below)
-    This operator is the inverse of `ScatterND`.
-    `Example 1`
-      batch_dims = 0
-      data    = [[0,1],[2,3]]   # data_shape = [2, 2]
-      indices = [[0,0],[1,1]]   # indices_shape = [2, 2]
-      output  = [0,3]           # output_shape = [2]
-    `Example 2`
-      batch_dims = 0
-      data    = [[0,1],[2,3]]  # data_shape = [2, 2]
-      indices = [[1],[0]]      # indices_shape = [2, 1]
-      output  = [[2,3],[0,1]]  # output_shape = [2, 2]
-    `Example 3`
-      batch_dims = 0
-      data    = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2]
-      indices = [[0,1],[1,0]]                 # indices_shape = [2, 2]
-      output  = [[2,3],[4,5]]                 # output_shape = [2, 2]
-    `Example 4`
-      batch_dims = 0
-      data    = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2]
-      indices = [[[0,1]],[[1,0]]]             # indices_shape = [2, 1, 2]
-      output  = [[[2,3]],[[4,5]]]             # output_shape = [2, 1, 2]
-    `Example 5`
-      batch_dims = 1
-      data    = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2]
-      indices = [[1],[0]]             # indices_shape = [2, 1]
-      output  = [[2,3],[4,5]]             # output_shape = [2, 2]
+
+    The output tensor is obtained by mapping each index-tuple in the
+    ``indices`` tensor to the corresponding slice of the input ``data``.
+
+    1) If ``indices_shape[-1] > r-b`` => error condition
+
+    2) If ``indices_shape[-1] == r-b``, since the rank of ``indices`` is
+       ``q``, ``indices`` can be thought of as ``N`` ``(q-b-1)``-dimensional
+       tensors containing 1-D tensors of dimension ``r-b``, where ``N`` is
+       an integer equal to the product of 1 and all the elements in the
+       batch dimensions of the indices_shape. Let us think of each such
+       ``r-b`` ranked tensor as ``indices_slice``. Each *scalar value*
+       corresponding to ``data[0:b-1,indices_slice]`` is filled into the
+       corresponding location of the ``(q-b-1)``-dimensional tensor to form
+       the ``output`` tensor (Example 1 below)
+
+    3) If ``indices_shape[-1] < r-b``, since the rank of ``indices`` is
+       ``q``, ``indices`` can be thought of as ``N`` ``(q-b-1)``-dimensional
+       tensors containing 1-D tensors of dimension ``< r-b``. Let us think of
+       each such tensor as ``indices_slice``. Each *tensor slice*
+       corresponding to ``data[0:b-1, indices_slice , :]`` is filled into
+       the corresponding location of the ``(q-b-1)``-dimensional tensor to
+       form the ``output`` tensor (Examples 2, 3, 4 and 5 below)
+
+    This operator is the inverse of ``ScatterND``.
+
+    ``Example 1``
+
+    batch_dims = 0
+
+    data = [[0,1],[2,3]] # data_shape = [2, 2]
+
+    indices = [[0,0],[1,1]] # indices_shape = [2, 2]
+
+    output = [0,3] # output_shape = [2]
+
+    ``Example 2``
+
+    batch_dims = 0
+
+    data = [[0,1],[2,3]] # data_shape = [2, 2]
+
+    indices = [[1],[0]] # indices_shape = [2, 1]
+
+    output = [[2,3],[0,1]] # output_shape = [2, 2]
+
+    ``Example 3``
+
+    batch_dims = 0
+
+    data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2]
+
+    indices = [[0,1],[1,0]] # indices_shape = [2, 2]
+
+    output = [[2,3],[4,5]] # output_shape = [2, 2]
+
+    ``Example 4``
+
+    batch_dims = 0
+
+    data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2]
+
+    indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2]
+
+    output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2]
+
+    ``Example 5``
+
+    batch_dims = 1
+
+    data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2]
+
+    indices = [[1],[0]] # indices_shape = [2, 1]
+
+    output = [[2,3],[4,5]] # output_shape = [2, 2]
 
     Parameters
     ==========
@@ -6438,10 +7020,13 @@ def gather_nd(
         Tensor of rank r >= 1.
     indices
         Type tensor(int64).
-        Tensor of rank q >= 1. All index values are expected to be within bounds [-s, s-1] along axis of size s. It is an error if any of the index values are out of bounds.
+        Tensor of rank q >= 1. All index values are expected to be within bounds
+        [-s, s-1] along axis of size s. It is an error if any of the index
+        values are out of bounds.
     batch_dims
         Attribute.
-        The number of batch dimensions. The gather of indexing starts from dimension of data[batch_dims:]
+        The number of batch dimensions. The gather of indexing starts from
+        dimension of data[batch_dims:]
 
     Returns
     =======
@@ -6480,29 +7065,44 @@ def gemm(
     r"""
     General Matrix multiplication:
     https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3
+
     A' = transpose(A) if transA else A
+
     B' = transpose(B) if transB else B
-    Compute Y = alpha * A' * B' + beta * C, where input tensor A has shape (M, K) or (K, M),
-    input tensor B has shape (K, N) or (N, K), input tensor C is broadcastable to shape (M, N),
-    and output tensor Y has shape (M, N). A will be transposed before doing the
-    computation if attribute transA is non-zero, same for B and transB.
-    This operator supports **unidirectional broadcasting** (tensor C should be unidirectional broadcastable to tensor A * B); for more details please check the doc (Broadcasting.md).
-    This operator has **optional** inputs/outputs. See the doc (IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
+
+    Compute Y = alpha \* A' \* B' + beta \* C, where input tensor A has
+    shape (M, K) or (K, M), input tensor B has shape (K, N) or (N, K), input
+    tensor C is broadcastable to shape (M, N), and output tensor Y has shape
+    (M, N). A will be transposed before doing the computation if attribute
+    transA is non-zero, same for B and transB. This operator supports
+    **unidirectional broadcasting** (tensor C should be unidirectional
+    broadcastable to tensor A \* B); for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
+    This operator has **optional** inputs/outputs. See `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/IR.md>`__ for more
+    details about the representation of optional arguments. An empty string
+    may be used in the place of an actual argument's name to indicate a
+    missing argument. Trailing optional arguments (those not followed by an
+    argument that is present) may also be simply omitted.
 
     Parameters
     ==========
     A
         Type T.
-        Input tensor A. The shape of A should be (M, K) if transA is 0, or (K, M) if transA is non-zero.
+        Input tensor A. The shape of A should be (M, K) if transA is 0, or (K,
+        M) if transA is non-zero.
     B
         Type T.
-        Input tensor B. The shape of B should be (K, N) if transB is 0, or (N, K) if transB is non-zero.
+        Input tensor B. The shape of B should be (K, N) if transB is 0, or (N,
+        K) if transB is non-zero.
     C
         Type T.
-        Optional input tensor C. If not specified, the computation is done as if C is a scalar 0. The shape of C should be unidirectional broadcastable to (M, N).
+        Optional input tensor C. If not specified, the computation is done as if
+        C is a scalar 0. The shape of C should be unidirectional broadcastable
+        to (M, N).
     alpha
         Attribute.
-        Scalar multiplier for the product of input tensors A * B.
+        Scalar multiplier for the product of input tensors A \* B.
     beta
         Attribute.
         Scalar multiplier for input tensor C.
@@ -6545,21 +7145,28 @@ def global_average_pool(
     X: Var,
 ) -> Var:
     r"""
-    GlobalAveragePool consumes an input tensor X and applies average pooling across
-     the values in the same channel. This is equivalent to AveragePool with kernel size
-     equal to the spatial dimension of input tensor.
+    GlobalAveragePool consumes an input tensor X and applies average pooling
+    across the values in the same channel. This is equivalent to AveragePool
+    with kernel size equal to the spatial dimension of input tensor.
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input. The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1.
+        Output data tensor from pooling across the input tensor. The output
+        tensor has the same rank as the input. The first two dimensions of
+        output shape are the same as the input (N x C), while the other
+        dimensions are all 1.
 
     Notes
     =====
@@ -6582,15 +7189,19 @@ def global_lp_pool(
     p: int = 2,
 ) -> Var:
     r"""
-    GlobalLpPool consumes an input tensor X and applies lp pool pooling across
-     the values in the same channel. This is equivalent to LpPool with kernel size
-     equal to the spatial dimension of input tensor.
+    GlobalLpPool consumes an input tensor X and applies lp pool pooling
+    across the values in the same channel. This is equivalent to LpPool with
+    kernel size equal to the spatial dimension of input tensor.
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size.
     p
         Attribute.
         p value of the Lp norm used to pool over the input data.
@@ -6599,7 +7210,10 @@ def global_lp_pool(
     =======
     Y : Var
         Type T.
-        Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input. The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1.
+        Output data tensor from pooling across the input tensor. The output
+        tensor has the same rank as the input. The first two dimensions of
+        output shape are the same as the input (N x C), while the other
+        dimensions are all 1.
 
     Notes
     =====
@@ -6623,20 +7237,27 @@ def global_max_pool(
 ) -> Var:
     r"""
     GlobalMaxPool consumes an input tensor X and applies max pooling across
-     the values in the same channel. This is equivalent to MaxPool with kernel size
-     equal to the spatial dimension of input tensor.
+    the values in the same channel. This is equivalent to MaxPool with
+    kernel size equal to the spatial dimension of input tensor.
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input. The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1.
+        Output data tensor from pooling across the input tensor. The output
+        tensor has the same rank as the input. The first two dimensions of
+        output shape are the same as the input (N x C), while the other
+        dimensions are all 1.
 
     Notes
     =====
@@ -6658,9 +7279,13 @@ def greater(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `greater` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulting from performing the ``greater`` logical
+    operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -6699,9 +7324,13 @@ def greater_or_equal(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `greater_equal` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulting from performing the ``greater_equal``
+    logical operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -6744,40 +7373,70 @@ def grid_sample(
     padding_mode: str = "zeros",
 ) -> Var:
     r"""
-    Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from `grid`.
-    Currently, only spatial (4-D) inputs are supported. For input `X` with shape (N, C, H, W) and `grid` with shape (N, H_out, W_out, 2),
-    the output `Y` will have shape (N, C, H_out, W_out).
-    The tensor `X` contains values at centers of square pixels in a H by W 2-dimensional image.
-    The tensor `grid` describes normalized positions where the output `Y` is to be computed
-    using a specified interpolation method (the mode) and a padding mode (for grid positions falling outside the 2-dimensional image).
-    Elements in `grid[N, H_out, W_out]` are size-2 vectors specifying positions in the 2-dimensional space of `X`.
-    They are used to interpolate output values of `Y[N, C, H_out, W_out]`.
-    The GridSample operator is often used in doing grid generator and sampler in the Spatial Transformer Networks (https://arxiv.org/abs/1506.02025).
-    See also in torch.nn.functional.grid_sample (https://pytorch.org/docs/master/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample).
+    Given an input ``X`` and a flow-field ``grid``, computes the output
+    ``Y`` using ``X`` values and pixel locations from ``grid``. Currently,
+    only spatial (4-D) inputs are supported. For input ``X`` with shape (N,
+    C, H, W) and ``grid`` with shape (N, H_out, W_out, 2), the output ``Y``
+    will have shape (N, C, H_out, W_out).
+
+    The tensor ``X`` contains values at centers of square pixels in a H by W
+    2-dimensional image. The tensor ``grid`` describes normalized positions
+    where the output ``Y`` is to be computed using a specified interpolation
+    method (the mode) and a padding mode (for grid positions falling outside
+    the 2-dimensional image).
+
+    Elements in ``grid[N, H_out, W_out]`` are size-2 vectors specifying
+    positions in the 2-dimensional space of ``X``. They are used to
+    interpolate output values of ``Y[N, C, H_out, W_out]``.
+
+    The GridSample operator is often used in doing grid generator and
+    sampler in the `Spatial Transformer
+    Networks <https://arxiv.org/abs/1506.02025>`__. See also in
+    `torch.nn.functional.grid_sample <https://pytorch.org/docs/master/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample>`__.
 
     Parameters
     ==========
     X
         Type T1.
-        4-D tensor of shape (N, C, H, W), where N is the batch size, C is the numbers of channels, H and W are the height and width of the input data.
+        4-D tensor of shape (N, C, H, W), where N is the batch size, C is the
+        number of channels, H and W are the height and width of the input data.
     grid
         Type T2.
-        Input offset, 4-D tensor of shape (N, H_out, W_out, 2), where H_out and W_out are the height and width of grid and output, Grid specifies the sampling pixel locations normalized by the input spatial dimensions. Therefore, it should have most values in the range of [-1, 1]. If grid has values outside the range of [-1, 1], the corresponding outputs will be handled as defined by padding_mode.
+        Input offset, 4-D tensor of shape (N, H_out, W_out, 2), where H_out and
+        W_out are the height and width of grid and output. Grid specifies the
+        sampling pixel locations normalized by the input spatial dimensions.
+        Therefore, it should have most values in the range of [-1, 1]. If grid
+        has values outside the range of [-1, 1], the corresponding outputs will
+        be handled as defined by padding_mode.
     align_corners
         Attribute.
-        If align_corners=1, the extrema (-1 and 1) are considered as referring to the center points of the input's corner pixels. If align_corners=0, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic.
+        If align_corners=1, the extrema (-1 and 1) are considered as referring
+        to the center points of the input's corner pixels. If align_corners=0,
+        they are instead considered as referring to the corner points of the
+        input's corner pixels, making the sampling more resolution agnostic.
     mode
         Attribute.
         Three interpolation modes: bilinear (default), nearest and bicubic.
     padding_mode
         Attribute.
-        Support padding modes for outside grid values: `zeros`(default), `border`, `reflection`. zeros: use 0 for out-of-bound grid locations, border: use border values for out-of-bound grid locations, reflection: use values at locations reflected by the border for out-of-bound grid locations. If index 0 represents the margin pixel, the reflected value at index -1 will be the same as the value at index 1. For location far away from the border, it will keep being reflected until becoming in bound. If pixel location x = -3.5 reflects by border -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' = 0.5.
+        Support padding modes for outside grid values: ``zeros``\ (default),
+        ``border``, ``reflection``. zeros: use 0 for out-of-bound grid
+        locations, border: use border values for out-of-bound grid locations,
+        reflection: use values at locations reflected by the border for
+        out-of-bound grid locations. If index 0 represents the margin pixel, the
+        reflected value at index -1 will be the same as the value at index 1.
+        For location far away from the border, it will keep being reflected
+        until becoming in bound. If pixel location x = -3.5 reflects by border
+        -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' =
+        0.5.
 
     Returns
     =======
     Y : Var
         Type T1.
-        4-D tensor of shape (N, C, H_out, W_out) of sampled values. For integer input types, intermediate values are computed as floating point and cast to integer at the end.
+        4-D tensor of shape (N, C, H_out, W_out) of sampled values. For integer
+        input types, intermediate values are computed as floating point and cast
+        to integer at the end.
 
     Notes
     =====
@@ -6807,7 +7466,8 @@ def hamming_window(
     periodic: int = 1,
 ) -> Var:
     r"""
-    Generates a Hamming window as described in the paper https://ieeexplore.ieee.org/document/1455106.
+    Generates a Hamming window as described in the paper
+    https://ieeexplore.ieee.org/document/1455106.
 
     Parameters
     ==========
@@ -6816,10 +7476,15 @@ def hamming_window(
         A scalar value indicating the length of the window.
     output_datatype
         Attribute.
-        The data type of the output tensor. Strictly must be one of the values from DataType enum in TensorProto whose values correspond to T2. The default value is 1 = FLOAT.
+        The data type of the output tensor. Strictly must be one of the values
+        from DataType enum in TensorProto whose values correspond to T2. The
+        default value is 1 = FLOAT.
     periodic
         Attribute.
-        If 1, returns a window to be used as periodic function. If 0, return a symmetric window. When 'periodic' is specified, hann computes a window of length size + 1 and returns the first size points. The default value is 1.
+        If 1, returns a window to be used as periodic function. If 0, return a
+        symmetric window. When 'periodic' is specified, hann computes a window
+        of length size + 1 and returns the first size points. The default value
+        is 1.
 
     Returns
     =======
@@ -6853,7 +7518,8 @@ def hann_window(
     periodic: int = 1,
 ) -> Var:
     r"""
-    Generates a Hann window as described in the paper https://ieeexplore.ieee.org/document/1455106.
+    Generates a Hann window as described in the paper
+    https://ieeexplore.ieee.org/document/1455106.
 
     Parameters
     ==========
@@ -6862,10 +7528,15 @@ def hann_window(
         A scalar value indicating the length of the window.
     output_datatype
         Attribute.
-        The data type of the output tensor. Strictly must be one of the values from DataType enum in TensorProto whose values correspond to T2. The default value is 1 = FLOAT.
+        The data type of the output tensor. Strictly must be one of the values
+        from DataType enum in TensorProto whose values correspond to T2. The
+        default value is 1 = FLOAT.
     periodic
         Attribute.
-        If 1, returns a window to be used as periodic function. If 0, return a symmetric window. When 'periodic' is specified, hann computes a window of length size + 1 and returns the first size points. The default value is 1.
+        If 1, returns a window to be used as periodic function. If 0, return a
+        symmetric window. When 'periodic' is specified, hann computes a window
+        of length size + 1 and returns the first size points. The default value
+        is 1.
 
     Returns
     =======
@@ -6899,9 +7570,9 @@ def hard_sigmoid(
     beta: float = 0.5,
 ) -> Var:
     r"""
-    HardSigmoid takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the HardSigmoid function, y = max(0, min(1, alpha * x + beta)),
-    is applied to the tensor elementwise.
+    HardSigmoid takes one input data (Tensor<T>) and produces one output
+    data (Tensor<T>) where the HardSigmoid function, y = max(0, min(1, alpha
+    \* x + beta)), is applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -6943,9 +7614,10 @@ def hard_swish(
     X: Var,
 ) -> Var:
     r"""
-    HardSwish takes one input data (Tensor<T>) and produces one output data (Tensor<T>) where
-    the HardSwish function, y = x * max(0, min(1, alpha * x + beta)) = x * HardSigmoid<alpha, beta>(x),
-    where alpha = 1/6 and beta = 0.5, is applied to the tensor elementwise.
+    HardSwish takes one input data (Tensor<T>) and produces one output data
+    (Tensor<T>) where the HardSwish function, y = x \* max(0, min(1, alpha
+    \* x + beta)) = x \* HardSigmoid<alpha, beta>(x), where alpha = 1/6 and
+    beta = 0.5, is applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -6981,10 +7653,13 @@ def hardmax(
 ) -> Var:
     r"""
     The operator computes the hardmax values for the given input:
-     Hardmax(element in input, axis) = 1 if the element is the first maximum value along the specified axis, 0 otherwise
-    The "axis" attribute indicates the dimension along which Hardmax
-    will be performed. The output tensor has the same shape
-    and contains the Hardmax values of the corresponding input.
+
+    Hardmax(element in input, axis) = 1 if the element is the first maximum
+    value along the specified axis, 0 otherwise
+
+    The "axis" attribute indicates the dimension along which Hardmax will be
+    performed. The output tensor has the same shape and contains the Hardmax
+    values of the corresponding input.
 
     Parameters
     ==========
@@ -6993,9 +7668,9 @@ def hardmax(
         The input tensor of rank >= axis.
     axis
         Attribute.
-        Describes the dimension Hardmax will be performed on.
-        Negative value means counting dimensions
-        from the back. Accepted range is [-r, r-1] where r = rank(input).
+        Describes the dimension Hardmax will be performed on. Negative value
+        means counting dimensions from the back. Accepted range is [-r, r-1]
+        where r = rank(input).
 
     Returns
     =======
@@ -7069,16 +7744,33 @@ def if_(
         Condition for the if
     else_branch
         Attribute.
-        Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must match the number of outputs in the then_branch.
+        Graph to run if condition is false. Has N outputs: values you wish to be
+        live-out to the enclosing scope. The number of outputs must match the
+        number of outputs in the then_branch.
     then_branch
         Attribute.
-        Graph to run if condition is true. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must match the number of outputs in the else_branch.
+        Graph to run if condition is true. Has N outputs: values you wish to be
+        live-out to the enclosing scope. The number of outputs must match the
+        number of outputs in the else_branch.
 
     Returns
     =======
     outputs : Sequence[Var]
         Type V.
-        Values that are live-out to the enclosing scope. The return values in the `then_branch` and `else_branch` must be of the same data type. The `then_branch` and `else_branch` may produce tensors with the same element type and different shapes. If corresponding outputs from the then-branch and the else-branch have static shapes S1 and S2, then the shape of the corresponding output variable of the if-node (if present) must be compatible with both S1 and S2 as it represents the union of both possible shapes.For example, if in a model file, the first output of `then_branch` is typed float tensor with shape [2] and the first output of `else_branch` is another float tensor with shape [3], If's first output should have (a) no shape set, or (b) a shape of rank 1 with neither `dim_value` nor `dim_param` set, or (c) a shape of rank 1 with a unique `dim_param`. In contrast, the first output cannot have the shape [2] since [2] and [3] are not compatible.
+        Values that are live-out to the enclosing scope. The return values in
+        the ``then_branch`` and ``else_branch`` must be of the same data type.
+        The ``then_branch`` and ``else_branch`` may produce tensors with the
+        same element type and different shapes. If corresponding outputs from
+        the then-branch and the else-branch have static shapes S1 and S2, then
+        the shape of the corresponding output variable of the if-node (if
+        present) must be compatible with both S1 and S2 as it represents the
+        union of both possible shapes. For example, if in a model file, the first
+        output of ``then_branch`` is typed float tensor with shape [2] and the
+        first output of ``else_branch`` is another float tensor with shape [3],
+        If's first output should have (a) no shape set, or (b) a shape of rank 1
+        with neither ``dim_value`` nor ``dim_param`` set, or (c) a shape of rank
+        1 with a unique ``dim_param``. In contrast, the first output cannot have
+        the shape [2] since [2] and [3] are not compatible.
 
     Notes
     =====
@@ -7112,14 +7804,19 @@ def instance_normalization(
     r"""
     Carries out instance normalization as described in the paper
     https://arxiv.org/abs/1607.08022.
-    y = scale * (x - mean) / sqrt(variance + epsilon) + B,
-    where mean and variance are computed per instance per channel.
+
+    y = scale \* (x - mean) / sqrt(variance + epsilon) + B, where mean and
+    variance are computed per instance per channel.
 
     Parameters
     ==========
     input
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size.
     scale
         Type T.
         The input 1-dimensional scale tensor of size C.
@@ -7171,10 +7868,14 @@ def isinf(
         input
     detect_negative
         Attribute.
-        (Optional) Whether map negative infinity to true. Default to 1 so that negative infinity induces true. Set this attribute to 0 if negative infinity should be mapped to false.
+        (Optional) Whether map negative infinity to true. Default to 1 so that
+        negative infinity induces true. Set this attribute to 0 if negative
+        infinity should be mapped to false.
     detect_positive
         Attribute.
-        (Optional) Whether map positive infinity to true. Default to 1 so that positive infinity induces true. Set this attribute to 0 if positive infinity should be mapped to false.
+        (Optional) Whether map positive infinity to true. Default to 1 so that
+        positive infinity induces true. Set this attribute to 0 if positive
+        infinity should be mapped to false.
 
     Returns
     =======
@@ -7244,20 +7945,33 @@ def lrn(
     size: int,
 ) -> Var:
     r"""
-    Local Response Normalization proposed in the AlexNet paper (https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
-    It normalizes over local input regions.
-    The local region is defined across the channels. For an element X[n, c, d1, ..., dk] in a tensor
-    of shape (N x C x D1 x D2, ..., Dk), its region is
-    {X[n, i, d1, ..., dk] | max(0, c - floor((size - 1) / 2)) <= i <= min(C - 1, c + ceil((size - 1) / 2))}.
-    square_sum[n, c, d1, ..., dk] = sum(X[n, i, d1, ..., dk] ^ 2),
-    where max(0, c - floor((size - 1) / 2)) <= i <= min(C - 1, c + ceil((size - 1) / 2)).
-    Y[n, c, d1, ..., dk] = X[n, c, d1, ..., dk] / (bias + alpha / size * square_sum[n, c, d1, ..., dk] ) ^ beta
+    Local Response Normalization proposed in the `AlexNet
+    paper <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`__.
+    It normalizes over local input regions. The local region is defined
+    across the channels. For an element X[n, c, d1, ..., dk] in a tensor of
+    shape (N x C x D1 x D2, ..., Dk), its region is {X[n, i, d1, ..., dk] \|
+    max(0, c - floor((size - 1) / 2)) <= i <= min(C - 1, c + ceil((size - 1)
+    / 2))}.
+
+    square_sum[n, c, d1, ..., dk] = sum(X[n, i, d1, ..., dk] ^ 2), where
+    max(0, c - floor((size - 1) / 2)) <= i <= min(C - 1, c + ceil((size - 1)
+    / 2)).
+
+    Y[n, c, d1, ..., dk] = X[n, c, d1, ..., dk] / (bias + alpha / size \*
+    square_sum[n, c, d1, ..., dk] ) ^ beta
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size. Optionally, if dimension denotation is in
+        effect, the operation expects the input data tensor to arrive with the
+        dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE,
+        DATA_FEATURE ...].
     alpha
         Attribute.
         Scaling parameter.
@@ -7319,88 +8033,166 @@ def lstm(
     r"""
     Computes an one-layer LSTM. This operator is usually supported via some
     custom implementation such as CuDNN.
+
     Notations:
-    `X` - input tensor
-    `i` - input gate
-    `o` - output gate
-    `f` - forget gate
-    `c` - cell gate
-    `t` - time step (t-1 means previous time step)
-    `W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates
-    `R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates
-    `Wb[iofc]` - W bias vectors for input, output, forget, and cell gates
-    `Rb[iofc]` - R bias vectors for input, output, forget, and cell gates
-    `P[iof]`  - P peephole weight vector for input, output, and forget gates
-    `WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates
-    `RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates
-    `WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates
-    `RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates
-    `PB[iof]`  - P peephole weight vector for backward input, output, and forget gates
-    `H` - Hidden state
-    `num_directions` - 2 if direction == bidirectional else 1
+
+    ``X`` - input tensor
+
+    ``i`` - input gate
+
+    ``o`` - output gate
+
+    ``f`` - forget gate
+
+    ``c`` - cell gate
+
+    ``t`` - time step (t-1 means previous time step)
+
+    ``W[iofc]`` - W parameter weight matrix for input, output, forget, and
+    cell gates
+
+    ``R[iofc]`` - R recurrence weight matrix for input, output, forget, and
+    cell gates
+
+    ``Wb[iofc]`` - W bias vectors for input, output, forget, and cell gates
+
+    ``Rb[iofc]`` - R bias vectors for input, output, forget, and cell gates
+
+    ``P[iof]`` - P peephole weight vector for input, output, and forget
+    gates
+
+    ``WB[iofc]`` - W parameter weight matrix for backward input, output,
+    forget, and cell gates
+
+    ``RB[iofc]`` - R recurrence weight matrix for backward input, output,
+    forget, and cell gates
+
+    ``WBb[iofc]`` - W bias vectors for backward input, output, forget, and
+    cell gates
+
+    ``RBb[iofc]`` - R bias vectors for backward input, output, forget, and
+    cell gates
+
+    ``PB[iof]`` - P peephole weight vector for backward input, output, and
+    forget gates
+
+    ``H`` - Hidden state
+
+    ``num_directions`` - 2 if direction == bidirectional else 1
+
     Activation functions:
-      Relu(x)                - max(0, x)
-      Tanh(x)                - (1 - e^{-2x})/(1 + e^{-2x})
-      Sigmoid(x)             - 1/(1 + e^{-x})
-      (NOTE: Below are optional)
-      Affine(x)              - alpha*x + beta
-      LeakyRelu(x)           - x if x >= 0 else alpha * x
-      ThresholdedRelu(x)     - x if x >= alpha else 0
-      ScaledTanh(x)          - alpha*Tanh(beta*x)
-      HardSigmoid(x)         - min(max(alpha*x + beta, 0), 1)
-      Elu(x)                 - x if x >= 0 else alpha*(e^x - 1)
-      Softsign(x)            - x/(1 + |x|)
-      Softplus(x)            - log(1 + e^x)
+
+    Relu(x) - max(0, x)
+
+    Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x})
+
+    Sigmoid(x) - 1/(1 + e^{-x})
+
+    (NOTE: Below are optional)
+
+    Affine(x) - alpha*x + beta
+
+    LeakyRelu(x) - x if x >= 0 else alpha \* x
+
+    ThresholdedRelu(x) - x if x >= alpha else 0
+
+    ScaledTanh(x) - alpha\*Tanh(beta\*x)
+
+    HardSigmoid(x) - min(max(alpha*x + beta, 0), 1)
+
+    Elu(x) - x if x >= 0 else alpha*(e^x - 1)
+
+    Softsign(x) - x/(1 + \|x|)
+
+    Softplus(x) - log(1 + e^x)
+
     Equations (Default: f=Sigmoid, g=Tanh, h=Tanh):
-      - it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
-      - ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
-      - ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
-      - Ct = ft (.) Ct-1 + it (.) ct
-      - ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
-      - Ht = ot (.) h(Ct)
-    This operator has **optional** inputs/outputs. See the doc (IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
+
+    -  it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
+
+    -  ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
+
+    -  ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
+
+    -  Ct = ft (.) Ct-1 + it (.) ct
+
+    -  ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
+
+    -  Ht = ot (.) h(Ct)
+
+    This operator has **optional** inputs/outputs. See
+    `the doc <https://github.com/onnx/onnx/blob/main/docs/IR.md>`__ for more details about the representation of optional arguments.
+    An empty string may be used in the place of an actual argument's name to indicate a missing argument.
+    Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
 
     Parameters
     ==========
     X
         Type T.
-        The input sequences packed (and potentially padded) into one 3-D tensor with the shape of `[seq_length, batch_size, input_size]`.
+        The input sequences packed (and potentially padded) into one 3-D tensor
+        with the shape of ``[seq_length, batch_size, input_size]``.
     W
         Type T.
-        The weight tensor for the gates. Concatenation of `W[iofc]` and `WB[iofc]` (if bidirectional) along dimension 0. The tensor has shape `[num_directions, 4*hidden_size, input_size]`.
+        The weight tensor for the gates. Concatenation of ``W[iofc]`` and
+        ``WB[iofc]`` (if bidirectional) along dimension 0. The tensor has shape
+        ``[num_directions, 4*hidden_size, input_size]``.
     R
         Type T.
-        The recurrence weight tensor. Concatenation of `R[iofc]` and `RB[iofc]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 4*hidden_size, hidden_size]`.
+        The recurrence weight tensor. Concatenation of ``R[iofc]`` and
+        ``RB[iofc]`` (if bidirectional) along dimension 0. This tensor has shape
+        ``[num_directions, 4*hidden_size, hidden_size]``.
     B
         Type T.
-        The bias tensor for input gate. Concatenation of `[Wb[iofc], Rb[iofc]]`, and `[WBb[iofc], RBb[iofc]]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 8*hidden_size]`. Optional: If not specified - assumed to be 0.
+        The bias tensor for input gate. Concatenation of
+        ``[Wb[iofc], Rb[iofc]]``, and ``[WBb[iofc], RBb[iofc]]`` (if
+        bidirectional) along dimension 0. This tensor has shape
+        ``[num_directions, 8*hidden_size]``. Optional: If not specified -
+        assumed to be 0.
     sequence_lens
         Type T1.
-        Optional tensor specifying lengths of the sequences in a batch. If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
+        Optional tensor specifying lengths of the sequences in a batch. If not
+        specified - assumed all sequences in the batch to have length
+        ``seq_length``. It has shape ``[batch_size]``.
     initial_h
         Type T.
-        Optional initial value of the hidden. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+        Optional initial value of the hidden. If not specified - assumed to be
+        0. It has shape ``[num_directions, batch_size, hidden_size]``.
     initial_c
         Type T.
-        Optional initial value of the cell. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+        Optional initial value of the cell. If not specified - assumed to be 0.
+        It has shape ``[num_directions, batch_size, hidden_size]``.
     P
         Type T.
-        The weight tensor for peepholes. Concatenation of `P[iof]` and `PB[iof]` (if bidirectional) along dimension 0. It has shape `[num_directions, 3*hidde_size]`. Optional: If not specified - assumed to be 0.
+        The weight tensor for peepholes. Concatenation of ``P[iof]`` and
+        ``PB[iof]`` (if bidirectional) along dimension 0. It has shape
+        ``[num_directions, 3*hidden_size]``. Optional: If not specified - assumed
+        to be 0.
     activation_alpha
         Attribute.
-        Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.For example with LeakyRelu, the default alpha is 0.01.
+        Optional scaling values used by some activation functions. The values
+        are consumed in the order of activation functions, for example (f, g, h)
+        in LSTM. Default values are the same as of corresponding ONNX
+        operators. For example with LeakyRelu, the default alpha is 0.01.
     activation_beta
         Attribute.
-        Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.
+        Optional scaling values used by some activation functions. The values
+        are consumed in the order of activation functions, for example (f, g, h)
+        in LSTM. Default values are the same as of corresponding ONNX operators.
     activations
         Attribute.
-        A list of 3 (or 6 if bidirectional) activation functions for input, output, forget, cell, and hidden. The activation functions must be one of the activation functions specified above. Optional: See the equations for default if not specified.
+        A list of 3 (or 6 if bidirectional) activation functions for input,
+        output, forget, cell, and hidden. The activation functions must be one
+        of the activation functions specified above. Optional: See the equations
+        for default if not specified.
     clip
         Attribute.
-        Cell clip threshold. Clipping bounds the elements of a tensor in the range of [-threshold, +threshold] and is applied to the input of activations. No clip if not specified.
+        Cell clip threshold. Clipping bounds the elements of a tensor in the
+        range of [-threshold, +threshold] and is applied to the input of
+        activations. No clip if not specified.
     direction
         Attribute.
-        Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward (default), reverse, or bidirectional.
+        Specify if the RNN is forward, reverse, or bidirectional. Must be one of
+        forward (default), reverse, or bidirectional.
     hidden_size
         Attribute.
         Number of neurons in the hidden layer
@@ -7409,19 +8201,30 @@ def lstm(
         Couple the input and forget gates if 1.
     layout
         Attribute.
-        The shape format of inputs X, initial_h, initial_c and outputs Y, Y_h, Y_c. If 0, the following shapes are expected: X.shape = [seq_length, batch_size, input_size], Y.shape = [seq_length, num_directions, batch_size, hidden_size], initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [num_directions, batch_size, hidden_size]. If 1, the following shapes are expected: X.shape = [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length, num_directions, hidden_size], initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [batch_size, num_directions, hidden_size].
+        The shape format of inputs X, initial_h, initial_c and outputs Y, Y_h,
+        Y_c. If 0, the following shapes are expected: X.shape = [seq_length,
+        batch_size, input_size], Y.shape = [seq_length, num_directions,
+        batch_size, hidden_size], initial_h.shape = Y_h.shape = initial_c.shape
+        = Y_c.shape = [num_directions, batch_size, hidden_size]. If 1, the
+        following shapes are expected: X.shape = [batch_size, seq_length,
+        input_size], Y.shape = [batch_size, seq_length, num_directions,
+        hidden_size], initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape
+        = [batch_size, num_directions, hidden_size].
 
     Returns
     =======
     Y : Var
         Type T.
-        A tensor that concats all the intermediate output values of the hidden. It has shape `[seq_length, num_directions, batch_size, hidden_size]`.
+        A tensor that concats all the intermediate output values of the hidden.
+        It has shape ``[seq_length, num_directions, batch_size, hidden_size]``.
     Y_h : Var
         Type T.
-        The last output value of the hidden. It has shape `[num_directions, batch_size, hidden_size]`.
+        The last output value of the hidden. It has shape
+        ``[num_directions, batch_size, hidden_size]``.
     Y_c : Var
         Type T.
-        The last output value of the cell. It has shape `[num_directions, batch_size, hidden_size]`.
+        The last output value of the cell. It has shape
+        ``[num_directions, batch_size, hidden_size]``.
 
     Notes
     =====
@@ -7469,45 +8272,30 @@ def layer_normalization(
     stash_type: int = 1,
 ) -> Tuple[Var, Var, Var]:
     r"""
-    This is layer normalization defined in ONNX as function.
-          The overall computation can be split into two stages.
-          The first stage is standardization, which makes the
-          normalized elements have zero mean and unit variances.
-          The computation required by standardization can be
-          described by the following equations.
-          ```
-          Mean = ReduceMean<axes=normalized_axes>(X)
-          D = Sub(X, Mean)
-          DD = Mul(D, D)
-          Var = ReduceMean<axes=normalized_axes>(DD)
-          VarEps = Add(Var, epsilon)
-          StdDev = Sqrt(VarEps)
-          InvStdDev = Reciprocal(StdDev)
-          Normalized = Mul(D, InvStdDev)
-          ```
-          where `normalized_axes` is `[axis, ..., rank of X - 1]`.
-          The variables `Var` and `StdDev` stand for variance and
-          standard deviation, respectively. The second output is
-          `Mean` and the last one is `InvStdDev`.
-          Depending on `stash_type` attribute, the actual computation
-          must happen in different floating-point precision.
-          For example, if `stash_type` is 1, this operator casts
-          all input variables to 32-bit float, perform the computation, and
-          finally cast `Normalized` back to the original type of `X`.
-          The second stage then scales and shifts the outcome of the
-          first stage using
-          ```
-          NormalizedScaled = Mul(Normalized, Scale)
-          Y = Add(NormalizedScaled, B)
-          ```
-          The second stage doesn't depends on `stash_type`.
-          All equations are in this syntax (https://github.com/onnx/onnx/blob/main/docs/Syntax.md).
-          The same variable (i.e., input, output, and attribute) uses
-          the same name in the equations above and this operator's definition.
-          Let `d[i]` indicate the i-th dimension of `X`.
-          If `X`'s shape is `[d[0], ..., d[axis-1], d[axis], ..., d[rank-1]]`,
-          the shape of `Mean` and `InvStdDev` is `[d[0], ..., d[axis-1], 1, ..., 1]`.
-          `Y` and `X` have the same shape.
+    This is layer normalization defined in ONNX as function. The overall
+    computation can be split into two stages. The first stage is
+    standardization, which makes the normalized elements have zero mean and
+    unit variances. The computation required by standardization can be
+    described by the following equations.
+    ``Mean = ReduceMean<axes=normalized_axes>(X)``, ``D = Sub(X, Mean)``, ``DD = Mul(D, D)``, ``Var = ReduceMean<axes=normalized_axes>(DD)``, ``VarEps = Add(Var, epsilon)``, ``StdDev = Sqrt(VarEps)``, ``InvStdDev = Reciprocal(StdDev)``, ``Normalized = Mul(D, InvStdDev)``,
+    where ``normalized_axes`` is ``[axis, ..., rank of X - 1]``. The
+    variables ``Var`` and ``StdDev`` stand for variance and standard
+    deviation, respectively. The second output is ``Mean`` and the last one
+    is ``InvStdDev``. Depending on ``stash_type`` attribute, the actual
+    computation must happen in different floating-point precision. For
+    example, if ``stash_type`` is 1, this operator casts all input variables
+    to 32-bit float, performs the computation, and finally casts
+    ``Normalized`` back to the original type of ``X``. The second stage then
+    scales and shifts the outcome of the first stage using
+    ``NormalizedScaled = Mul(Normalized, Scale)``, ``Y = Add(NormalizedScaled, B)``.
+    The second stage doesn't depend on ``stash_type``. All equations are in
+    `this syntax <https://github.com/onnx/onnx/blob/main/docs/Syntax.md>`__.
+    The same variable (i.e., input, output, and attribute) uses the same
+    name in the equations above and this operator's definition. Let ``d[i]``
+    indicate the i-th dimension of ``X``. If ``X``'s shape is
+    ``[d[0], ..., d[axis-1], d[axis], ..., d[rank-1]]``, the shape of
+    ``Mean`` and ``InvStdDev`` is ``[d[0], ..., d[axis-1], 1, ..., 1]``.
+    ``Y`` and ``X`` have the same shape.
 
     Parameters
     ==========
@@ -7522,13 +8310,15 @@ def layer_normalization(
         Bias tensor.
     axis
         Attribute.
-        The first normalization dimension. If rank(X) is r, axis' allowed range is [-r, r]. Negative value means counting dimensions from the back.
+        The first normalization dimension. If rank(X) is r, axis' allowed range
+        is [-r, r]. Negative value means counting dimensions from the back.
     epsilon
         Attribute.
         The epsilon value to use to avoid division by zero.
     stash_type
         Attribute.
-        Type of Mean and InvStdDev. This also specifies stage one's computation precision.
+        Type of Mean and InvStdDev. This also specifies stage one's computation
+        precision.
 
     Returns
     =======
@@ -7540,7 +8330,8 @@ def layer_normalization(
         Saved mean used during training to speed up gradient computation
     InvStdDev : Var
         Type U.
-        Saved inverse standard deviation used during training to speed up gradient computation.
+        Saved inverse standard deviation used during training to speed up
+        gradient computation.
 
     Notes
     =====
@@ -7570,11 +8361,14 @@ def leaky_relu(
     alpha: float = 0.009999999776482582,
 ) -> Var:
     r"""
-    LeakyRelu takes input data (Tensor<T>) and an argument alpha, and produces one
-    output data (Tensor<T>) where the function `f(x) = alpha * x for x < 0`,
-    `f(x) = x for x >= 0`, is applied to the data tensor elementwise.
+    LeakyRelu takes input data (Tensor<T>) and an argument alpha, and
+    produces one output data (Tensor<T>) where the function
+    ``f(x) = alpha * x for x < 0``, ``f(x) = x for x >= 0``, is applied to
+    the data tensor elementwise.
+
     **History**
-    - Version 16 adds bfloat16 to the types allowed.
+
+    -  Version 16 adds bfloat16 to the types allowed.
 
     Parameters
     ==========
@@ -7613,9 +8407,13 @@ def less(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `less` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulting from performing the ``less`` logical
+    operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -7654,9 +8452,13 @@ def less_or_equal(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `less_equal` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulting from performing the ``less_equal`` logical
+    operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -7730,10 +8532,12 @@ def log_softmax(
 ) -> Var:
     r"""
     The operator computes the log of softmax values for the given input:
-     LogSoftmax(input, axis) = Log(Softmax(input, axis=axis))
-    The "axis" attribute indicates the dimension along which LogSoftmax
-    will be performed. The output tensor has the same shape
-    and contains the LogSoftmax values of the corresponding input.
+
+    LogSoftmax(input, axis) = Log(Softmax(input, axis=axis))
+
+    The "axis" attribute indicates the dimension along which LogSoftmax will
+    be performed. The output tensor has the same shape and contains the
+    LogSoftmax values of the corresponding input.
 
     Parameters
     ==========
@@ -7742,9 +8546,9 @@ def log_softmax(
         The input tensor of rank >= axis.
     axis
         Attribute.
-        Describes the dimension LogSoftmax will be performed on.
-        Negative value means counting dimensions
-        from the back. Accepted range is [-r, r-1] where r = rank(input).
+        Describes the dimension LogSoftmax will be performed on. Negative value
+        means counting dimensions from the back. Accepted range is [-r, r-1]
+        where r = rank(input).
 
     Returns
     =======
@@ -7777,139 +8581,180 @@ def loop(
     body: Callable[..., Iterable[Var]],
 ) -> Sequence[Var]:
     r"""
-    Generic Looping construct. This loop has multiple termination conditions:
-    1) Trip count. Iteration count specified at runtime. Set by
-       specifying the input M. Optional. Set to empty string to omit.
-       Note that a static trip count (specified at graph construction time) can be
+    Generic Looping construct. This loop has multiple termination
+    conditions:
+
+    1) Trip count. Iteration count specified at runtime. Set by specifying
+       the input M. Optional. Set to empty string to omit. Note that a
+       static trip count (specified at graph construction time) can be
        specified by passing in a constant node for input M.
-    2) Loop termination condition. This is an input to the op that determines
-       whether to run the first iteration and also a loop-carried dependency for
-       the body graph. The body graph must yield a value for the condition variable,
-       whether this input is provided or not.
-    This table summarizes the operating modes of this operator with equivalent
-    C-style code:
-        Operator inputs defined as (max_trip_count, condition_var).
-        input ("", ""):
-            for (int i=0; ; ++i) {
-              cond = ... // Note this value is ignored, but is required in the body
-            }
-        input ("", cond) // Note this is analogous to a while loop
-            bool cond = ...;
-            for (int i=0; cond; ++i) {
-              cond = ...;
-            }
-        input ("", 1) // Note this is analogous to a do-while loop
-            bool cond = true
-            for (int i=0; cond; ++i) {
-              cond = ...;
-            }
-        input (trip_count, "") // Note this is analogous to a for loop
-            int trip_count = ...
-            for (int i=0; i < trip_count; ++i) {
-              cond = ...; // ignored
-            }
-        input (trip_count, cond)
-            int trip_count = ...;
-            bool cond = ...;
-            for (int i=0; i < trip_count && cond; ++i) {
-              cond = ...;
-            }
+    2) Loop termination condition. This is an input to the op that
+       determines whether to run the first iteration and also a loop-carried
+       dependency for the body graph. The body graph must yield a value for
+       the condition variable, whether this input is provided or not.
+
+    This table summarizes the operating modes of this operator with
+    equivalent C-style code:
+
+    ::
+
+       Operator inputs defined as (max_trip_count, condition_var).
+
+       input ("", ""):
+           for (int i=0; ; ++i) {
+             cond = ... // Note this value is ignored, but is required in the body
+           }
+
+       input ("", cond) // Note this is analogous to a while loop
+           bool cond = ...;
+           for (int i=0; cond; ++i) {
+             cond = ...;
+           }
+
+       input ("", 1) // Note this is analogous to a do-while loop
+           bool cond = true
+           for (int i=0; cond; ++i) {
+             cond = ...;
+           }
+
+       input (trip_count, "") // Note this is analogous to a for loop
+           int trip_count = ...
+           for (int i=0; i < trip_count; ++i) {
+             cond = ...; // ignored
+           }
+
+       input (trip_count, cond)
+           int trip_count = ...;
+           bool cond = ...;
+           for (int i=0; i < trip_count && cond; ++i) {
+             cond = ...;
+           }
+
     *Sample usage - cond as well as trip count*
-        graph predict-net {
-          %a = Constant[value = <Scalar Tensor [3]>]()
-          %b = Constant[value = <Scalar Tensor [6]>]()
-          %keepgoing = Constant[value = <Scalar Tensor [1]>]()
-          %max_trip_count = Constant[value = <Scalar Tensor [10]>]()
-          %keepgoing_out, %b_out, %user_defined_vals = Loopbody = <graph body-net> (%max_trip_count, %keepgoing, %b)
-          return
-        }
-        graph body-net (
-          %i[INT32, scalar]           // iteration number
-          %keepgoing_in[BOOL, scalar] // incoming loop-termination-condition; not used
-          %b_in[INT32, scalar]        // incoming value of loop-carried-dependency b
-        ) {
-          %my_local = Add(%a, %b_in)
-          %b_out = Sub(%a, %b_in) // outgoing value of loop-carried-dependency b
-          %keepgoing_out = Greater(%my_local, %b_out) // outgoing loop-termination-condition
-          %user_defined_val = Add(%b_in, %b_in) // scan-output value to be accumulated
-          return %keepgoing_out, %b_out, %user_defined_val
-        }
+
+    ::
+
+       graph predict-net {
+         %a = Constant[value = <Scalar Tensor [3]>]()
+         %b = Constant[value = <Scalar Tensor [6]>]()
+         %keepgoing = Constant[value = <Scalar Tensor [1]>]()
+         %max_trip_count = Constant[value = <Scalar Tensor [10]>]()
+         %keepgoing_out, %b_out, %user_defined_vals = Loop[body = <graph body-net>](%max_trip_count, %keepgoing, %b)
+         return
+       }
+
+       graph body-net (
+         %i[INT32, scalar]           // iteration number
+         %keepgoing_in[BOOL, scalar] // incoming loop-termination-condition; not used
+         %b_in[INT32, scalar]        // incoming value of loop-carried-dependency b
+       ) {
+         %my_local = Add(%a, %b_in)
+         %b_out = Sub(%a, %b_in) // outgoing value of loop-carried-dependency b
+         %keepgoing_out = Greater(%my_local, %b_out) // outgoing loop-termination-condition
+         %user_defined_val = Add(%b_in, %b_in) // scan-output value to be accumulated
+         return %keepgoing_out, %b_out, %user_defined_val
+       }
+
     *Sample equivalent C code*
-        {
-          /* User-defined code (enclosing scope) */
-          int a = 3, b = 6;
-          bool keepgoing = true; // Analogous to input cond
-          /* End user-defined code */
-          /* Implicitly-defined code */
-          const int max_trip_count = 10; // Analogous to input M
-          int user_defined_vals[]; // Imagine this is resizable
-          /* End implicitly-defined code */
-          /* initialize loop-carried variables and scan-output variables */
-          bool keepgoing_out = keepgoing
-          int b_out = b
-          for (int i=0; i < max_trip_count && keepgoing_out; ++i) {
-            /* Implicitly-defined code: bind actual parameter values
-               to formal parameter variables of loop-body */
-            bool keepgoing_in = keepgoing_out;
-            bool b_in = b_out;
-            /* User-defined code (loop body) */
-            int my_local = a + b_in; // Reading value "a" from the enclosing scope is fine
-            b_out = a - b_in;
-            keepgoing_out = my_local > b_out;
-            user_defined_val = b_in + b_in; // b_in and b_out are different variables
-            /* End user-defined code */
-            /* Implicitly defined-code */
-            user_defined_vals[i] = user_defined_val // accumulate scan-output values
-          }
-          // int t = my_local; // Can't do this. my_local is not accessible here.
-          // The values below are bound to the output variables of the loop and therefore accessible
-          // b_out; user_defined_vals; keepgoing_out;
-        }
+
+    ::
+
+       {
+         /* User-defined code (enclosing scope) */
+         int a = 3, b = 6;
+         bool keepgoing = true; // Analogous to input cond
+         /* End user-defined code */
+
+         /* Implicitly-defined code */
+         const int max_trip_count = 10; // Analogous to input M
+         int user_defined_vals[]; // Imagine this is resizable
+         /* End implicitly-defined code */
+         /* initialize loop-carried variables and scan-output variables */
+         bool keepgoing_out = keepgoing
+         int b_out = b
+
+         for (int i=0; i < max_trip_count && keepgoing_out; ++i) {
+           /* Implicitly-defined code: bind actual parameter values
+              to formal parameter variables of loop-body */
+           bool keepgoing_in = keepgoing_out;
+           bool b_in = b_out;
+
+           /* User-defined code (loop body) */
+           int my_local = a + b_in; // Reading value "a" from the enclosing scope is fine
+           b_out = a - b_in;
+           keepgoing_out = my_local > b_out;
+           user_defined_val = b_in + b_in; // b_in and b_out are different variables
+           /* End user-defined code */
+
+           /* Implicitly defined-code */
+           user_defined_vals[i] = user_defined_val // accumulate scan-output values
+         }
+         // int t = my_local; // Can't do this. my_local is not accessible here.
+
+         // The values below are bound to the output variables of the loop and therefore accessible
+         // b_out; user_defined_vals; keepgoing_out;
+       }
+
     There are several things of note in this code snippet:
-    1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can
-       be referenced in the inputs of the loop.
-    2) Any values computed in the loop body that needs to be used in a subsequent
-       iteration or after the loop are modelled using a pair of variables in the loop-body,
-       consisting of an input variable (eg., b_in) and an output variable (eg., b_out).
-       These are referred to as loop-carried dependences. The loop operation node
-       supplies the input value of the input variable for the first iteration, and
-       returns the output value of the output variable produced by the final
-       iteration.
-    3) Scan_output variables are used to implicitly concatenate values computed across
-       all the iterations. In the above example, the value of user_defined_val computed
-       over all iterations are concatenated and returned as the value of user_defined_vals
-       after the loop.
+
+    1) Values from the enclosing scope (i.e. variable "a" here) are in scope
+       and can be referenced in the inputs of the loop.
+    2) Any values computed in the loop body that need to be used in a
+       subsequent iteration or after the loop are modelled using a pair of
+       variables in the loop-body, consisting of an input variable (e.g.,
+       b_in) and an output variable (e.g., b_out). These are referred to as
+       loop-carried dependences. The loop operation node supplies the input
+       value of the input variable for the first iteration, and returns the
+       output value of the output variable produced by the final iteration.
+    3) Scan_output variables are used to implicitly concatenate values
+       computed across all the iterations. In the above example, the values
+       of user_defined_val computed over all iterations are concatenated and
+       returned as the value of user_defined_vals after the loop.
     4) Values created in the body cannot be accessed in the enclosing scope,
        except using the mechanism described above.
-    Note that the semantics of this op support "diagonal" or "wavefront" execution.
-    (See Step 3 here for an example:
+
+    Note that the semantics of this op support "diagonal" or "wavefront"
+    execution. (See Step 3 here for an example:
     https://devblogs.nvidia.com/optimizing-recurrent-neural-networks-cudnn-5/).
-    Frontends should emit multi-layer RNNs as a series of While operators (with
-    time being the inner looping dimension), with each successive layer consuming
-    the scan_outputs from the previous layer, possibly going through several
-    point-wise operators (e.g. dropout, residual connections, linear layer).
-    The input/output of subgraph (produced by loop node) matching is based on order instead of name. The implementation will figure out the names based on this order.
+    Frontends should emit multi-layer RNNs as a series of While operators
+    (with time being the inner looping dimension), with each successive
+    layer consuming the scan_outputs from the previous layer, possibly going
+    through several point-wise operators (e.g. dropout, residual
+    connections, linear layer).
+
+    The input/output of subgraph (produced by loop node) matching is based
+    on order instead of name. The implementation will figure out the names
+    based on this order.
 
     Parameters
     ==========
     M
         Type I.
-        A maximum trip-count for the loop specified at runtime. Optional. Pass empty string to skip.
+        A maximum trip-count for the loop specified at runtime. Optional. Pass
+        empty string to skip.
     cond
         Type B.
         A boolean termination condition. Optional. Pass empty string to skip.
     v_initial
         Type V.
-        The initial values of any loop-carried dependencies (values that change across loop iterations)
+        The initial values of any loop-carried dependencies (values that change
+        across loop iterations)
     body
         Attribute.
-        The graph run each iteration. It has 2+N inputs: (iteration_num, condition, loop carried dependencies...). It has 1+N+K outputs: (condition, loop carried dependencies..., scan_outputs...). Each scan_output is created by concatenating the value of the specified output value at the end of each iteration of the loop. It is an error if the dimensions or data type of these scan_outputs change across loop iterations.
+        The graph run each iteration. It has 2+N inputs: (iteration_num,
+        condition, loop carried dependencies...). It has 1+N+K outputs:
+        (condition, loop carried dependencies..., scan_outputs...). Each
+        scan_output is created by concatenating the value of the specified
+        output value at the end of each iteration of the loop. It is an error if
+        the dimensions or data type of these scan_outputs change across loop
+        iterations.
 
     Returns
     =======
     v_final_and_scan_outputs : Sequence[Var]
         Type V.
-        Final N loop carried dependency values then K scan_outputs. Scan outputs must be Tensors.
+        Final N loop carried dependency values then K scan_outputs. Scan outputs
+        must be Tensors.
 
     Notes
     =====
@@ -7993,20 +8838,31 @@ def lp_pool(
     strides: Optional[Iterable[int]] = None,
 ) -> Var:
     r"""
-    LpPool consumes an input tensor X and applies Lp pooling across
-     the tensor according to kernel sizes, stride sizes, and pad lengths.
-     Lp pooling consisting of computing the Lp norm on all values of a subset
-     of the input tensor according to the kernel size and downsampling the
-     data into the output tensor Y for further processing.
+    LpPool consumes an input tensor X and applies Lp pooling across the
+    tensor according to kernel sizes, stride sizes, and pad lengths. Lp
+    pooling consists of computing the Lp norm on all values of a subset of
+    the input tensor according to the kernel size and downsampling the data
+    into the output tensor Y for further processing.
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size.
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = ceil(input_shape[i] / strides[i])`` for each axis
+        ``i``. The padding is split between the two sides equally or almost
+        equally (depending on whether it is even or odd). In case the padding is
+        an odd number, the extra padding is added at the end for SAME_UPPER and
+        at the beginning for SAME_LOWER.
     kernel_shape
         Attribute.
         The size of the kernel along each axis.
@@ -8015,16 +8871,26 @@ def lp_pool(
         p value of the Lp norm used to pool over the input data.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The values represent the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follows [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels
+        added at the beginning of axis ``i`` and xi_end is the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor from Lp pooling across the input tensor. Dimensions will vary based on various kernel, stride, and pad sizes.
+        Output data tensor from Lp pooling across the input tensor. Dimensions
+        will vary based on various kernel, stride, and pad sizes.
 
     Notes
     =====
@@ -8052,7 +8918,8 @@ def mat_mul(
     B: Var,
 ) -> Var:
     r"""
-    Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
+    Matrix product that behaves like numpy.matmul:
+    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
 
     Parameters
     ==========
@@ -8067,7 +8934,7 @@ def mat_mul(
     =======
     Y : Var
         Type T.
-        Matrix multiply results from A * B
+        Matrix multiply results from A \* B
 
     Notes
     =====
@@ -8092,8 +8959,10 @@ def mat_mul_integer(
     b_zero_point: Optional[Var] = None,
 ) -> Var:
     r"""
-    Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
-    The production MUST never overflow. The accumulation may overflow if and only if in 32 bits.
+    Matrix product that behaves like numpy.matmul:
+    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+    The production MUST never overflow. The accumulation may overflow if and
+    only if in 32 bits.
 
     Parameters
     ==========
@@ -8105,16 +8974,26 @@ def mat_mul_integer(
         N-dimensional matrix B
     a_zero_point
         Type T1.
-        Zero point tensor for input 'A'. It's optional and default value is 0. It could be a scalar or N-D tensor. Scalar refers to per tensor quantization whereas N-D refers to per row quantization. If the input is 2D of shape [M, K] then zero point tensor may be an M element vector [zp_1, zp_2, ..., zp_M]. If the input is N-D tensor with shape [D1, D2, M, K] then zero point tensor may have shape [D1, D2, M, 1].
+        Zero point tensor for input 'A'. It's optional and default value is 0.
+        It could be a scalar or N-D tensor. Scalar refers to per tensor
+        quantization whereas N-D refers to per row quantization. If the input is
+        2D of shape [M, K] then zero point tensor may be an M element vector
+        [zp_1, zp_2, ..., zp_M]. If the input is N-D tensor with shape [D1, D2,
+        M, K] then zero point tensor may have shape [D1, D2, M, 1].
     b_zero_point
         Type T2.
-        Zero point tensor for input 'B'. It's optional and default value is 0. It could be a scalar or a N-D tensor, Scalar refers to per tensor quantization whereas N-D refers to per col quantization. If the input is 2D of shape [K, N] then zero point tensor may be an N element vector [zp_1, zp_2, ..., zp_N]. If the input is N-D tensor with shape [D1, D2, K, N] then zero point tensor may have shape [D1, D2, 1, N].
+        Zero point tensor for input 'B'. It's optional and default value is 0.
+        It could be a scalar or a N-D tensor, Scalar refers to per tensor
+        quantization whereas N-D refers to per col quantization. If the input is
+        2D of shape [K, N] then zero point tensor may be an N element vector
+        [zp_1, zp_2, ..., zp_N]. If the input is N-D tensor with shape [D1, D2,
+        K, N] then zero point tensor may have shape [D1, D2, 1, N].
 
     Returns
     =======
     Y : Var
         Type T3.
-        Matrix multiply results from A * B
+        Matrix multiply results from A \* B
 
     Notes
     =====
@@ -8140,9 +9019,11 @@ def max(
     data_0: Sequence[Var],
 ) -> Var:
     r"""
-    Element-wise max of each of the input tensors (with Numpy-style broadcasting support).
-    All inputs and outputs must have the same data type.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Element-wise max of each of the input tensors (with Numpy-style
+    broadcasting support). All inputs and outputs must have the same data
+    type. This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -8183,68 +9064,113 @@ def max_pool(
     strides: Optional[Iterable[int]] = None,
 ) -> Tuple[Var, Var]:
     r"""
-    MaxPool consumes an input tensor X and applies max pooling across
-     the tensor according to kernel sizes, stride sizes, and pad lengths.
-     max pooling consisting of computing the max on all values of a
-     subset of the input tensor according to the kernel size and downsampling the
-     data into the output tensor Y for further processing. The output spatial shape will be following:
-     ```
-     output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1)
-     ```
-     or
-     ```
-     output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1)
-     ```
-     if ceil_mode is enabled
-     ```
-     * pad_shape[i] is sum of pads along axis i
-     ```
-     `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
-     ```
-     VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i])
-     SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
-     ```
-     And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
-     ```
-     pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i]
-     ```
-     The output of each pooling window is maximum number of elements exclude pad.
+    MaxPool consumes an input tensor X and applies max pooling across the
+    tensor according to kernel sizes, stride sizes, and pad lengths. max
+    pooling consists of computing the max on all values of a subset of the
+    input tensor according to the kernel size and downsampling the data into
+    the output tensor Y for further processing. The output spatial shape
+    will be following:
+
+    ::
+
+       output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1)
+
+    or
+
+    ::
+
+       output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1)
+
+    if ceil_mode is enabled
+
+    ::
+
+       * pad_shape[i] is sum of pads along axis i
+
+    ``auto_pad`` is a DEPRECATED attribute. If you are using it currently,
+    the output spatial shape will be following:
+
+    ::
+
+       VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i])
+       SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
+
+    And pad shape will be following if ``SAME_UPPER`` or ``SAME_LOWER``:
+
+    ::
+
+       pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i]
+
+    The output of each pooling window is the maximum of its elements,
+    excluding padding.
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data. For non
+        image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn),
+        where N is the batch size. Optionally, if dimension denotation is in
+        effect, the operation expects the input data tensor to arrive with the
+        dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE,
+        DATA_FEATURE ...].
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = ceil(input_shape[i] / strides[i])`` for each axis
+        ``i``. The padding is split between the two sides equally or almost
+        equally (depending on whether it is even or odd). In case the padding is
+        an odd number, the extra padding is added at the end for SAME_UPPER and
+        at the beginning for SAME_LOWER.
     ceil_mode
         Attribute.
         Whether to use ceil or floor (default) to compute the output shape.
     dilations
         Attribute.
-        Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.
+        Dilation value along each spatial axis of filter. If not present, the
+        dilation defaults to 1 along each spatial axis.
     kernel_shape
         Attribute.
         The size of the kernel along each axis.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The values represent the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follows [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels
+        added at the beginning of axis ``i`` and xi_end is the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     storage_order
         Attribute.
-        The storage order of the tensor. 0 is row major, and 1 is column major. This attribute is used only to convert an n-tuple index value into a single integer value for producing the second output.
+        The storage order of the tensor. 0 is row major, and 1 is column major.
+        This attribute is used only to convert an n-tuple index value into a
+        single integer value for producing the second output.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
     Y : Var
         Type T.
-        Output data tensor from average or max pooling across the input tensor. Dimensions will vary based on various kernel, stride, and pad sizes. Floor value of the dimension is used
+        Output data tensor from average or max pooling across the input tensor.
+        Dimensions will vary based on various kernel, stride, and pad sizes.
+        Floor value of the dimension is used
     Indices : Var
         Type I.
-        Indices tensor from max pooling across the input tensor. The dimensions of indices are the same as output tensor. The values in indices of are the indices of the selected values during pooling. The indices are computed as flatten 1-D tensor, and the indices do not consider padding. So the values in indices are in [0, N x C x D1 x ... x Dn).
+        Indices tensor from max pooling across the input tensor. The dimensions
+        of indices are the same as output tensor. The values in indices are
+        the indices of the selected values during pooling. The indices are
+        computed as a flattened 1-D tensor, and the indices do not consider padding.
+        So the values in indices are in [0, N x C x D1 x ... x Dn).
 
     Notes
     =====
@@ -8278,30 +9204,35 @@ def max_roi_pool(
     spatial_scale: float = 1.0,
 ) -> Var:
     r"""
-    ROI max pool consumes an input tensor X and region of interests (RoIs) to
-     apply max pooling across each RoI, to produce output 4-D tensor of shape
-     (num_rois, channels, pooled_shape[0], pooled_shape[1]).
+    ROI max pool consumes an input tensor X and region of interests (RoIs)
+    to apply max pooling across each RoI, to produce output 4-D tensor of
+    shape (num_rois, channels, pooled_shape[0], pooled_shape[1]).
 
     Parameters
     ==========
     X
         Type T.
-        Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data.
+        Input data tensor from the previous operator; dimensions for image case
+        are (N x C x H x W), where N is the batch size, C is the number of
+        channels, and H and W are the height and the width of the data.
     rois
         Type T.
-        RoIs (Regions of Interest) to pool over. Should be a 2-D tensor of shape (num_rois, 5) given as [[batch_id, x1, y1, x2, y2], ...].
+        RoIs (Regions of Interest) to pool over. Should be a 2-D tensor of shape
+        (num_rois, 5) given as [[batch_id, x1, y1, x2, y2], ...].
     pooled_shape
         Attribute.
         ROI pool output shape (height, width).
     spatial_scale
         Attribute.
-        Multiplicative spatial scale factor to translate ROI coordinates from their input scale to the scale used when pooling.
+        Multiplicative spatial scale factor to translate ROI coordinates from
+        their input scale to the scale used when pooling.
 
     Returns
     =======
     Y : Var
         Type T.
-        RoI pooled output 4-D tensor of shape (num_rois, channels, pooled_shape[0], pooled_shape[1]).
+        RoI pooled output 4-D tensor of shape (num_rois, channels,
+        pooled_shape[0], pooled_shape[1]).
 
     Notes
     =====
@@ -8333,41 +9264,76 @@ def max_unpool(
 ) -> Var:
     r"""
     MaxUnpool essentially computes the partial inverse of the MaxPool op.
-     The input information to this op is typically the output information from a MaxPool op. The first
-     input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
-     from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corrsponding
-     to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op.
-     The third (optional) input is a tensor that specifies the output size of the unpooling operation.
-    MaxUnpool is intended to do 'partial' inverse of the MaxPool op. 'Partial' because all the non-maximal
-     values from the original input to MaxPool are set to zero in the output of the MaxUnpool op. Pooling
-     the result of an unpooling operation should give back the original input to the unpooling op.
-    MaxUnpool can produce the same output size for several input sizes, which makes unpooling op ambiguous.
-     The third input argument, output_size, is meant to disambiguate the op and produce output tensor of
-     known/predictable size.
-    In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads,
-     which define the exact unpooling op. The attributes typically have the same values as the corrsponding
-     pooling op that the unpooling op is trying to invert.
+    The input information to this op is typically the output information
+    from a MaxPool op. The first input tensor X is the tensor that needs to
+    be unpooled, which is typically the pooled tensor (first output) from
+    MaxPool. The second input tensor, I, contains the indices to the
+    (locally maximal) elements corresponding to the elements in the first
+    input tensor X. Input tensor I is typically the second output of the
+    MaxPool op. The third (optional) input is a tensor that specifies the
+    output size of the unpooling operation.
+
+    MaxUnpool is intended to do 'partial' inverse of the MaxPool op.
+    'Partial' because all the non-maximal values from the original input to
+    MaxPool are set to zero in the output of the MaxUnpool op. Pooling the
+    result of an unpooling operation should give back the original input to
+    the unpooling op.
+
+    MaxUnpool can produce the same output size for several input sizes,
+    which makes unpooling op ambiguous. The third input argument,
+    output_size, is meant to disambiguate the op and produce output tensor
+    of known/predictable size.
+
+    In addition to the inputs, MaxUnpool takes three attributes, namely
+    kernel_shape, strides, and pads, which define the exact unpooling op.
+    The attributes typically have the same values as the corresponding
+    pooling op that the unpooling op is trying to invert.
 
     Parameters
     ==========
     X
         Type T1.
-        Input data tensor that has to be unpooled. This tensor is typically the first output of the MaxPool op.Dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non-image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor that has to be unpooled. This tensor is typically the
+        first output of the MaxPool op. Dimensions for image case are (N x C x H
+        x W), where N is the batch size, C is the number of channels, and H and
+        W are the height and the width of the data. For non-image case, the
+        dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the
+        batch size. Optionally, if dimension denotation is in effect, the
+        operation expects the input data tensor to arrive with the dimension
+        denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE
+        ...].
     I
         Type T2.
-        Input data tensor containing the indices corresponding to elements in the first input tensor X.This tensor is typically the second output of the MaxPool op.Dimensions must be the same as input tensor X. The indices are linear, i.e. computed considering the tensor as flattened 1-D tensor, assuming row-major storage. Also, the linear indices should not consider padding. So the values in indices are in the range [0, N x C x D1 x ... x Dn).
+        Input data tensor containing the indices corresponding to elements in
+        the first input tensor X. This tensor is typically the second output of
+        the MaxPool op. Dimensions must be the same as input tensor X. The
+        indices are linear, i.e. computed considering the tensor as flattened
+        1-D tensor, assuming row-major storage. Also, the linear indices should
+        not consider padding. So the values in indices are in the range [0, N x
+        C x D1 x ... x Dn).
     output_shape
         Type T2.
-        The shape of the output can be explicitly set which will cause pads values to be auto generated. If 'output_shape' is specified, 'pads' values are ignored.
+        The shape of the output can be explicitly set which will cause pads
+        values to be auto generated. If 'output_shape' is specified, 'pads'
+        values are ignored.
     kernel_shape
         Attribute.
         The size of the kernel along each axis.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The value represents the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follows [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels
+        added at the beginning of axis ``i`` and xi_end, the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
@@ -8401,9 +9367,11 @@ def mean(
     data_0: Sequence[Var],
 ) -> Var:
     r"""
-    Element-wise mean of each of the input tensors (with Numpy-style broadcasting support).
-    All inputs and outputs must have the same data type.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Element-wise mean of each of the input tensors (with Numpy-style
+    broadcasting support). All inputs and outputs must have the same data
+    type. This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -8438,8 +9406,9 @@ def mean_variance_normalization(
     axes: Iterable[int] = (0, 2, 3),
 ) -> Var:
     r"""
-    A MeanVarianceNormalization Function: Perform mean variance normalization
-          on the input tensor X using formula: <br/> ``` (X-EX)/sqrt(E(X-EX)^2) ```
+    A MeanVarianceNormalization Function: Perform mean variance
+    normalization on the input tensor X using formula:
+    ``(X-EX)/sqrt(E(X-EX)^2)``
 
     Parameters
     ==========
@@ -8448,7 +9417,10 @@ def mean_variance_normalization(
         Input tensor
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to caculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance.
+        A list of integers, along which to reduce. The default is to calculate
+        along axes [0,2,3] for calculating mean and variance along each channel.
+        Two variables with the same C-coordinate are associated with the same
+        mean and variance.
 
     Returns
     =======
@@ -8483,11 +9455,24 @@ def mel_weight_matrix(
     output_datatype: int = 1,
 ) -> Var:
     r"""
-    Generate a MelWeightMatrix that can be used to re-weight a Tensor containing a linearly sampled frequency spectra (from DFT or STFT) into num_mel_bins frequency information based on the [lower_edge_hertz, upper_edge_hertz] range on the mel scale.
-    This function defines the mel scale in terms of a frequency in hertz according to the following formula:
-        mel(f) = 2595 * log10(1 + f/700)
-    In the returned matrix, all the triangles (filterbanks) have a peak value of 1.0.
-    The returned MelWeightMatrix can be used to right-multiply a spectrogram S of shape [frames, num_spectrogram_bins] of linear scale spectrum values (e.g. STFT magnitudes) to generate a "mel spectrogram" M of shape [frames, num_mel_bins].
+    Generate a MelWeightMatrix that can be used to re-weight a Tensor
+    containing a linearly sampled frequency spectra (from DFT or STFT) into
+    num_mel_bins frequency information based on the [lower_edge_hertz,
+    upper_edge_hertz] range on the mel scale. This function defines the mel
+    scale in terms of a frequency in hertz according to the following
+    formula:
+
+    ::
+
+       mel(f) = 2595 * log10(1 + f/700)
+
+    In the returned matrix, all the triangles (filterbanks) have a peak
+    value of 1.0.
+
+    The returned MelWeightMatrix can be used to right-multiply a spectrogram
+    S of shape [frames, num_spectrogram_bins] of linear scale spectrum
+    values (e.g. STFT magnitudes) to generate a "mel spectrogram" M of shape
+    [frames, num_mel_bins].
 
     Parameters
     ==========
@@ -8496,25 +9481,34 @@ def mel_weight_matrix(
         The number of bands in the mel spectrum.
     dft_length
         Type T1.
-        The size of the original DFT. The size of the original DFT is used to infer the size of the onesided DFT, which is understood to be floor(dft_length/2) + 1, i.e. the spectrogram only contains the nonredundant DFT bins.
+        The size of the original DFT. The size of the original DFT is used to
+        infer the size of the onesided DFT, which is understood to be
+        floor(dft_length/2) + 1, i.e. the spectrogram only contains the
+        nonredundant DFT bins.
     sample_rate
         Type T1.
-        Samples per second of the input signal used to create the spectrogram. Used to figure out the frequencies corresponding to each spectrogram bin, which dictates how they are mapped into the mel scale.
+        Samples per second of the input signal used to create the spectrogram.
+        Used to figure out the frequencies corresponding to each spectrogram
+        bin, which dictates how they are mapped into the mel scale.
     lower_edge_hertz
         Type T2.
-        Lower bound on the frequencies to be included in the mel spectrum. This corresponds to the lower edge of the lowest triangular band.
+        Lower bound on the frequencies to be included in the mel spectrum. This
+        corresponds to the lower edge of the lowest triangular band.
     upper_edge_hertz
         Type T2.
         The desired top edge of the highest frequency band.
     output_datatype
         Attribute.
-        The data type of the output tensor. Strictly must be one of the values from DataType enum in TensorProto whose values correspond to T3. The default value is 1 = FLOAT.
+        The data type of the output tensor. Strictly must be one of the values
+        from DataType enum in TensorProto whose values correspond to T3. The
+        default value is 1 = FLOAT.
 
     Returns
     =======
     output : Var
         Type T3.
-        The Mel Weight Matrix. The output has the shape: [floor(dft_length/2) + 1][num_mel_bins].
+        The Mel Weight Matrix. The output has the shape: [floor(dft_length/2) +
+        1][num_mel_bins].
 
     Notes
     =====
@@ -8543,9 +9537,11 @@ def min(
     data_0: Sequence[Var],
 ) -> Var:
     r"""
-    Element-wise min of each of the input tensors (with Numpy-style broadcasting support).
-    All inputs and outputs must have the same data type.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Element-wise min of each of the input tensors (with Numpy-style
+    broadcasting support). All inputs and outputs must have the same data
+    type. This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -8581,15 +9577,23 @@ def mod(
     fmod: int = 0,
 ) -> Var:
     r"""
-    Performs element-wise binary modulus (with Numpy-style broadcasting support).
-        The sign of the remainder is the same as that of the Divisor.
-        Mod operator can also behave like C fmod() or numpy.fmod. In this case, the sign of the remainder however, will be the same as the Dividend
-        (in contrast to integer mod). To force a behavior like numpy.fmod() an 'fmod' Attribute is provided.
-        This attribute is set to 0 by default causing the behavior to be like integer mod.
-        Setting this attribute to 1 causes the remainder to be calculated similar to that of numpy.fmod().
-        If the input type is floating point, then `fmod` attribute must be set to 1.
-        In case of dividend being zero, the results will be platform dependent.
-      This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Performs element-wise binary modulus (with Numpy-style broadcasting
+    support). The sign of the remainder is the same as that of the Divisor.
+
+    Mod operator can also behave like C fmod() or numpy.fmod. In this case,
+    the sign of the remainder however, will be the same as the Dividend (in
+    contrast to integer mod). To force a behavior like numpy.fmod() an
+    'fmod' Attribute is provided. This attribute is set to 0 by default
+    causing the behavior to be like integer mod. Setting this attribute to 1
+    causes the remainder to be calculated similar to that of numpy.fmod().
+
+    If the input type is floating point, then ``fmod`` attribute must be set to 1.
+
+    In case of dividend being zero, the results will be platform dependent.
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -8601,7 +9605,8 @@ def mod(
         Divisor tensor
     fmod
         Attribute.
-        Whether the operator should behave like fmod (default=0 meaning it will do integer mods); Set this to 1 to force fmod treatment
+        Whether the operator should behave like fmod (default=0 meaning it will
+        do integer mods); Set this to 1 to force fmod treatment
 
     Returns
     =======
@@ -8632,9 +9637,15 @@ def mul(
     B: Var,
 ) -> Var:
     r"""
-    Performs element-wise binary multiplication (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
-    (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
+    Performs element-wise binary multiplication (with Numpy-style
+    broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
+
+    (Opset 14 change): Extend supported types to include uint8, int8,
+    uint16, and int16.
 
     Parameters
     ==========
@@ -8675,29 +9686,36 @@ def multinomial(
     seed: Optional[float] = None,
 ) -> Var:
     r"""
-    Generate a tensor of samples from a multinomial distribution according to the probabilities
-    of each of the possible outcomes.
+    Generate a tensor of samples from a multinomial distribution according
+    to the probabilities of each of the possible outcomes.
 
     Parameters
     ==========
     input
         Type T1.
-        Input tensor with shape [batch_size, class_size], where class_size is the number of all possible outcomes. Each value along the axis zero represents the unnormalized log-probability of each corresponding outcome in a batch.
+        Input tensor with shape [batch_size, class_size], where class_size is
+        the number of all possible outcomes. Each value along the axis zero
+        represents the unnormalized log-probability of each corresponding
+        outcome in a batch.
     dtype
         Attribute.
-        (Optional) The data type for the elements of the output tensor, if not specified, we will use int32.
+        (Optional) The data type for the elements of the output tensor, if not
+        specified, we will use int32.
     sample_size
         Attribute.
         Number of times to sample.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
 
     Returns
     =======
     output : Var
         Type T2.
-        Output tensor with shape [batch_size, sample_size], where sample_size is the number of times to sample. Each value along the axis zero represents the outcome of the corresponding sample in a batch.
+        Output tensor with shape [batch_size, sample_size], where sample_size is
+        the number of times to sample. Each value along the axis zero represents
+        the outcome of the corresponding sample in a batch.
 
     Notes
     =====
@@ -8724,8 +9742,8 @@ def neg(
 ) -> Var:
     r"""
     Neg takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where each element flipped sign, y = -x, is applied to
-    the tensor elementwise.
+    (Tensor<T>) where each element flipped sign, y = -x, is applied to the
+    tensor elementwise.
 
     Parameters
     ==========
@@ -8763,71 +9781,113 @@ def negative_log_likelihood_loss(
     reduction: str = "mean",
 ) -> Var:
     r"""
-    A NegativeLogLikelihoodLoss operator computes (weighted) negative log likelihood loss.
-    Its "input" tensor has the shape of (N, C, d1, d2, ..., dk) where k >= 0.
-    The "input" tensor contains log-probabilities for input[n, :, d_1, d_2,..., d_k] being in a class of [0, C).
-    The operator's "target" input tensor has the shape of (N, d1, d2, ..., dk). It encodes class labels (one of C classes)
-    or it may contain a special value (indicated by an attribute ignore_index) for N x d1 x d2 x ... x dk samples.
-    The loss value for input[n, :, d_1, d_2,...d_k] being classified as class c = target[n][d_1][d_2]...[d_k] is computed as:
-        loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k].
+    A NegativeLogLikelihoodLoss operator computes (weighted) negative log
+    likelihood loss. Its "input" tensor has the shape of (N, C, d1, d2, ...,
+    dk) where k >= 0. The "input" tensor contains log-probabilities for
+    input[n, :, d_1, d_2,..., d_k] being in a class of [0, C). The
+    operator's "target" input tensor has the shape of (N, d1, d2, ..., dk).
+    It encodes class labels (one of C classes) or it may contain a special
+    value (indicated by an attribute ignore_index) for N x d1 x d2 x ... x
+    dk samples. The loss value for input[n, :, d_1, d_2,...d_k] being
+    classified as class c = target[n][d_1][d_2]...[d_k] is computed as:
+
+    ::
+
+       loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k].
+
     When an optional "weight" is provided, the sample loss is calculated as:
-        loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c].
+
+    ::
+
+       loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c].
+
     loss is zero for the case when target-value equals ignore_index.
-        loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index
-    If "reduction" attribute is set to "none", the operator's output will be the above loss with shape (N, d1, d2, ..., dk).
-    If "reduction" attribute is set to "mean" (the default attribute value), the output loss is (weight) averaged:
-        mean(loss), if "weight" is not provided,
+
+    ::
+
+       loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index
+
+    If "reduction" attribute is set to "none", the operator's output will be
+    the above loss with shape (N, d1, d2, ..., dk). If "reduction" attribute
+    is set to "mean" (the default attribute value), the output loss is
+    (weight) averaged:
+
+    ::
+
+       mean(loss), if "weight" is not provided,
+
     or if weight is provided,
-        sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples.
+
+    ::
+
+       sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples.
+
     If "reduction" attribute is set to "sum", the output is a scalar:
-        sum(loss).
+    sum(loss).
+
     See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss.
+
     Example 1:
-        // negative log likelihood loss, "none" reduction
-        N, C, d1 = 2, 3, 2
-        input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]],
-                 [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]]
-        target = [[2, 1], [0, 2]]
-        loss = np.zeros((N, d1))
-        for n in range(N):
-            for d_1 in range(d1):
-                c = target[n][d_1]
-                loss[n][d_1] = -input[n][c][d_1]
-        // print(loss)
-        // [[-3. -2.]
-        //  [-0. -2.]]
-    Example 2:
-        // weighted negative log likelihood loss, sum reduction
-        N, C, d1 = 2, 3, 2
-        input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]],
+
+    ::
+
+       // negative log likelihood loss, "none" reduction
+       N, C, d1 = 2, 3, 2
+       input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]],
                 [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]]
-        target = [[2, 1], [0, 2]]
-        weight = [0.2, 0.3, 0.1]
-        loss = np.zeros((N, d1))
-        for n in range(N):
-            for d_1 in range(d1):
-                c = target[n][d_1]
-                loss[n][d_1] = -input[n][c][d_1] * weight[c]
-        loss = np.sum(loss)
-        // print(loss)
-        // -1.1
+       target = [[2, 1], [0, 2]]
+
+       loss = np.zeros((N, d1))
+       for n in range(N):
+           for d_1 in range(d1):
+               c = target[n][d_1]
+               loss[n][d_1] = -input[n][c][d_1]
+
+       // print(loss)
+       // [[-3. -2.]
+       //  [-0. -2.]]
+
+    Example 2:
+
+    ::
+
+       // weighted negative log likelihood loss, sum reduction
+       N, C, d1 = 2, 3, 2
+       input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]],
+               [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]]
+       target = [[2, 1], [0, 2]]
+       weight = [0.2, 0.3, 0.1]
+       loss = np.zeros((N, d1))
+       for n in range(N):
+           for d_1 in range(d1):
+               c = target[n][d_1]
+               loss[n][d_1] = -input[n][c][d_1] * weight[c]
+
+       loss = np.sum(loss)
+       // print(loss)
+       // -1.1
+
     Example 3:
-        // weighted negative log likelihood loss, mean reduction
-        N, C, d1 = 2, 3, 2
-        input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]],
-                [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]]
-        target = [[2, 1], [0, 2]]
-        weight = [0.2, 0.3, 0.1]
-        loss = np.zeros((N, d1))
-        weight_total = 0
-        for n in range(N):
-            for d_1 in range(d1):
-                c = target[n][d_1]
-                loss[n][d_1] = -input[n][c][d_1] * weight[c]
-                weight_total = weight_total + weight[c]
-        loss = np.sum(loss) / weight_total
-        // print(loss)
-        // -1.57
+
+    ::
+
+       // weighted negative log likelihood loss, mean reduction
+       N, C, d1 = 2, 3, 2
+       input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]],
+               [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]]
+       target = [[2, 1], [0, 2]]
+       weight = [0.2, 0.3, 0.1]
+       loss = np.zeros((N, d1))
+       weight_total = 0
+       for n in range(N):
+           for d_1 in range(d1):
+               c = target[n][d_1]
+               loss[n][d_1] = -input[n][c][d_1] * weight[c]
+               weight_total = weight_total + weight[c]
+
+       loss = np.sum(loss) / weight_total
+       // print(loss)
+       // -1.57
 
     Parameters
     ==========
@@ -8836,16 +9896,24 @@ def negative_log_likelihood_loss(
         Input tensor of shape (N, C) or (N, C, d1, d2, ..., dk).
     target
         Type Tind.
-        Target tensor of shape (N) or (N, d1, d2, ..., dk). Target element value shall be in range of [0, C). If ignore_index is specified, it may have a value outside [0, C) and the target values should either be in the range [0, C) or have the value ignore_index.
+        Target tensor of shape (N) or (N, d1, d2, ..., dk). Target element value
+        shall be in range of [0, C). If ignore_index is specified, it may have a
+        value outside [0, C) and the target values should either be in the range
+        [0, C) or have the value ignore_index.
     weight
         Type T.
-        Optional rescaling weight tensor. If given, it has to be a tensor of size C. Otherwise, it is treated as if having all ones.
+        Optional rescaling weight tensor. If given, it has to be a tensor of
+        size C. Otherwise, it is treated as if having all ones.
     ignore_index
         Attribute.
-        Specifies a target value that is ignored and does not contribute to the input gradient. It's an optional value.
+        Specifies a target value that is ignored and does not contribute to the
+        input gradient. It's an optional value.
     reduction
         Attribute.
-        Type of reduction to apply to loss: none, sum, mean (default). 'none': the output is the loss for each sample. 'sum': the output will be summed. 'mean': the sum of the output will be divided by the sum of applied weights.
+        Type of reduction to apply to loss: none, sum, mean (default). 'none':
+        the output is the loss for each sample. 'sum': the output will be
+        summed. 'mean': the sum of the output will be divided by the sum of
+        applied weights.
 
     Returns
     =======
@@ -8884,40 +9952,56 @@ def non_max_suppression(
     center_point_box: int = 0,
 ) -> Var:
     r"""
-    Filter out boxes that have high intersection-over-union (IOU) overlap with previously selected boxes.
-    Bounding boxes with score less than score_threshold are removed. Bounding box format is indicated by attribute center_point_box.
-    Note that this algorithm is agnostic to where the origin is in the coordinate system and more generally is invariant to
-    orthogonal transformations and translations of the coordinate system; thus translating or reflections of the coordinate system
-    result in the same boxes being selected by the algorithm.
-    The selected_indices output is a set of integers indexing into the input collection of bounding boxes representing the selected boxes.
-    The bounding box coordinates corresponding to the selected indices can then be obtained using the Gather or GatherND operation.
+    Filter out boxes that have high intersection-over-union (IOU) overlap
+    with previously selected boxes. Bounding boxes with score less than
+    score_threshold are removed. Bounding box format is indicated by
+    attribute center_point_box. Note that this algorithm is agnostic to
+    where the origin is in the coordinate system and more generally is
+    invariant to orthogonal transformations and translations of the
+    coordinate system; thus translating or reflections of the coordinate
+    system result in the same boxes being selected by the algorithm. The
+    selected_indices output is a set of integers indexing into the input
+    collection of bounding boxes representing the selected boxes. The
+    bounding box coordinates corresponding to the selected indices can then
+    be obtained using the Gather or GatherND operation.
 
     Parameters
     ==========
     boxes
         Type tensor(float).
-        An input tensor with shape [num_batches, spatial_dimension, 4]. The single box data format is indicated by center_point_box.
+        An input tensor with shape [num_batches, spatial_dimension, 4]. The
+        single box data format is indicated by center_point_box.
     scores
         Type tensor(float).
         An input tensor with shape [num_batches, num_classes, spatial_dimension]
     max_output_boxes_per_class
         Type tensor(int64).
-        Integer representing the maximum number of boxes to be selected per batch per class. It is a scalar. Default to 0, which means no output.
+        Integer representing the maximum number of boxes to be selected per
+        batch per class. It is a scalar. Default to 0, which means no output.
     iou_threshold
         Type tensor(float).
-        Float representing the threshold for deciding whether boxes overlap too much with respect to IOU. It is scalar. Value range [0, 1]. Default to 0.
+        Float representing the threshold for deciding whether boxes overlap too
+        much with respect to IOU. It is scalar. Value range [0, 1]. Default to
+        0.
     score_threshold
         Type tensor(float).
-        Float representing the threshold for deciding when to remove boxes based on score. It is a scalar.
+        Float representing the threshold for deciding when to remove boxes based
+        on score. It is a scalar.
     center_point_box
         Attribute.
-        Integer indicate the format of the box data. The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2] where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair of box corners and the coordinates can be provided as normalized (i.e., lying in the interval [0, 1]) or absolute. Mostly used for TF models. 1 - the box data is supplied as [x_center, y_center, width, height]. Mostly used for Pytorch models.
+        Integer indicating the format of the box data. The default is 0. 0 - the
+        box data is supplied as [y1, x1, y2, x2] where (y1, x1) and (y2, x2) are
+        the coordinates of any diagonal pair of box corners and the coordinates
+        can be provided as normalized (i.e., lying in the interval [0, 1]) or
+        absolute. Mostly used for TF models. 1 - the box data is supplied as
+        [x_center, y_center, width, height]. Mostly used for Pytorch models.
 
     Returns
     =======
     selected_indices : Var
         Type tensor(int64).
-        selected indices from the boxes tensor. [num_selected_indices, 3], the selected index format is [batch_index, class_index, box_index].
+        selected indices from the boxes tensor. [num_selected_indices, 3], the
+        selected index format is [batch_index, class_index, box_index].
 
     Notes
     =====
@@ -8942,11 +10026,11 @@ def non_zero(
     X: Var,
 ) -> Var:
     r"""
-    Returns the indices of the elements that are non-zero
-        (in row-major order - by dimension).
-        NonZero behaves similar to numpy.nonzero:
-        https://docs.scipy.org/doc/numpy/reference/generated/numpy.nonzero.html,
-        but for scalar input, NonZero produces output shape (0, N) instead of (1, N), which is different from Numpy's behavior.
+    Returns the indices of the elements that are non-zero (in row-major
+    order - by dimension). NonZero behaves similar to numpy.nonzero:
+    https://docs.scipy.org/doc/numpy/reference/generated/numpy.nonzero.html,
+    but for scalar input, NonZero produces output shape (0, N) instead of
+    (1, N), which is different from Numpy's behavior.
 
     Parameters
     ==========
@@ -9016,43 +10100,68 @@ def one_hot(
     axis: int = -1,
 ) -> Var:
     r"""
-    Produces a one-hot tensor based on inputs.
-        The locations represented by the index values in the 'indices' input tensor will have 'on_value'
-        and the other locations will have 'off_value' in the output tensor, where 'on_value' and 'off_value'
-        are specified as part of required input argument 'values', which is a two-element tensor of format
-        [off_value, on_value]. The rank of the output tensor will be one greater than the rank of the
-        input tensor. The additional dimension is for one-hot representation. The additional dimension will
-        be inserted at the position specified by 'axis'. If 'axis' is not specified then then additional
-        dimension will be inserted as the innermost dimension, i.e. axis=-1. The size of the additional
-        dimension is specified by required scalar input 'depth'. The type of the output tensor is the same
-        as the type of the 'values' input. Any entries in the 'indices' input tensor with values outside
-        the range [-depth, depth-1] will result in one-hot representation with all 'off_value' values in the
-        output tensor.
-        when axis = 0:
-        output[input[i, j, k], i, j, k] = 1 for all i, j, k and 0 otherwise.
-        when axis = -1:
-        output[i, j, k, input[i, j, k]] = 1 for all i, j, k and 0 otherwise.
+    Produces a one-hot tensor based on inputs. The locations represented by
+    the index values in the 'indices' input tensor will have 'on_value' and
+    the other locations will have 'off_value' in the output tensor, where
+    'on_value' and 'off_value' are specified as part of required input
+    argument 'values', which is a two-element tensor of format [off_value,
+    on_value]. The rank of the output tensor will be one greater than the
+    rank of the input tensor. The additional dimension is for one-hot
+    representation. The additional dimension will be inserted at the
+    position specified by 'axis'. If 'axis' is not specified then the
+    additional dimension will be inserted as the innermost dimension, i.e.
+    axis=-1. The size of the additional dimension is specified by required
+    scalar input 'depth'. The type of the output tensor is the same as the
+    type of the 'values' input. Any entries in the 'indices' input tensor
+    with values outside the range [-depth, depth-1] will result in one-hot
+    representation with all 'off_value' values in the output tensor.
+
+    ::
+
+       when axis = 0:
+       output[input[i, j, k], i, j, k] = 1 for all i, j, k and 0 otherwise.
+
+       when axis = -1:
+       output[i, j, k, input[i, j, k]] = 1 for all i, j, k and 0 otherwise.
 
     Parameters
     ==========
     indices
         Type T1.
-        Input tensor containing indices. Any entries in the 'indices' input tensor with values outside the range [-depth, depth-1] will result in one-hot representation with all 'off_value' values in the output tensor.In case 'indices' is of non-integer type, the values will be casted to int64 before use.
+        Input tensor containing indices. Any entries in the 'indices' input
+        tensor with values outside the range [-depth, depth-1] will result in
+        one-hot representation with all 'off_value' values in the output
+        tensor. In case 'indices' is of non-integer type, the values will be
+        casted to int64 before use.
     depth
         Type T2.
-        Scalar specifying the number of classes in one-hot tensor. This is also the size of the one-hot dimension (specified by 'axis' attribute) added on in the output tensor. The values in the 'indices' input tensor are expected to be in the range [-depth, depth-1]. In case 'depth' is of non-integer type, it will be casted to int64 before use.
+        Scalar specifying the number of classes in one-hot tensor. This is also
+        the size of the one-hot dimension (specified by 'axis' attribute) added
+        on in the output tensor. The values in the 'indices' input tensor are
+        expected to be in the range [-depth, depth-1]. In case 'depth' is of
+        non-integer type, it will be casted to int64 before use.
     values
         Type T3.
-        Rank 1 tensor containing exactly two elements, in the format [off_value, on_value], where 'on_value' is the value used for filling locations specified in 'indices' input tensor, and 'off_value' is the value used for filling locations other than those specified in 'indices' input tensor.
+        Rank 1 tensor containing exactly two elements, in the format [off_value,
+        on_value], where 'on_value' is the value used for filling locations
+        specified in 'indices' input tensor, and 'off_value' is the value used
+        for filling locations other than those specified in 'indices' input
+        tensor.
     axis
         Attribute.
-        (Optional) Axis along which one-hot representation in added. Default: axis=-1. axis=-1 means that the additional dimension will be inserted as the innermost/last dimension in the output tensor. Negative value means counting dimensions from the back. Accepted range is [-r-1, r] where r = rank(indices).
+        (Optional) Axis along which one-hot representation is added. Default:
+        axis=-1. axis=-1 means that the additional dimension will be inserted as
+        the innermost/last dimension in the output tensor. Negative value means
+        counting dimensions from the back. Accepted range is [-r-1, r] where r =
+        rank(indices).
 
     Returns
     =======
     output : Var
         Type T3.
-        Tensor of rank one greater than input tensor 'indices', i.e. rank(output) = rank(indices) + 1. The data type for the elements of the output tensor is the same as the type of input 'values' is used.
+        Tensor of rank one greater than input tensor 'indices', i.e.
+        rank(output) = rank(indices) + 1. The data type for the elements of the
+        output tensor is the same as the type of input 'values'.
 
     Notes
     =====
@@ -9081,8 +10190,9 @@ def optional(
     type: Optional[Type] = None,
 ) -> Var:
     r"""
-    Constructs an optional-type value containing either an empty optional of a certain type specified by the attribute,
-    or a non-empty value containing the input element.
+    Constructs an optional-type value containing either an empty optional of
+    a certain type specified by the attribute, or a non-empty value
+    containing the input element.
 
     Parameters
     ==========
@@ -9121,8 +10231,9 @@ def optional_get_element(
     input: Var,
 ) -> Var:
     r"""
-    Outputs the element in the optional-type input. It is an error if the input value does not have an element
-    and the behavior is undefined in this case.
+    Outputs the element in the optional-type input. It is an error if the
+    input value does not have an element and the behavior is undefined in
+    this case.
 
     Parameters
     ==========
@@ -9156,7 +10267,8 @@ def optional_has_element(
     input: Var,
 ) -> Var:
     r"""
-    Returns true if the optional-type input contains an element. If it is an empty optional-type, this op returns false.
+    Returns true if the optional-type input contains an element. If it is an
+    empty optional-type, this op returns false.
 
     Parameters
     ==========
@@ -9168,7 +10280,8 @@ def optional_has_element(
     =======
     output : Var
         Type B.
-        A scalar boolean tensor. If true, it indicates that optional-type input contains an element. Otherwise, it is empty.
+        A scalar boolean tensor. If true, it indicates that optional-type input
+        contains an element. Otherwise, it is empty.
 
     Notes
     =====
@@ -9191,9 +10304,13 @@ def or_(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `or` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulted from performing the ``or`` logical operation
+    elementwise on the input tensors ``A`` and ``B`` (with Numpy-style
+    broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -9232,12 +10349,18 @@ def prelu(
     slope: Var,
 ) -> Var:
     r"""
-    PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one
-    output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`,
-    `f(x) = x for x >= 0`., is applied to the data tensor elementwise.
+    PRelu takes input data (Tensor<T>) and slope tensor as input, and
+    produces one output data (Tensor<T>) where the function
+    ``f(x) = slope * x for x < 0``, ``f(x) = x for x >= 0``, is applied to
+    the data tensor elementwise.
+
     **History**
-    - Version 16 adds bfloat16 to the types allowed.
-    This operator supports **unidirectional broadcasting** (tensor slope should be unidirectional broadcastable to input tensor X); for more details please check the doc (Broadcasting.md).
+
+    -  Version 16 adds bfloat16 to the types allowed.
+
+    This operator supports **unidirectional broadcasting** (tensor slope should be
+    unidirectional broadcastable to input tensor X); for more details please check
+    `the doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -9246,7 +10369,8 @@ def prelu(
         Input tensor
     slope
         Type T.
-        Slope tensor. The shape of slope can be smaller then first input X; if so, its shape must be unidirectional broadcastable to X
+        Slope tensor. The shape of slope can be smaller than the first input X; if
+        so, its shape must be unidirectional broadcastable to X
 
     Returns
     =======
@@ -9278,59 +10402,56 @@ def pad(
     mode: str = "constant",
 ) -> Var:
     r"""
-    Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`,
-    a padded tensor (`output`) is generated.
-    The three supported `modes` are (similar to corresponding modes supported by `numpy.pad`):
-    1) `constant`(default) - pads with a given constant value as specified by `constant_value` (which defaults to 0, empty string, or False)
-    2) `reflect` - pads with the reflection of the vector mirrored on the first and last values of the vector along each axis
-    3) `edge` - pads with the edge values of array
-    Example 1 (`constant` mode):
-      Insert 0 pads to the beginning of the second dimension.
-      data =
-      [
-          [1.0, 1.2],
-          [2.3, 3.4],
-          [4.5, 5.7],
-      ]
-      pads = [0, 2, 0, 0]
-      mode = 'constant'
-      constant_value = 0.0
-      output =
-      [
-          [0.0, 0.0, 1.0, 1.2],
-          [0.0, 0.0, 2.3, 3.4],
-          [0.0, 0.0, 4.5, 5.7],
-      ]
-    Example 2 (`reflect` mode):
-      data =
-      [
-          [1.0, 1.2],
-          [2.3, 3.4],
-          [4.5, 5.7],
-      ]
-      pads = [0, 2, 0, 0]
-      mode = 'reflect'
-      output =
-      [
-          [1.0, 1.2, 1.0, 1.2],
-          [2.3, 3.4, 2.3, 3.4],
-          [4.5, 5.7, 4.5, 5.7],
-      ]
-    Example 3 (`edge` mode):
-      data =
-      [
-          [1.0, 1.2],
-          [2.3, 3.4],
-          [4.5, 5.7],
-      ]
-      pads = [0, 2, 0, 0]
-      mode = 'edge'
-      output =
-      [
-          [1.0, 1.0, 1.0, 1.2],
-          [2.3, 2.3, 2.3, 3.4],
-          [4.5, 4.5, 4.5, 5.7],
-      ]
+    Given a tensor containing the data to be padded (``data``), a tensor
+    containing the number of start and end pad values for axis (``pads``),
+    (optionally) a ``mode``, and (optionally) ``constant_value``, a padded
+    tensor (``output``) is generated.
+
+    The three supported ``modes`` are (similar to corresponding modes
+    supported by ``numpy.pad``):
+
+    1) ``constant``\ (default) - pads with a given constant value as
+       specified by ``constant_value`` (which defaults to 0, empty string,
+       or False)
+
+    2) ``reflect`` - pads with the reflection of the vector mirrored on the
+       first and last values of the vector along each axis
+
+    3) ``edge`` - pads with the edge values of array
+
+    Example 1 (``constant`` mode): Insert 0 pads to the beginning of the
+    second dimension.
+
+    data = [ [1.0, 1.2], [2.3, 3.4], [4.5, 5.7], ]
+
+    pads = [0, 2, 0, 0]
+
+    mode = 'constant'
+
+    constant_value = 0.0
+
+    output = [ [0.0, 0.0, 1.0, 1.2], [0.0, 0.0, 2.3, 3.4], [0.0, 0.0, 4.5,
+    5.7], ]
+
+    Example 2 (``reflect`` mode): data = [ [1.0, 1.2], [2.3, 3.4], [4.5,
+    5.7], ]
+
+    pads = [0, 2, 0, 0]
+
+    mode = 'reflect'
+
+    output = [ [1.0, 1.2, 1.0, 1.2], [2.3, 3.4, 2.3, 3.4], [4.5, 5.7, 4.5,
+    5.7], ]
+
+    Example 3 (``edge`` mode): data = [ [1.0, 1.2], [2.3, 3.4], [4.5, 5.7],
+    ]
+
+    pads = [0, 2, 0, 0]
+
+    mode = 'edge'
+
+    output = [ [1.0, 1.0, 1.0, 1.2], [2.3, 2.3, 2.3, 3.4], [4.5, 4.5, 4.5,
+    5.7], ]
 
     Parameters
     ==========
@@ -9339,13 +10460,20 @@ def pad(
         Input tensor.
     pads
         Type tensor(int64).
-        Tensor of integers indicating the number of padding elements to add or remove (if negative) at the beginning and end of each axis. For 2D input tensor, it is the number of pixels. `pads` should be a 1D tensor of shape [2 * input_rank]. `pads` format should be: [x1_begin, x2_begin,...,x1_end, x2_end,...], where xi_begin is the number of pad values added at the beginning of axis `i` and xi_end, the number of pad values added at the end of axis `i`.
+        Tensor of integers indicating the number of padding elements to add or
+        remove (if negative) at the beginning and end of each axis. For 2D input
+        tensor, it is the number of pixels. ``pads`` should be a 1D tensor of
+        shape [2 \* input_rank]. ``pads`` format should be: [x1_begin,
+        x2_begin,...,x1_end, x2_end,...], where xi_begin is the number of pad
+        values added at the beginning of axis ``i`` and xi_end, the number of
+        pad values added at the end of axis ``i``.
     constant_value
         Type T.
-        (Optional) A scalar value to be used if the mode chosen is `constant` (by default it is 0, empty string or False).
+        (Optional) A scalar value to be used if the mode chosen is ``constant``
+        (by default it is 0, empty string or False).
     mode
         Attribute.
-        Supported modes: `constant`(default), `reflect`, `edge`
+        Supported modes: ``constant``\ (default), ``reflect``, ``edge``
 
     Returns
     =======
@@ -9377,10 +10505,12 @@ def pow(
     Y: Var,
 ) -> Var:
     r"""
-    Pow takes input data (Tensor<T>) and exponent Tensor, and
-    produces one output data (Tensor<T>) where the function `f(x) = x^exponent`,
-    is applied to the data tensor elementwise.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Pow takes input data (Tensor<T>) and exponent Tensor, and produces one
+    output data (Tensor<T>) where the function ``f(x) = x^exponent``, is
+    applied to the data tensor elementwise. This operator supports
+    **multidirectional (i.e., Numpy-style) broadcasting**; for more details
+    please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -9433,67 +10563,116 @@ def qlinear_conv(
     strides: Optional[Iterable[int]] = None,
 ) -> Var:
     r"""
-    The convolution operator consumes a quantized input tensor, its scale and zero point,
-    a quantized filter, its scale and zero point, and output's scale and zero point,
-    and computes the quantized output. Each scale and zero-point pair must have same shape.
-    It means they must be either scalars (per tensor) or 1-D tensors (per output channel).
-    Each input or output and its related zero point must have same type.
-    When bias is present it must be quantized using scale = input scale * weight scale and
-    zero point as 0.
+    The convolution operator consumes a quantized input tensor, its scale
+    and zero point, a quantized filter, its scale and zero point, and
+    output's scale and zero point, and computes the quantized output. Each
+    scale and zero-point pair must have same shape. It means they must be
+    either scalars (per tensor) or 1-D tensors (per output channel). Each
+    input or output and its related zero point must have same type. When
+    bias is present it must be quantized using scale = input scale \* weight
+    scale and zero point as 0.
 
     Parameters
     ==========
     x
         Type T1.
-        Input data tensor from previous layer; has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that this is for the 2D image. Otherwise the size is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in effect, the operation expects input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+        Input data tensor from previous layer; has size (N x C x H x W), where N
+        is the batch size, C is the number of channels, and H and W are the
+        height and width. Note that this is for the 2D image. Otherwise the size
+        is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in
+        effect, the operation expects input data tensor to arrive with the
+        dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE,
+        DATA_FEATURE ...].
     x_scale
         Type tensor(float).
-        Scale tensor for input 'x'. It's a scalar, which means a per-tensor/layer quantization.
+        Scale tensor for input 'x'. It's a scalar, which means a
+        per-tensor/layer quantization.
     x_zero_point
         Type T1.
-        Zero point tensor for input 'x'. It's a scalar, which means a per-tensor/layer quantization.
+        Zero point tensor for input 'x'. It's a scalar, which means a
+        per-tensor/layer quantization.
     w
         Type T2.
-        The weight tensor that will be used in the convolutions; has size (M x C/group x kH x kW), where C is the number of channels, and kH and kW are the height and width of the kernel, and M is the number of feature maps. For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel. Optionally, if dimension denotation is in effect, the operation expects the weight tensor to arrive with the dimension denotation of [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. X.shape[1] == (W.shape[1] * group) == C (assuming zero based indices for the shape array). Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL.
+        The weight tensor that will be used in the convolutions; has size (M x
+        C/group x kH x kW), where C is the number of channels, and kH and kW are
+        the height and width of the kernel, and M is the number of feature maps.
+        For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x
+        k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel.
+        Optionally, if dimension denotation is in effect, the operation expects
+        the weight tensor to arrive with the dimension denotation of
+        [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL
+        ...]. X.shape[1] == (W.shape[1] \* group) == C (assuming zero based
+        indices for the shape array). Or in other words FILTER_IN_CHANNEL should
+        be equal to DATA_CHANNEL.
     w_scale
         Type tensor(float).
-        Scale tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor/layer or per output channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of output channels (M).
+        Scale tensor for input 'w'. It could be a scalar or a 1-D tensor, which
+        means a per-tensor/layer or per output channel quantization. If it's a
+        1-D tensor, its number of elements should be equal to the number of
+        output channels (M).
     w_zero_point
         Type T2.
-        Zero point tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor/layer or per output channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of output channels (M).
+        Zero point tensor for input 'w'. It could be a scalar or a 1-D tensor,
+        which means a per-tensor/layer or per output channel quantization. If
+        it's a 1-D tensor, its number of elements should be equal to the number
+        of output channels (M).
     y_scale
         Type tensor(float).
-        Scale tensor for output 'y'. It's a scalar, which means a per-tensor/layer quantization.
+        Scale tensor for output 'y'. It's a scalar, which means a
+        per-tensor/layer quantization.
     y_zero_point
         Type T3.
-        Zero point tensor for output 'y'. It's a scalar, which means a per-tensor/layer quantization.
+        Zero point tensor for output 'y'. It's a scalar, which means a
+        per-tensor/layer quantization.
     B
         Type T4.
-        Optional 1D bias to be added to the convolution, has size of M. Bias must be quantized using scale = x_scale * w_scale and zero_point = 0
+        Optional 1D bias to be added to the convolution, has size of M. Bias
+        must be quantized using scale = x_scale \* w_scale and zero_point = 0
     auto_pad
         Attribute.
-        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+        auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where
+        default value is NOTSET, which means explicit padding is used.
+        SAME_UPPER or SAME_LOWER mean pad the input so that
+        ``output_shape[i] = ceil(input_shape[i] / strides[i])`` for each axis
+        ``i``. The padding is split between the two sides equally or almost
+        equally (depending on whether it is even or odd). In case the padding is
+        an odd number, the extra padding is added at the end for SAME_UPPER and
+        at the beginning for SAME_LOWER.
     dilations
         Attribute.
-        dilation value along each spatial axis of the filter. If not present, the dilation defaults to 1 along each spatial axis.
+        dilation value along each spatial axis of the filter. If not present,
+        the dilation defaults to 1 along each spatial axis.
     group
         Attribute.
-        number of groups input channels and output channels are divided into. default is 1.
+        number of groups input channels and output channels are divided into.
+        default is 1.
     kernel_shape
         Attribute.
-        The shape of the convolution kernel. If not present, should be inferred from input 'w'.
+        The shape of the convolution kernel. If not present, should be inferred
+        from input 'w'.
     pads
         Attribute.
-        Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0.The value represent the number of pixels added to the beginning and end part of the corresponding axis.`pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number ofpixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`.This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaultsto 0 along start and end of each spatial axis.
+        Padding for the beginning and ending along each spatial axis, it can
+        take any value greater than or equal to 0. The value represents the
+        number of pixels added to the beginning and end part of the
+        corresponding axis. ``pads`` format should be as follows [x1_begin,
+        x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels
+        added at the beginning of axis ``i`` and xi_end, the number of pixels
+        added at the end of axis ``i``. This attribute cannot be used
+        simultaneously with auto_pad attribute. If not present, the padding
+        defaults to 0 along start and end of each spatial axis.
     strides
         Attribute.
-        Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+        Stride along each spatial axis. If not present, the stride defaults to 1
+        along each spatial axis.
 
     Returns
     =======
     y : Var
         Type T3.
-        Output data tensor that contains the result of the convolution. The output dimensions are functions of the kernel size, stride size, and pad lengths.
+        Output data tensor that contains the result of the convolution. The
+        output dimensions are functions of the kernel size, stride size, and pad
+        lengths.
 
     Notes
     =====
@@ -9539,17 +10718,24 @@ def qlinear_mat_mul(
     y_zero_point: Var,
 ) -> Var:
     r"""
-    Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
-    It consumes two quantized input tensors, their scales and zero points, scale and zero point of output,
-    and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point).
-    For (x / y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details.
-    Scale and zero point must have same shape. They must be either scalar (per tensor) or N-D tensor
-    (per row for 'a' and per column for 'b'). Scalar refers to per tensor quantization whereas N-D refers to per row
-    or per column quantization. If the input is 2D of shape [M, K] then zero point and scale tensor may be
-    an M element vector [v_1, v_2, ..., v_M] for per row quantization and K element vector of shape [v_1, v_2, ..., v_K]
-    for per column quantization. If the input is N-D tensor with shape [D1, D2, M, K] then zero point and scale tensor may
-    have shape [D1, D2, M, 1] for per row quantization and shape [D1, D2, 1, K] for per column quantization.
-    Production must never overflow, and accumulation may overflow if and only if in 32 bits.
+    Matrix product that behaves like numpy.matmul:
+    https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+    It consumes two quantized input tensors, their scales and zero points,
+    scale and zero point of output, and computes the quantized output. The
+    quantization formula is y = saturate((x / y_scale) + y_zero_point). For
+    (x / y_scale), it is rounding to nearest ties to even. Refer to
+    https://en.wikipedia.org/wiki/Rounding for details. Scale and zero point
+    must have same shape. They must be either scalar (per tensor) or N-D
+    tensor (per row for 'a' and per column for 'b'). Scalar refers to per
+    tensor quantization whereas N-D refers to per row or per column
+    quantization. If the input is 2D of shape [M, K] then zero point and
+    scale tensor may be an M element vector [v_1, v_2, ..., v_M] for per row
+    quantization and K element vector of shape [v_1, v_2, ..., v_K] for per
+    column quantization. If the input is N-D tensor with shape [D1, D2, M,
+    K] then zero point and scale tensor may have shape [D1, D2, M, 1] for
+    per row quantization and shape [D1, D2, 1, K] for per column
+    quantization. Production must never overflow, and accumulation may
+    overflow if and only if in 32 bits.
 
     Parameters
     ==========
@@ -9582,7 +10768,7 @@ def qlinear_mat_mul(
     =======
     y : Var
         Type T3.
-        Quantized matrix multiply results from a * b
+        Quantized matrix multiply results from a \* b
 
     Notes
     =====
@@ -9616,11 +10802,16 @@ def quantize_linear(
     axis: int = 1,
 ) -> Var:
     r"""
-    The linear quantization operator. It consumes a high precision tensor, a scale, and a zero point to compute the low precision / quantized tensor.
-    The scale factor and zero point must have same shape, and can be either a scalar for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization.
-    The quantization formula is y = saturate ((x / y_scale) + y_zero_point).
-    For saturation, it saturates to [0, 255] if it's uint8, or [-128, 127] if it's int8.
-    For (x / y_scale), it's rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details. 'y_zero_point' and 'y' must have same type.
+    The linear quantization operator. It consumes a high precision tensor, a
+    scale, and a zero point to compute the low precision / quantized tensor.
+    The scale factor and zero point must have same shape, and can be either
+    a scalar for per-tensor / per layer quantization, or a 1-D tensor for
+    per-axis quantization. The quantization formula is y = saturate ((x /
+    y_scale) + y_zero_point). For saturation, it saturates to [0, 255] if
+    it's uint8, or [-128, 127] if it's int8. For (x / y_scale), it's
+    rounding to nearest ties to even. Refer to
+    https://en.wikipedia.org/wiki/Rounding for details. 'y_zero_point' and
+    'y' must have same type.
 
     Parameters
     ==========
@@ -9629,13 +10820,19 @@ def quantize_linear(
         N-D full precision Input tensor to be quantized.
     y_scale
         Type tensor(float).
-        Scale for doing quantization to get 'y'. It can be a scalar, which means per-tensor/layer quantization, or a 1-D Tensor for per-axis quantization.
+        Scale for doing quantization to get 'y'. It can be a scalar, which means
+        per-tensor/layer quantization, or a 1-D Tensor for per-axis
+        quantization.
     y_zero_point
         Type T2.
-        Zero point for doing quantization to get 'y'. Shape must match y_scale. Default is uint8 with zero point of 0 if it's not specified.
+        Zero point for doing quantization to get 'y'. Shape must match y_scale.
+        Default is uint8 with zero point of 0 if it's not specified.
     axis
         Attribute.
-        (Optional) The axis of the quantization dimension of the input tensor. Ignored for per-tensor quantization. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+        (Optional) The axis of the quantization dimension of the input tensor.
+        Ignored for per-tensor quantization. Negative value means counting
+        dimensions from the back. Accepted range is [-r, r-1] where r =
+        rank(input).
 
     Returns
     =======
@@ -9680,89 +10877,153 @@ def rnn(
     layout: int = 0,
 ) -> Tuple[Var, Var]:
     r"""
-    Computes an one-layer simple RNN. This operator is usually supported
-    via some custom implementation such as CuDNN.
+    Computes a one-layer simple RNN. This operator is usually supported via
+    some custom implementation such as CuDNN.
+
     Notations:
-    `X` - input tensor
-    `i` - input gate
-    `t` - time step (t-1 means previous time step)
-    `Wi` - W parameter weight matrix for input gate
-    `Ri` - R recurrence weight matrix for input gate
-    `Wbi` - W parameter bias vector for input gate
-    `Rbi` - R parameter bias vector for input gate
-    `WBi` - W parameter weight matrix for backward input gate
-    `RBi` - R recurrence weight matrix for backward input gate
-    `WBbi` - WR bias vectors for backward input gate
-    `RBbi` - RR bias vectors for backward input gate
-    `H` - Hidden state
-    `num_directions` - 2 if direction == bidirectional else 1
+
+    ``X`` - input tensor
+
+    ``i`` - input gate
+
+    ``t`` - time step (t-1 means previous time step)
+
+    ``Wi`` - W parameter weight matrix for input gate
+
+    ``Ri`` - R recurrence weight matrix for input gate
+
+    ``Wbi`` - W parameter bias vector for input gate
+
+    ``Rbi`` - R parameter bias vector for input gate
+
+    ``WBi`` - W parameter weight matrix for backward input gate
+
+    ``RBi`` - R recurrence weight matrix for backward input gate
+
+    ``WBbi`` - WR bias vectors for backward input gate
+
+    ``RBbi`` - RR bias vectors for backward input gate
+
+    ``H`` - Hidden state
+
+    ``num_directions`` - 2 if direction == bidirectional else 1
+
     Activation functions:
-      Relu(x)                - max(0, x)
-      Tanh(x)                - (1 - e^{-2x})/(1 + e^{-2x})
-      Sigmoid(x)             - 1/(1 + e^{-x})
-      (NOTE: Below are optional)
-      Affine(x)              - alpha*x + beta
-      LeakyRelu(x)           - x if x >= 0 else alpha * x
-      ThresholdedRelu(x)     - x if x >= alpha else 0
-      ScaledTanh(x)          - alpha*Tanh(beta*x)
-      HardSigmoid(x)         - min(max(alpha*x + beta, 0), 1)
-      Elu(x)                 - x if x >= 0 else alpha*(e^x - 1)
-      Softsign(x)            - x/(1 + |x|)
-      Softplus(x)            - log(1 + e^x)
+
+    Relu(x) - max(0, x)
+
+    Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x})
+
+    Sigmoid(x) - 1/(1 + e^{-x})
+
+    (NOTE: Below are optional)
+
+    Affine(x) - alpha*x + beta
+
+    LeakyRelu(x) - x if x >= 0 else alpha \* x
+
+    ThresholdedRelu(x) - x if x >= alpha else 0
+
+    ScaledTanh(x) - alpha\*Tanh(beta\*x)
+
+    HardSigmoid(x) - min(max(alpha*x + beta, 0), 1)
+
+    Elu(x) - x if x >= 0 else alpha*(e^x - 1)
+
+    Softsign(x) - x/(1 + \|x|)
+
+    Softplus(x) - log(1 + e^x)
+
     Equations (Default: f=Tanh):
-      - Ht = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Wbi + Rbi)
-    This operator has **optional** inputs/outputs. See the doc (IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
+
+    -  Ht = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Wbi + Rbi) This operator has
+       **optional** inputs/outputs. See `the
+       doc <https://github.com/onnx/onnx/blob/main/docs/IR.md>`__ for more
+       details about the representation of optional arguments. An empty
+       string may be used in the place of an actual argument's name to
+       indicate a missing argument. Trailing optional arguments (those not
+       followed by an argument that is present) may also be simply omitted.
 
     Parameters
     ==========
     X
         Type T.
-        The input sequences packed (and potentially padded) into one 3-D tensor with the shape of `[seq_length, batch_size, input_size]`.
+        The input sequences packed (and potentially padded) into one 3-D tensor
+        with the shape of ``[seq_length, batch_size, input_size]``.
     W
         Type T.
-        The weight tensor for input gate. Concatenation of `Wi` and `WBi` (if bidirectional). The tensor has shape `[num_directions, hidden_size, input_size]`.
+        The weight tensor for input gate. Concatenation of ``Wi`` and ``WBi``
+        (if bidirectional). The tensor has shape
+        ``[num_directions, hidden_size, input_size]``.
     R
         Type T.
-        The recurrence weight tensor. Concatenation of `Ri` and `RBi` (if bidirectional). The tensor has shape `[num_directions, hidden_size, hidden_size]`.
+        The recurrence weight tensor. Concatenation of ``Ri`` and ``RBi`` (if
+        bidirectional). The tensor has shape
+        ``[num_directions, hidden_size, hidden_size]``.
     B
         Type T.
-        The bias tensor for input gate. Concatenation of `[Wbi, Rbi]` and `[WBbi, RBbi]` (if bidirectional). The tensor has shape `[num_directions, 2*hidden_size]`. Optional: If not specified - assumed to be 0.
+        The bias tensor for input gate. Concatenation of ``[Wbi, Rbi]`` and
+        ``[WBbi, RBbi]`` (if bidirectional). The tensor has shape
+        ``[num_directions, 2*hidden_size]``. Optional: If not specified -
+        assumed to be 0.
     sequence_lens
         Type T1.
-        Optional tensor specifying lengths of the sequences in a batch. If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
+        Optional tensor specifying lengths of the sequences in a batch. If not
+        specified - assumed all sequences in the batch to have length
+        ``seq_length``. It has shape ``[batch_size]``.
     initial_h
         Type T.
-        Optional initial value of the hidden. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+        Optional initial value of the hidden. If not specified - assumed to be
+        0. It has shape ``[num_directions, batch_size, hidden_size]``.
     activation_alpha
         Attribute.
-        Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.For example with LeakyRelu, the default alpha is 0.01.
+        Optional scaling values used by some activation functions. The values
+        are consumed in the order of activation functions, for example (f, g, h)
+        in LSTM. Default values are the same as of corresponding ONNX
+        operators. For example with LeakyRelu, the default alpha is 0.01.
     activation_beta
         Attribute.
-        Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.
+        Optional scaling values used by some activation functions. The values
+        are consumed in the order of activation functions, for example (f, g, h)
+        in LSTM. Default values are the same as of corresponding ONNX operators.
     activations
         Attribute.
-        One (or two if bidirectional) activation function for input gate. The activation function must be one of the activation functions specified above. Optional: Default `Tanh` if not specified.
+        One (or two if bidirectional) activation function for input gate. The
+        activation function must be one of the activation functions specified
+        above. Optional: Default ``Tanh`` if not specified.
     clip
         Attribute.
-        Cell clip threshold. Clipping bounds the elements of a tensor in the range of [-threshold, +threshold] and is applied to the input of activations. No clip if not specified.
+        Cell clip threshold. Clipping bounds the elements of a tensor in the
+        range of [-threshold, +threshold] and is applied to the input of
+        activations. No clip if not specified.
     direction
         Attribute.
-        Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward (default), reverse, or bidirectional.
+        Specify if the RNN is forward, reverse, or bidirectional. Must be one of
+        forward (default), reverse, or bidirectional.
     hidden_size
         Attribute.
         Number of neurons in the hidden layer
     layout
         Attribute.
-        The shape format of inputs X, initial_h and outputs Y, Y_h. If 0, the following shapes are expected: X.shape = [seq_length, batch_size, input_size], Y.shape = [seq_length, num_directions, batch_size, hidden_size], initial_h.shape = Y_h.shape = [num_directions, batch_size, hidden_size]. If 1, the following shapes are expected: X.shape = [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length, num_directions, hidden_size], initial_h.shape = Y_h.shape = [batch_size, num_directions, hidden_size].
+        The shape format of inputs X, initial_h and outputs Y, Y_h. If 0, the
+        following shapes are expected: X.shape = [seq_length, batch_size,
+        input_size], Y.shape = [seq_length, num_directions, batch_size,
+        hidden_size], initial_h.shape = Y_h.shape = [num_directions, batch_size,
+        hidden_size]. If 1, the following shapes are expected: X.shape =
+        [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length,
+        num_directions, hidden_size], initial_h.shape = Y_h.shape = [batch_size,
+        num_directions, hidden_size].
 
     Returns
     =======
     Y : Var
         Type T.
-        A tensor that concats all the intermediate output values of the hidden. It has shape `[seq_length, num_directions, batch_size, hidden_size]`.
+        A tensor that concats all the intermediate output values of the hidden.
+        It has shape ``[seq_length, num_directions, batch_size, hidden_size]``.
     Y_h : Var
         Type T.
-        The last output value of the hidden. It has shape `[num_directions, batch_size, hidden_size]`.
+        The last output value of the hidden. It has shape
+        ``[num_directions, batch_size, hidden_size]``.
 
     Notes
     =====
@@ -9806,18 +11067,21 @@ def random_normal(
     shape: Iterable[int],
 ) -> Var:
     r"""
-    Generate a tensor with random values drawn from a normal distribution. The shape
-    of the tensor is specified by the `shape` argument and the parameter of the normal distribution
-    specified by `mean` and `scale`.
-    The data type is specified by the 'dtype' argument. The 'dtype' argument must
-    be one of the data types specified in the 'DataType' enum field in the
-    TensorProto message.
+    Generate a tensor with random values drawn from a normal distribution.
+    The shape of the tensor is specified by the ``shape`` argument and the
+    parameter of the normal distribution specified by ``mean`` and
+    ``scale``.
+
+    The data type is specified by the 'dtype' argument. The 'dtype' argument
+    must be one of the data types specified in the 'DataType' enum field in
+    the TensorProto message.
 
     Parameters
     ==========
     dtype
         Attribute.
-        The data type for the elements of the output tensor. Default is TensorProto::FLOAT.
+        The data type for the elements of the output tensor. Default is
+        TensorProto::FLOAT.
     mean
         Attribute.
         The mean of the normal distribution.
@@ -9826,7 +11090,8 @@ def random_normal(
         The standard deviation of the normal distribution.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
     shape
         Attribute.
         The shape of the output tensor.
@@ -9866,11 +11131,14 @@ def random_normal_like(
 ) -> Var:
     r"""
     Generate a tensor with random values drawn from a normal distribution.
-    The shape of the output tensor is copied from the shape of the input tensor,
-    and the parameters of the normal distribution are specified by `mean` and `scale`.
-    The data type is specified by the 'dtype' argument, or copied from the input tensor if not provided.
-    The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the
-    TensorProto message, and be valid as an output type.
+    The shape of the output tensor is copied from the shape of the input
+    tensor, and the parameters of the normal distribution are specified by
+    ``mean`` and ``scale``.
+
+    The data type is specified by the 'dtype' argument, or copied from the
+    input tensor if not provided. The 'dtype' argument must be one of the
+    data types specified in the 'DataType' enum field in the TensorProto
+    message, and be valid as an output type.
 
     Parameters
     ==========
@@ -9879,7 +11147,8 @@ def random_normal_like(
         Input tensor to copy shape and optionally type information from.
     dtype
         Attribute.
-        (Optional) The data type for the elements of the output tensor, if not specified, we will use the data type of the input tensor.
+        (Optional) The data type for the elements of the output tensor, if not
+        specified, we will use the data type of the input tensor.
     mean
         Attribute.
         The mean of the normal distribution.
@@ -9888,7 +11157,8 @@ def random_normal_like(
         The standard deviation of the normal distribution.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
 
     Returns
     =======
@@ -9926,17 +11196,20 @@ def random_uniform(
     shape: Iterable[int],
 ) -> Var:
     r"""
-    Generate a tensor with random values drawn from a uniform distribution. The shape
-    of the tensor is specified by the `shape` argument and the range by `low` and `high`.
-    The data type is specified by the 'dtype' argument. The 'dtype' argument must
-    be one of the data types specified in the 'DataType' enum field in the
-    TensorProto message.
+    Generate a tensor with random values drawn from a uniform distribution.
+    The shape of the tensor is specified by the ``shape`` argument and the
+    range by ``low`` and ``high``.
+
+    The data type is specified by the 'dtype' argument. The 'dtype' argument
+    must be one of the data types specified in the 'DataType' enum field in
+    the TensorProto message.
 
     Parameters
     ==========
     dtype
         Attribute.
-        The data type for the elements of the output tensor. If not specified, default is TensorProto::FLOAT.
+        The data type for the elements of the output tensor. If not specified,
+        default is TensorProto::FLOAT.
     high
         Attribute.
         Upper boundary of the output values.
@@ -9945,7 +11218,8 @@ def random_uniform(
         Lower boundary of the output values.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
     shape
         Attribute.
         The shape of the output tensor.
@@ -9985,11 +11259,14 @@ def random_uniform_like(
 ) -> Var:
     r"""
     Generate a tensor with random values drawn from a uniform distribution.
-    The shape of the output tensor is copied from the shape of the input tensor,
-    and the parameters of the uniform distribution are specified by `low` and `high`.
-    The data type is specified by the 'dtype' argument, or copied from the input tensor if not provided.
-    The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the
-    TensorProto message and be valid as an output type.
+    The shape of the output tensor is copied from the shape of the input
+    tensor, and the parameters of the uniform distribution are specified by
+    ``low`` and ``high``.
+
+    The data type is specified by the 'dtype' argument, or copied from the
+    input tensor if not provided. The 'dtype' argument must be one of the
+    data types specified in the 'DataType' enum field in the TensorProto
+    message and be valid as an output type.
 
     Parameters
     ==========
@@ -9998,7 +11275,8 @@ def random_uniform_like(
         Input tensor to copy shape and optionally type information from.
     dtype
         Attribute.
-        (Optional) The data type for the elements of the output tensor, if not specified, we will use the data type of the input tensor.
+        (Optional) The data type for the elements of the output tensor, if not
+        specified, we will use the data type of the input tensor.
     high
         Attribute.
         Upper boundary of the output values.
@@ -10007,7 +11285,8 @@ def random_uniform_like(
         Lower boundary of the output values.
     seed
         Attribute.
-        (Optional) Seed to the random generator, if not specified we will auto generate one.
+        (Optional) Seed to the random generator, if not specified we will auto
+        generate one.
 
     Returns
     =======
@@ -10042,21 +11321,28 @@ def range(
     delta: Var,
 ) -> Var:
     r"""
-    Generate a tensor containing a sequence of numbers that begin at `start` and extends by increments of `delta`
-    up to `limit` (exclusive).
+    Generate a tensor containing a sequence of numbers that begins at
+    ``start`` and extends by increments of ``delta`` up to ``limit``
+    (exclusive).
+
     The number of elements in the output of range is computed as below-
-    `number_of_elements = max( ceil( (limit - start) / delta ) , 0 )`
+
+    ``number_of_elements = max( ceil( (limit - start) / delta ) , 0 )``
+
     The pseudocode determining the contents of the output is shown below-
-    `for(int i=0; i<number_of_elements; ++i)`
-    `{`
-    `    output[i] =  start + (i * delta);  `
-    `}`
-    `Example 1`
-    Inputs: start = 3, limit = 9, delta = 3
-    Output: [3, 6]
-    `Example 2`
-    Inputs: start = 10, limit = 4, delta = -2
-    Output: [10, 8, 6]
+
+    ``for(int i=0; i<number_of_elements; ++i)``
+
+    ``{``
+
+    ``output[i] =  start + (i * delta);``
+
+    ``}``
+
+    ``Example 1`` - Inputs: start = 3, limit = 9, delta = 3; Output: [3, 6]
+
+    ``Example 2`` - Inputs: start = 10, limit = 4, delta = -2; Output:
+    [10, 8, 6]
 
     Parameters
     ==========
@@ -10074,7 +11360,8 @@ def range(
     =======
     output : Var
         Type T.
-        A 1-D tensor with same type as the inputs containing generated range of values.
+        A 1-D tensor with same type as the inputs containing generated range of
+        values.
 
     Notes
     =====
@@ -10098,8 +11385,8 @@ def reciprocal(
 ) -> Var:
     r"""
     Reciprocal takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the reciprocal is, y = 1/x, is applied to
-    the tensor elementwise.
+    (Tensor<T>) where the reciprocal, y = 1/x, is applied to the tensor
+    elementwise.
 
     Parameters
     ==========
@@ -10135,11 +11422,13 @@ def reduce_l1(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the L1 norm of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the L1 norm of the input tensor's element along the provided
+    axes. The resulting tensor has the same rank as the input if keepdims
+    equals 1. If keepdims equals 0, then the resulting tensor has the
+    reduced dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10148,10 +11437,13 @@ def reduce_l1(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10184,11 +11476,13 @@ def reduce_l2(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the L2 norm of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the L2 norm of the input tensor's element along the provided
+    axes. The resulting tensor has the same rank as the input if keepdims
+    equals 1. If keepdims equals 0, then the resulting tensor has the
+    reduced dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10197,10 +11491,13 @@ def reduce_l2(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10233,11 +11530,13 @@ def reduce_log_sum(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the log sum of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the log sum of the input tensor's element along the provided
+    axes. The resulting tensor has the same rank as the input if keepdims
+    equals 1. If keepdims equals 0, then the resulting tensor has the
+    reduced dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10246,10 +11545,13 @@ def reduce_log_sum(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10282,11 +11584,13 @@ def reduce_log_sum_exp(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the log sum exponent of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the log sum exponent of the input tensor's element along the
+    provided axes. The resulting tensor has the same rank as the input if
+    keepdims equals 1. If keepdims equals 0, then the resulting tensor has
+    the reduced dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10295,10 +11599,13 @@ def reduce_log_sum_exp(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10331,11 +11638,13 @@ def reduce_max(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the max of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the max of the input tensor's element along the provided axes.
+    The resulting tensor has the same rank as the input if keepdims equals
+    1. If keepdims equals 0, then the resulting tensor has the reduced
+    dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10344,10 +11653,13 @@ def reduce_max(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10380,11 +11692,13 @@ def reduce_mean(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the mean of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the mean of the input tensor's element along the provided axes.
+    The resulting tensor has the same rank as the input if keepdims equals
+    1. If keepdims equals 0, then the resulting tensor has the reduced
+    dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10393,10 +11707,13 @@ def reduce_mean(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10429,11 +11746,13 @@ def reduce_min(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the min of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the min of the input tensor's element along the provided axes.
+    The resulting tensor has the same rank as the input if keepdims equals
+    1. If keepdims equals 0, then the resulting tensor has the reduced
+    dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10442,10 +11761,13 @@ def reduce_min(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10478,11 +11800,13 @@ def reduce_prod(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the product of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the product of the input tensor's element along the provided
+    axes. The resulting tensor has the same rank as the input if keepdims
+    equals 1. If keepdims equals 0, then the resulting tensor has the
+    reduced dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10491,10 +11815,13 @@ def reduce_prod(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10528,11 +11855,13 @@ def reduce_sum(
     noop_with_empty_axes: int = 0,
 ) -> Var:
     r"""
-    Computes the sum of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the sum of the input tensor's elements along the provided axes.
+    The resulting tensor has the same rank as the input if keepdims equals
+    1. If keepdims equals 0, then the resulting tensor has the reduced
+    dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10541,13 +11870,21 @@ def reduce_sum(
         An input tensor.
     axes
         Type tensor(int64).
-        Optional input list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor if 'noop_with_empty_axes' is false, else act as an Identity op when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] where r = rank(data).
+        Optional input list of integers, along which to reduce. The default is
+        to reduce over all the dimensions of the input tensor if
+        'noop_with_empty_axes' is false, else act as an Identity op when
+        'noop_with_empty_axes' is true. Accepted range is [-r, r-1] where r =
+        rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
     noop_with_empty_axes
         Attribute.
-        Defines behavior if 'axes' is empty. Default behavior with 'false' is to reduce all axes. When axes is empty and this attribute is set to true, input tensor will not be reduced,and the output tensor would be equivalent to input tensor.
+        Defines behavior if 'axes' is empty. Default behavior with 'false' is to
+        reduce all axes. When axes is empty and this attribute is set to true,
+        input tensor will not be reduced, and the output tensor would be
+        equivalent to input tensor.
 
     Returns
     =======
@@ -10581,11 +11918,13 @@ def reduce_sum_square(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Computes the sum square of the input tensor's element along the provided axes. The resulting
-    tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then
-    the resulting tensor has the reduced dimension pruned.
-    The above behavior is similar to numpy, with the exception that numpy defaults keepdims to
-    False instead of True.
+    Computes the sum square of the input tensor's elements along the provided
+    axes. The resulting tensor has the same rank as the input if keepdims
+    equals 1. If keepdims equals 0, then the resulting tensor has the
+    reduced dimension pruned.
+
+    The above behavior is similar to numpy, with the exception that numpy
+    defaults keepdims to False instead of True.
 
     Parameters
     ==========
@@ -10594,10 +11933,13 @@ def reduce_sum_square(
         An input tensor.
     axes
         Attribute.
-        A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor. Accepted range is [-r, r-1] where r = rank(data).
+        A list of integers, along which to reduce. The default is to reduce over
+        all the dimensions of the input tensor. Accepted range is [-r, r-1]
+        where r = rank(data).
     keepdims
         Attribute.
-        Keep the reduced dimension or not, default 1 means keep reduced dimension.
+        Keep the reduced dimension or not, default 1 means keep reduced
+        dimension.
 
     Returns
     =======
@@ -10628,8 +11970,8 @@ def relu(
 ) -> Var:
     r"""
     Relu takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the rectified linear function, y = max(0, x), is applied to
-    the tensor elementwise.
+    (Tensor<T>) where the rectified linear function, y = max(0, x), is
+    applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -10665,18 +12007,21 @@ def reshape(
     allowzero: int = 0,
 ) -> Var:
     r"""
-    Reshape the input tensor similar to numpy.reshape.
-    First input is the data tensor, second input is a shape tensor which specifies the output shape. It outputs the reshaped tensor.
-    At most one dimension of the new shape can be -1. In this case, the value is
-    inferred from the size of the tensor and the remaining dimensions. A dimension
-    could also be 0, in which case the actual dimension value is unchanged (i.e. taken
-    from the input tensor). If 'allowzero' is set, and the new shape includes 0, the
-    dimension will be set explicitly to zero (i.e. not taken from input tensor).
-    Shape (second input) could be an empty shape, which means converting to a scalar.
-    The input tensor's shape and the output tensor's shape are required to have the same number of elements.
-    If the attribute 'allowzero' is set, it is invalid for the specified shape to
-    contain both a zero value and -1, as the value of the dimension corresponding
-    to -1 cannot be determined uniquely.
+    Reshape the input tensor similar to numpy.reshape. First input is the
+    data tensor, second input is a shape tensor which specifies the output
+    shape. It outputs the reshaped tensor. At most one dimension of the new
+    shape can be -1. In this case, the value is inferred from the size of
+    the tensor and the remaining dimensions. A dimension could also be 0, in
+    which case the actual dimension value is unchanged (i.e. taken from the
+    input tensor). If 'allowzero' is set, and the new shape includes 0, the
+    dimension will be set explicitly to zero (i.e. not taken from input
+    tensor). Shape (second input) could be an empty shape, which means
+    converting to a scalar. The input tensor's shape and the output tensor's
+    shape are required to have the same number of elements.
+
+    If the attribute 'allowzero' is set, it is invalid for the specified
+    shape to contain both a zero value and -1, as the value of the dimension
+    corresponding to -1 cannot be determined uniquely.
 
     Parameters
     ==========
@@ -10688,7 +12033,10 @@ def reshape(
         Specified shape for output.
     allowzero
         Attribute.
-        (Optional) By default, when any value in the 'shape' input is equal to zero the corresponding dimension value is copied from the input tensor dynamically. allowzero=1 indicates that if any value in the 'shape' input is set to zero, the zero value is honored, similar to NumPy.
+        (Optional) By default, when any value in the 'shape' input is equal to
+        zero the corresponding dimension value is copied from the input tensor
+        dynamically. allowzero=1 indicates that if any value in the 'shape'
+        input is set to zero, the zero value is honored, similar to NumPy.
 
     Returns
     =======
@@ -10728,9 +12076,11 @@ def resize(
     nearest_mode: str = "round_prefer_floor",
 ) -> Var:
     r"""
-    Resize the input tensor. In general, it calculates every value in the output tensor as a weighted average of neighborhood (a.k.a. sampling locations) in the input tensor.
-    Each dimension value of the output tensor is:
-      output_dimension = floor(input_dimension * (roi_end - roi_start) * scale) if input \"sizes\" is not specified.
+    Resize the input tensor. In general, it calculates every value in the
+    output tensor as a weighted average of neighborhood (a.k.a. sampling
+    locations) in the input tensor. Each dimension value of the output
+    tensor is: output_dimension = floor(input_dimension \* (roi_end -
+    roi_start) \* scale) if input "sizes" is not specified.
 
     Parameters
     ==========
@@ -10739,43 +12089,83 @@ def resize(
         N-D tensor
     roi
         Type T2.
-        1-D tensor given as [start1, ..., startN, end1, ..., endN], where N is the rank of X. The RoIs' coordinates are normalized in the coordinate system of the input image. It only takes effect when coordinate_transformation_mode is "tf_crop_and_resize"
+        1-D tensor given as [start1, ..., startN, end1, ..., endN], where N is
+        the rank of X. The RoIs' coordinates are normalized in the coordinate
+        system of the input image. It only takes effect when
+        coordinate_transformation_mode is "tf_crop_and_resize".
     scales
         Type tensor(float).
-        The scale array along each dimension. It takes value greater than 0. If it's less than 1, it's sampling down, otherwise, it's upsampling. The number of elements of 'scales' should be the same as the rank of input 'X'. One of 'scales' and 'sizes' MUST be specified and it is an error if both are specified. If 'sizes' is needed, the user can use an empty string as the name of 'scales' in this operator's input list.
+        The scale array along each dimension. It takes value greater than 0. If
+        it's less than 1, it's sampling down, otherwise, it's upsampling. The
+        number of elements of 'scales' should be the same as the rank of input
+        'X'. One of 'scales' and 'sizes' MUST be specified and it is an error if
+        both are specified. If 'sizes' is needed, the user can use an empty
+        string as the name of 'scales' in this operator's input list.
     sizes
         Type tensor(int64).
-        The size of the output tensor. The number of elements of 'sizes' should be the same as the rank of input 'X'. Only one of 'scales' and 'sizes' can be specified.
+        The size of the output tensor. The number of elements of 'sizes' should
+        be the same as the rank of input 'X'. Only one of 'scales' and 'sizes'
+        can be specified.
     coordinate_transformation_mode
         Attribute.
-        This attribute describes how to transform the coordinate in the resized tensor to the coordinate in the original tensor. <br/>
-        The coordinate of each dimension is transformed individually. Let's describe a case using axis x as an example.
-        Denote x_resized as the coordinate of axis x in the resized tensor, x_original as the coordinate of axis x in the original tensor, length_original as the length of the original tensor in axis x, length_resized as the length of the resized tensor in axis x, roi_x = (start_x, end_x) of the axis x in input "roi", scale = length_resized / length_original, <br/>
-        if coordinate_transformation_mode is "half_pixel", <br/>
-        x_original = (x_resized + 0.5) / scale - 0.5, <br/>
-        if coordinate_transformation_mode is "pytorch_half_pixel", <br/>
-        x_original = length_resized > 1 ? (x_resized + 0.5) / scale - 0.5 : 0, <br/>
-        if coordinate_transformation_mode is "align_corners", <br/>
-        x_original = x_resized * (length_original - 1) / (length_resized - 1), <br/>
-        if coordinate_transformation_mode is "asymmetric", <br/>
-        x_original = x_resized / scale, <br/>
-        if coordinate_transformation_mode is "tf_crop_and_resize", <br/>
-        x_original = length_resized > 1 ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1) : 0.5 * (start_x + end_x) * (length_original - 1).
+        This attribute describes how to transform the coordinate in the resized
+        tensor to the coordinate in the original tensor.
+
+        The coordinate of each dimension is transformed individually. Let's
+        describe a case using axis x as an example. Denote x_resized as the
+        coordinate of axis x in the resized tensor, x_original as the coordinate
+        of axis x in the original tensor, length_original as the length of the
+        original tensor in axis x, length_resized as the length of the resized
+        tensor in axis x, roi_x = (start_x, end_x) of the axis x in input "roi",
+        scale = length_resized / length_original,
+
+        if coordinate_transformation_mode is "half_pixel", x_original =
+        (x_resized + 0.5) / scale - 0.5,
+
+        if coordinate_transformation_mode is "pytorch_half_pixel", x_original =
+        length_resized > 1 ? (x_resized + 0.5) / scale - 0.5 : 0,
+
+        if coordinate_transformation_mode is "align_corners", x_original =
+        x_resized \* (length_original - 1) / (length_resized - 1),
+
+        if coordinate_transformation_mode is "asymmetric", x_original =
+        x_resized / scale,
+
+        if coordinate_transformation_mode is "tf_crop_and_resize", x_original =
+        length_resized > 1 ? start_x \* (length_original - 1) + x_resized \*
+        (end_x - start_x) \* (length_original - 1) / (length_resized - 1) : 0.5
+        \* (start_x + end_x) \* (length_original - 1).
     cubic_coeff_a
         Attribute.
-        The coefficient 'a' used in cubic interpolation. Two common choice are -0.5 (in some cases of TensorFlow) and -0.75 (in PyTorch). Check out Equation (4) in https://ieeexplore.ieee.org/document/1163711 for the details. This attribute is valid only if "mode" is "cubic".
+        The coefficient 'a' used in cubic interpolation. Two common choices are
+        -0.5 (in some cases of TensorFlow) and -0.75 (in PyTorch). Check out
+        Equation (4) in https://ieeexplore.ieee.org/document/1163711 for the
+        details. This attribute is valid only if "mode" is "cubic".
     exclude_outside
         Attribute.
-        If set to 1, the weight of sampling locations outside the tensor will be set to 0 and the weight will be renormalized so that their sum is 1.0. The default value is 0.
+        If set to 1, the weight of sampling locations outside the tensor will be
+        set to 0 and the weight will be renormalized so that their sum is 1.0.
+        The default value is 0.
     extrapolation_value
         Attribute.
-        When coordinate_transformation_mode is "tf_crop_and_resize" and x_original is outside the range [0, length_original - 1], this value is used as the corresponding output value. Default is 0.0f.
+        When coordinate_transformation_mode is "tf_crop_and_resize" and
+        x_original is outside the range [0, length_original - 1], this value is
+        used as the corresponding output value. Default is 0.0f.
     mode
         Attribute.
-        Three interpolation modes: nearest (default), linear and cubic. The "linear" mode includes linear interpolation for 1D tensor and N-linear interpolation for N-D tensor (for example, bilinear interpolation for 2D tensor). The "cubic" mode includes cubic interpolation for 1D tensor and N-cubic interpolation for N-D tensor (for example, bicubic interpolation for 2D tensor).
+        Three interpolation modes: nearest (default), linear and cubic. The
+        "linear" mode includes linear interpolation for 1D tensor and N-linear
+        interpolation for N-D tensor (for example, bilinear interpolation for 2D
+        tensor). The "cubic" mode includes cubic interpolation for 1D tensor and
+        N-cubic interpolation for N-D tensor (for example, bicubic interpolation
+        for 2D tensor).
     nearest_mode
         Attribute.
-        Four modes: round_prefer_floor (default, as known as round half down), round_prefer_ceil (as known as round half up), floor, ceil. Only used by nearest interpolation. It indicates how to get "nearest" pixel in input tensor from x_original, so this attribute is valid only if "mode" is "nearest".
+        Four modes: round_prefer_floor (default, also known as round half down),
+        round_prefer_ceil (also known as round half up), floor, ceil. Only used by
+        nearest interpolation. It indicates how to get "nearest" pixel in input
+        tensor from x_original, so this attribute is valid only if "mode" is
+        "nearest".
 
     Returns
     =======
@@ -10817,34 +12207,28 @@ def reverse_sequence(
     time_axis: int = 0,
 ) -> Var:
     r"""
-    Reverse batch of sequences having different lengths specified by `sequence_lens`.
-    For each slice i iterating on batch axis, the operator reverses the first sequence_lens[i] elements on time axis,
-    and copies elements whose index's beyond sequence_lens[i] to the output. So the output slice i contains reversed
-    sequences on the first sequence_lens[i] elements, then have original values copied for the other elements.
-    Example 1:
-      input = [[0.0, 4.0, 8.0,  12.0],
-               [1.0, 5.0, 9.0,  13.0],
-               [2.0, 6.0, 10.0, 14.0],
-               [3.0, 7.0, 11.0, 15.0]]
-      sequence_lens = [4, 3, 2, 1]
-      time_axis = 0
-      batch_axis = 1
-      output = [[3.0, 6.0, 9.0,  12.0],
-                [2.0, 5.0, 8.0,  13.0],
-                [1.0, 4.0, 10.0, 14.0],
-                [0.0, 7.0, 11.0, 15.0]]
-    Example 2:
-      input = [[0.0,  1.0,  2.0,  3.0 ],
-               [4.0,  5.0,  6.0,  7.0 ],
-               [8.0,  9.0,  10.0, 11.0],
-               [12.0, 13.0, 14.0, 15.0]]
-      sequence_lens = [1, 2, 3, 4]
-      time_axis = 1
-      batch_axis = 0
-      output = [[0.0,  1.0,  2.0,  3.0 ],
-                [5.0,  4.0,  6.0,  7.0 ],
-                [10.0, 9.0,  8.0,  11.0],
-                [15.0, 14.0, 13.0, 12.0]]
+    Reverse batch of sequences having different lengths specified by
+    ``sequence_lens``.
+
+    For each slice i iterating on batch axis, the operator reverses the
+    first sequence_lens[i] elements on time axis, and copies elements whose
+    index is beyond sequence_lens[i] to the output. So the output slice i
+    contains reversed sequences on the first sequence_lens[i] elements, then
+    has original values copied for the other elements.
+
+    Example 1: input = [[0.0, 4.0, 8.0, 12.0], [1.0, 5.0, 9.0, 13.0], [2.0,
+    6.0, 10.0, 14.0], [3.0, 7.0, 11.0, 15.0]]; sequence_lens = [4, 3, 2, 1];
+    time_axis = 0; batch_axis = 1
+
+    output = [[3.0, 6.0, 9.0, 12.0], [2.0, 5.0, 8.0, 13.0], [1.0, 4.0, 10.0,
+    14.0], [0.0, 7.0, 11.0, 15.0]]
+
+    Example 2: input = [[0.0, 1.0, 2.0, 3.0 ], [4.0, 5.0, 6.0, 7.0 ], [8.0,
+    9.0, 10.0, 11.0], [12.0, 13.0, 14.0, 15.0]]; sequence_lens = [1, 2, 3, 4];
+    time_axis = 1; batch_axis = 0
+
+    output = [[0.0, 1.0, 2.0, 3.0 ], [5.0, 4.0, 6.0, 7.0 ], [10.0, 9.0, 8.0,
+    11.0], [15.0, 14.0, 13.0, 12.0]]
 
     Parameters
     ==========
@@ -10853,13 +12237,16 @@ def reverse_sequence(
         Tensor of rank r >= 2.
     sequence_lens
         Type tensor(int64).
-        Tensor specifying lengths of the sequences in a batch. It has shape `[batch_size]`.
+        Tensor specifying lengths of the sequences in a batch. It has shape
+        ``[batch_size]``.
     batch_axis
         Attribute.
-        (Optional) Specify which axis is batch axis. Must be one of 1 (default), or 0.
+        (Optional) Specify which axis is batch axis. Must be one of 1 (default),
+        or 0.
     time_axis
         Attribute.
-        (Optional) Specify which axis is time axis. Must be one of 0 (default), or 1.
+        (Optional) Specify which axis is time axis. Must be one of 0 (default),
+        or 1.
 
     Returns
     =======
@@ -10899,34 +12286,44 @@ def roi_align(
     spatial_scale: float = 1.0,
 ) -> Var:
     r"""
-    Region of Interest (RoI) align operation described in the
-    Mask R-CNN paper (https://arxiv.org/abs/1703.06870).
-    RoiAlign consumes an input tensor X and region of interests (rois)
-    to apply pooling across each RoI; it produces a 4-D tensor of shape
-    (num_rois, C, output_height, output_width).
-    RoiAlign is proposed to avoid the misalignment by removing
-    quantizations while converting from original image into feature
-    map and from feature map into RoI feature; in each ROI bin,
-    the value of the sampled locations are computed directly
-    through bilinear interpolation.
+    Region of Interest (RoI) align operation described in the `Mask R-CNN
+    paper <https://arxiv.org/abs/1703.06870>`__. RoiAlign consumes an input
+    tensor X and region of interests (rois) to apply pooling across each
+    RoI; it produces a 4-D tensor of shape (num_rois, C, output_height,
+    output_width).
+
+    RoiAlign is proposed to avoid the misalignment by removing quantizations
+    while converting from original image into feature map and from feature
+    map into RoI feature; in each ROI bin, the value of the sampled
+    locations are computed directly through bilinear interpolation.
 
     Parameters
     ==========
     X
         Type T1.
-        Input data tensor from the previous operator; 4-D feature map of shape (N, C, H, W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data.
+        Input data tensor from the previous operator; 4-D feature map of shape
+        (N, C, H, W), where N is the batch size, C is the number of channels,
+        and H and W are the height and the width of the data.
     rois
         Type T1.
-        RoIs (Regions of Interest) to pool over; rois is 2-D input of shape (num_rois, 4) given as [[x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of the input image. Each coordinate set has a 1:1 correspondence with the 'batch_indices' input.
+        RoIs (Regions of Interest) to pool over; rois is 2-D input of shape
+        (num_rois, 4) given as [[x1, y1, x2, y2], ...]. The RoIs' coordinates
+        are in the coordinate system of the input image. Each coordinate set has
+        a 1:1 correspondence with the 'batch_indices' input.
     batch_indices
         Type T2.
-        1-D tensor of shape (num_rois,) with each element denoting the index of the corresponding image in the batch.
+        1-D tensor of shape (num_rois,) with each element denoting the index of
+        the corresponding image in the batch.
     coordinate_transformation_mode
         Attribute.
-        Allowed values are 'half_pixel' and 'output_half_pixel'. Use the value 'half_pixel' to pixel shift the input coordinates by -0.5 (the recommended behavior). Use the value 'output_half_pixel' to omit the pixel shift for the input (use this for a backward-compatible behavior).
+        Allowed values are 'half_pixel' and 'output_half_pixel'. Use the value
+        'half_pixel' to pixel shift the input coordinates by -0.5 (the
+        recommended behavior). Use the value 'output_half_pixel' to omit the
+        pixel shift for the input (use this for a backward-compatible behavior).
     mode
         Attribute.
-        The pooling method. Two modes are supported: 'avg' and 'max'. Default is 'avg'.
+        The pooling method. Two modes are supported: 'avg' and 'max'. Default is
+        'avg'.
     output_height
         Attribute.
         default 1; Pooled output Y's height.
@@ -10935,16 +12332,25 @@ def roi_align(
         default 1; Pooled output Y's width.
     sampling_ratio
         Attribute.
-        Number of sampling points in the interpolation grid used to compute the output value of each pooled output bin. If > 0, then exactly sampling_ratio x sampling_ratio grid points are used. If == 0, then an adaptive number of grid points are used (computed as ceil(roi_width / output_width), and likewise for height). Default is 0.
+        Number of sampling points in the interpolation grid used to compute the
+        output value of each pooled output bin. If > 0, then exactly
+        sampling_ratio x sampling_ratio grid points are used. If == 0, then an
+        adaptive number of grid points are used (computed as ceil(roi_width /
+        output_width), and likewise for height). Default is 0.
     spatial_scale
         Attribute.
-        Multiplicative spatial scale factor to translate ROI coordinates from their input spatial scale to the scale used when pooling, i.e., spatial scale of the input feature map X relative to the input image. E.g.; default is 1.0f.
+        Multiplicative spatial scale factor to translate ROI coordinates from
+        their input spatial scale to the scale used when pooling, i.e., spatial
+        scale of the input feature map X relative to the input image. E.g.;
+        default is 1.0f.
 
     Returns
     =======
     Y : Var
         Type T1.
-        RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element Y[r-1] is a pooled feature map corresponding to the r-th RoI X[r-1].
+        RoI pooled output, 4-D tensor of shape (num_rois, C, output_height,
+        output_width). The r-th batch element Y[r-1] is a pooled feature map
+        corresponding to the r-th RoI X[r-1].
 
     Notes
     =====
@@ -10975,18 +12381,20 @@ def round(
     X: Var,
 ) -> Var:
     r"""
-    Round takes one input Tensor and rounds the values, element-wise, meaning
-    it finds the nearest integer for each value.
-    In case of halfs, the rule is to round them to the nearest even integer.
-    The output tensor has the same shape and type as the input.
+    Round takes one input Tensor and rounds the values, element-wise,
+    meaning it finds the nearest integer for each value. In case of halves,
+    the rule is to round them to the nearest even integer. The output tensor
+    has the same shape and type as the input.
+
     Examples:
-    ```
-    round([0.9]) = [1.0]
-    round([2.5]) = [2.0]
-    round([2.3]) = [2.0]
-    round([1.5]) = [2.0]
-    round([-4.5]) = [-4.0]
-    ```
+
+    ::
+
+       round([0.9]) = [1.0]
+       round([2.5]) = [2.0]
+       round([2.3]) = [2.0]
+       round([1.5]) = [2.0]
+       round([-4.5]) = [-4.0]
 
     Parameters
     ==========
@@ -11030,25 +12438,43 @@ def stft(
     ==========
     signal
         Type T1.
-        Input tensor representing a real or complex valued signal. For real input, the following shape is expected: [batch_size][signal_length][1]. For complex input, the following shape is expected: [batch_size][signal_length][2], where [batch_size][signal_length][0] represents the real component and [batch_size][signal_length][1] represents the imaginary component of the signal.
+        Input tensor representing a real or complex valued signal. For real
+        input, the following shape is expected: [batch_size][signal_length][1].
+        For complex input, the following shape is expected:
+        [batch_size][signal_length][2], where [batch_size][signal_length][0]
+        represents the real component and [batch_size][signal_length][1]
+        represents the imaginary component of the signal.
     frame_step
         Type T2.
         The number of samples to step between successive DFTs.
     window
         Type T1.
-        A tensor representing the window that will be slid over the signal.The window must have rank 1 with shape: [window_shape]. It's an optional value.
+        A tensor representing the window that will be slid over the signal. The
+        window must have rank 1 with shape: [window_shape]. It's an optional
+        value.
     frame_length
         Type T2.
         A scalar representing the size of the DFT. It's an optional value.
     onesided
         Attribute.
-        If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m,w]=X[m,n_fft-w]*. Note if the input or window tensors are complex, then onesided output is not possible. Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT).When invoked with real or complex valued input, the default value is 1. Values can be 0 or 1.
+        If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) +
+        1] are returned because the real-to-complex Fourier transform satisfies
+        the conjugate symmetry, i.e., X[m, w] = X[m, n_fft-w]*. Note if
+        the input or window tensors are complex, then onesided output is not
+        possible. Enabling onesided with real inputs performs a Real-valued fast
+        Fourier transform (RFFT). When invoked with real or complex valued input,
+        the default value is 1. Values can be 0 or 1.
 
     Returns
     =======
     output : Var
         Type T1.
-        The Short-time Fourier Transform of the signals.If onesided is 1, the output has the shape: [batch_size][frames][dft_unique_bins][2], where dft_unique_bins is frame_length // 2 + 1 (the unique components of the DFT) If onesided is 0, the output has the shape: [batch_size][frames][frame_length][2], where frame_length is the length of the DFT.
+        The Short-time Fourier Transform of the signals. If onesided is 1, the
+        output has the shape: [batch_size][frames][dft_unique_bins][2], where
+        dft_unique_bins is frame_length // 2 + 1 (the unique components of the
+        DFT). If onesided is 0, the output has the shape:
+        [batch_size][frames][frame_length][2], where frame_length is the length
+        of the DFT.
 
     Notes
     =====
@@ -11083,106 +12509,143 @@ def scan(
 ) -> Sequence[Var]:
     r"""
     Scan can be used to iterate over one or more scan_input tensors,
-    constructing zero or more scan_output tensors. It combines ideas from general recurrences,
-    functional programming constructs such as scan, fold, map, and zip, and is intended to enable
-    generalizations of RNN-like constructs for sequence-to-sequence processing.
-    Other tensors (referred to as state_variables here) can be used to carry a state
-    when iterating from one element to another (similar to hidden-state in RNNs, also referred
-    to as loop-carried dependences in the context of loops).
-    Many common usages involve a single scan_input tensor (where functionality
-    similar to scan, fold and map can be obtained). When more than one scan_input is used,
-    a behavior similar to zip is obtained.
-    The attribute body must be a graph, specifying the computation to be performed in
-    every iteration. It takes as input the current values of the state_variables and
-    the current iterated element of the scan_inputs. It must return the (updated) values
-    of the state_variables and zero or more scan_output_element tensors. The values of the
-    scan_output_element tensors are concatenated over all the iterations to produce the
-    scan_output values of the scan construct (similar to the concatenated intermediate
-    hidden-state values of RNN-like constructs). All the output tensors (state_variables as
-    well as scan_output_element tensors) are required to have the same shape in each iteration
-    of the loop (a restriction imposed to enable efficient memory allocation).
-    Note that the iterated element passed to the body subgraph does not have a sequence
-    axis. It will have a rank one less than the rank of the corresponding scan_input.
-    The scan operation returns the final values of the state_variables as well as the
-    scan_outputs.
-    The optional attribute scan_input_directions specifies the direction (forward or backward)
-    for each scan input. If this attribute is omitted, all sequences are scanned in the forward
-    direction. A bidirectional scan may be performed by specifying the same tensor input twice
-    in the scan_inputs, once with a forward direction, and once with a backward direction.
-    The scan_output of the operation is produced by concatenating the scan_output_element
-    values produced by the body in each iteration.  The optional attribute scan_output_directions
-    specifies the direction in which scan_output is constructed (by appending or prepending the
-    scan_output_element to scan_output in each iteration) for each scan_output. If this attribute
-    is omitted, the scan_output_element is appended to the scan_output in each iteration.
-    The optional attribute scan_input_axes specifies the axis to be scanned for each scan_input.
-    If omitted, every scan_input will be scanned in axis 0. For example, if axis 0 is the
-    batch axis and axis 1 is the time axis (to be scanned), specify an axis value of 1.
-    Note that scanning a non-zero axis may be less efficient than scanning axis zero.
-    The optional attribute scan_output_axes specifies the axis along which the scan_outputs
-    are accumulated for each scan_output. For example, if axis 1 is the time axis (to be
-    scanned) for both inputs and outputs, specify a scan_input axis and scan_output axis
-    value of 1.
-    Note that because of the ONNX restriction that only the last parameter of an operator can
-    be variadic, the initial-states and scan-inputs are listed together as one input parameter.
-    Similarly, the final-states and scan-outputs are listed together as one output parameter.
-    The attribute num_scan_inputs indicates the number M of scan-inputs.
+    constructing zero or more scan_output tensors. It combines ideas from
+    general recurrences, functional programming constructs such as scan,
+    fold, map, and zip, and is intended to enable generalizations of
+    RNN-like constructs for sequence-to-sequence processing. Other tensors
+    (referred to as state_variables here) can be used to carry a state when
+    iterating from one element to another (similar to hidden-state in RNNs,
+    also referred to as loop-carried dependences in the context of loops).
+    Many common usages involve a single scan_input tensor (where
+    functionality similar to scan, fold and map can be obtained). When more
+    than one scan_input is used, a behavior similar to zip is obtained.
+
+    The attribute body must be a graph, specifying the computation to be
+    performed in every iteration. It takes as input the current values of
+    the state_variables and the current iterated element of the scan_inputs.
+    It must return the (updated) values of the state_variables and zero or
+    more scan_output_element tensors. The values of the scan_output_element
+    tensors are concatenated over all the iterations to produce the
+    scan_output values of the scan construct (similar to the concatenated
+    intermediate hidden-state values of RNN-like constructs). All the output
+    tensors (state_variables as well as scan_output_element tensors) are
+    required to have the same shape in each iteration of the loop (a
+    restriction imposed to enable efficient memory allocation).
+
+    Note that the iterated element passed to the body subgraph does not have
+    a sequence axis. It will have a rank one less than the rank of the
+    corresponding scan_input.
+
+    The scan operation returns the final values of the state_variables as
+    well as the scan_outputs.
+
+    The optional attribute scan_input_directions specifies the direction
+    (forward or backward) for each scan input. If this attribute is omitted,
+    all sequences are scanned in the forward direction. A bidirectional scan
+    may be performed by specifying the same tensor input twice in the
+    scan_inputs, once with a forward direction, and once with a backward
+    direction.
+
+    The scan_output of the operation is produced by concatenating the
+    scan_output_element values produced by the body in each iteration. The
+    optional attribute scan_output_directions specifies the direction in
+    which scan_output is constructed (by appending or prepending the
+    scan_output_element to scan_output in each iteration) for each
+    scan_output. If this attribute is omitted, the scan_output_element is
+    appended to the scan_output in each iteration.
+
+    The optional attribute scan_input_axes specifies the axis to be scanned
+    for each scan_input. If omitted, every scan_input will be scanned in
+    axis 0. For example, if axis 0 is the batch axis and axis 1 is the time
+    axis (to be scanned), specify an axis value of 1. Note that scanning a
+    non-zero axis may be less efficient than scanning axis zero.
+
+    The optional attribute scan_output_axes specifies the axis along which
+    the scan_outputs are accumulated for each scan_output. For example, if
+    axis 1 is the time axis (to be scanned) for both inputs and outputs,
+    specify a scan_input axis and scan_output axis value of 1.
+
+    Note that because of the ONNX restriction that only the last parameter
+    of an operator can be variadic, the initial-states and scan-inputs are
+    listed together as one input parameter. Similarly, the final-states and
+    scan-outputs are listed together as one output parameter. The attribute
+    num_scan_inputs indicates the number M of scan-inputs.
+
     The behavior of
-        Scan <
-            num_scan_inputs = m,
-            body = loop-body,
-            scan_input_axes = [axis_1, ..., axis_m]
-        > (init_1, ..., init_n, scan_1, ..., scan_m)
+
+    ::
+
+       Scan <
+           num_scan_inputs = m,
+           body = loop-body,
+           scan_input_axes = [axis_1, ..., axis_m]
+       > (init_1, ..., init_n, scan_1, ..., scan_m)
+
     is equivalent to the following pseudo-code:
-        // scan_i.shape[axis_i] denotes the (max) sequence-length of scan_i
-        // scan_i.shape[axis_i] is required to be equal to scan_j.shape[axis_j] for all i,j.
-        sequence_length = scan_1.shape[axis_1];
-        // initialize state-variables
-        st_1 = init_1; ... st_n = init_n;
-        // initialize scan-output variables: [] denotes an empty tensor
-        scan_out_1 = []; ...; scan_out_k = [];
-        // identify number of iterations:
-        // execute loop
-        for (int t = 0; t < sequence_length; ++t) {
-            // generate the scan-input elements: the notation T<axis=k>[t] indicates the sub-tensor
-            // of rank one less than T obtained by indexing T at position t along axis k.
-            si_1 = scan_1<axis=axis_1>[t];
-            ... ;
-            si_m = scan_m<axis=axis_m>[t];
-            // execute loop-body
-            st_1, ..., st_n, so_1, ..., so_k = loop-body(st_1, ..., st_n, si_1, ..., si_m)
-            // accumulate the scan-output elements
-            scan_out_1 = Concat<axis=0>(scan_out_1, so_1); ... ; scan_out_k = Concat<axis=0>(scan_out_k, so_k);
-        }
-        return st_1, ..., st_n, scan_out_1, ..., scan_out_k;
+
+    ::
+
+       // scan_i.shape[axis_i] denotes the (max) sequence-length of scan_i
+       // scan_i.shape[axis_i] is required to be equal to scan_j.shape[axis_j] for all i,j.
+       sequence_length = scan_1.shape[axis_1];
+
+       // initialize state-variables
+       st_1 = init_1; ... st_n = init_n;
+       // initialize scan-output variables: [] denotes an empty tensor
+       scan_out_1 = []; ...; scan_out_k = [];
+       // identify number of iterations:
+
+       // execute loop
+       for (int t = 0; t < sequence_length; ++t) {
+           // generate the scan-input elements: the notation T<axis=k>[t] indicates the sub-tensor
+           // of rank one less than T obtained by indexing T at position t along axis k.
+           si_1 = scan_1<axis=axis_1>[t];
+           ... ;
+           si_m = scan_m<axis=axis_m>[t];
+           // execute loop-body
+           st_1, ..., st_n, so_1, ..., so_k = loop-body(st_1, ..., st_n, si_1, ..., si_m)
+           // accumulate the scan-output elements
+           scan_out_1 = Concat<axis=0>(scan_out_1, so_1); ... ; scan_out_k = Concat<axis=0>(scan_out_k, so_k);
+       }
+
+       return st_1, ..., st_n, scan_out_1, ..., scan_out_k;
+
     *Sample usage: Encoding RNN using a Scan*
-    The following example shows how a simple RNN over an input tensor %X, with weight tensor %Wi,
-    recurrence weight tensor %Ri, bias tensors %Wbi and %Rbi, and initial hidden-state %H_0 can
-    be encoded as a ScanLoop. Note that the loop-body is a nested graph, and it directly computes
-    %Wi, %Ri, %Wbi, and %Rbi (typically constants or initializers in the body graph). If these
-    values are computed in the outer graph, they need to be passed in as extra state_variables.
-        graph rnn-encoding {
-          %H_0 = ...
-          %X = ...
-          %Y_h, %Y = Scanbody = <graph rnn-cell-1>, num_scan_inputs=1 (%H_0, %X)
-          return %Y, %Y_h
-        }
-        graph rnn-cell-1 (
-          %H_tminus1[FLOAT, tensor]
-          %X_t[FLOAT, tensor]
-        ) {
-          %Wi = ...
-          %Ri = ...
-          %Wbi = ...
-          %Rbi = ...
-          %t1 = X_t * (Wi^T)
-          %t2 = H_tminus1*(Ri^T)
-          %t3 = Add(%t1, %t2)
-          %t4 = Add(%t3, %Wbi)
-          %t5 = Add(%t4, %Rbi)
-          %Ht = Tanh(%t5)
-          %Accumulate = Identity(%Ht)
-          return %Ht, %Accumulate
-        }
+
+    The following example shows how a simple RNN over an input tensor %X,
+    with weight tensor %Wi, recurrence weight tensor %Ri, bias tensors %Wbi
+    and %Rbi, and initial hidden-state %H_0 can be encoded as a ScanLoop.
+    Note that the loop-body is a nested graph, and it directly computes %Wi,
+    %Ri, %Wbi, and %Rbi (typically constants or initializers in the body
+    graph). If these values are computed in the outer graph, they need to be
+    passed in as extra state_variables.
+
+    ::
+
+       graph rnn-encoding {
+         %H_0 = ...
+         %X = ...
+         %Y_h, %Y = Scan[body = <graph rnn-cell-1>, num_scan_inputs=1](%H_0, %X)
+         return %Y, %Y_h
+       }
+
+       graph rnn-cell-1 (
+         %H_tminus1[FLOAT, tensor]
+         %X_t[FLOAT, tensor]
+       ) {
+         %Wi = ...
+         %Ri = ...
+         %Wbi = ...
+         %Rbi = ...
+         %t1 = X_t * (Wi^T)
+         %t2 = H_tminus1*(Ri^T)
+         %t3 = Add(%t1, %t2)
+         %t4 = Add(%t3, %Wbi)
+         %t5 = Add(%t4, %Rbi)
+         %Ht = Tanh(%t5)
+         %Accumulate = Identity(%Ht)
+         return %Ht, %Accumulate
+       }
 
     Parameters
     ==========
@@ -11191,22 +12654,42 @@ def scan(
         Initial values of the loop's N state variables followed by M scan_inputs
     body
         Attribute.
-        The graph run each iteration. It has N+M inputs: (loop state variables..., scan_input_elts...). It has N+K outputs: (loop state variables..., scan_output_elts...). Each scan_output is created by concatenating the value of the specified scan_output_elt value at the end of each iteration of the loop. It is an error if the dimensions of these values change across loop iterations.
+        The graph run each iteration. It has N+M inputs: (loop state
+        variables..., scan_input_elts...). It has N+K outputs: (loop state
+        variables..., scan_output_elts...). Each scan_output is created by
+        concatenating the value of the specified scan_output_elt value at the
+        end of each iteration of the loop. It is an error if the dimensions of
+        these values change across loop iterations.
     num_scan_inputs
         Attribute.
         An attribute specifying the number of scan_inputs M.
     scan_input_axes
         Attribute.
-        An optional list of M flags. The i-th element of the list specifies the axis to be scanned (the sequence axis) for the i-th scan_input. If omitted, 0 will be used as the scan axis for every scan_input. Negative value for an axis means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+        An optional list of M flags. The i-th element of the list specifies the
+        axis to be scanned (the sequence axis) for the i-th scan_input. If
+        omitted, 0 will be used as the scan axis for every scan_input. Negative
+        value for an axis means counting dimensions from the back. Accepted
+        range is [-r, r-1] where r = rank(input).
     scan_input_directions
         Attribute.
-        An optional list of M flags. The i-th element of the list specifies the direction to be scanned for the i-th scan_input tensor: 0 indicates forward direction and 1 indicates reverse direction. If omitted, all scan_input tensors will be scanned in the forward direction.
+        An optional list of M flags. The i-th element of the list specifies the
+        direction to be scanned for the i-th scan_input tensor: 0 indicates
+        forward direction and 1 indicates reverse direction. If omitted, all
+        scan_input tensors will be scanned in the forward direction.
     scan_output_axes
         Attribute.
-        An optional list of K flags. The i-th element of the list specifies the axis for the i-th scan_output. The scan outputs are accumulated along the specified axis. If omitted, 0 will be used as the scan axis for every scan_output. Negative value for an axis means counting dimensions from the back. Accepted range is [-r, r-1].
+        An optional list of K flags. The i-th element of the list specifies the
+        axis for the i-th scan_output. The scan outputs are accumulated along
+        the specified axis. If omitted, 0 will be used as the scan axis for
+        every scan_output. Negative value for an axis means counting dimensions
+        from the back. Accepted range is [-r, r-1].
     scan_output_directions
         Attribute.
-        An optional list of K flags, one for each scan_output. The i-th element of the list specifies whether the i-th scan_output should be constructed by appending or prepending a new value in each iteration: 0 indicates appending and 1 indicates prepending. If omitted, all scan_output tensors will be produced by appending a value in each iteration.
+        An optional list of K flags, one for each scan_output. The i-th element
+        of the list specifies whether the i-th scan_output should be constructed
+        by appending or prepending a new value in each iteration: 0 indicates
+        appending and 1 indicates prepending. If omitted, all scan_output
+        tensors will be produced by appending a value in each iteration.
 
     Returns
     =======
@@ -11267,54 +12750,66 @@ def scatter(
     axis: int = 0,
 ) -> Var:
     r"""
-    This operator is deprecated. Please use ScatterElements, which provides the same functionality.
-    Scatter takes three inputs `data`, `updates`, and `indices` of the same
-    rank r >= 1 and an optional attribute axis that identifies an axis of `data`
-    (by default, the outer-most axis, that is axis 0). The output of the operation
-    is produced by creating a copy of the input `data`, and then updating its value
-    to values specified by `updates` at specific index positions specified by
-    `indices`. Its output shape is the same as the shape of `data`.
-    For each entry in `updates`, the target index in `data` is obtained by combining
-    the corresponding entry in `indices` with the index of the entry itself: the
-    index-value for dimension = axis is obtained from the value of the corresponding
-    entry in `indices` and the index-value for dimension != axis is obtained from the
-    index of the entry itself.
-    For instance, in a 2-D tensor case, the update corresponding to the [i][j] entry
-    is performed as below:
-    ```
-      output[indices[i][j]][j] = updates[i][j] if axis = 0,
-      output[i][indices[i][j]] = updates[i][j] if axis = 1,
-    ```
-    This operator is the inverse of GatherElements. It is similar to Torch's Scatter operation.
+    This operator is deprecated. Please use ScatterElements, which provides
+    the same functionality.
+
+    Scatter takes three inputs ``data``, ``updates``, and ``indices`` of the
+    same rank r >= 1 and an optional attribute axis that identifies an axis
+    of ``data`` (by default, the outer-most axis, that is axis 0). The
+    output of the operation is produced by creating a copy of the input
+    ``data``, and then updating its value to values specified by ``updates``
+    at specific index positions specified by ``indices``. Its output shape
+    is the same as the shape of ``data``.
+
+    For each entry in ``updates``, the target index in ``data`` is obtained
+    by combining the corresponding entry in ``indices`` with the index of
+    the entry itself: the index-value for dimension = axis is obtained from
+    the value of the corresponding entry in ``indices`` and the index-value
+    for dimension != axis is obtained from the index of the entry itself.
+
+    For instance, in a 2-D tensor case, the update corresponding to the
+    [i][j] entry is performed as below:
+
+    ::
+
+         output[indices[i][j]][j] = updates[i][j] if axis = 0,
+         output[i][indices[i][j]] = updates[i][j] if axis = 1,
+
+    This operator is the inverse of GatherElements. It is similar to Torch's
+    Scatter operation.
+
     Example 1:
-    ```
-      data = [
-          [0.0, 0.0, 0.0],
-          [0.0, 0.0, 0.0],
-          [0.0, 0.0, 0.0],
-      ]
-      indices = [
-          [1, 0, 2],
-          [0, 2, 1],
-      ]
-      updates = [
-          [1.0, 1.1, 1.2],
-          [2.0, 2.1, 2.2],
-      ]
-      output = [
-          [2.0, 1.1, 0.0]
-          [1.0, 0.0, 2.2]
-          [0.0, 2.1, 1.2]
-      ]
-    ```
+
+    ::
+
+         data = [
+             [0.0, 0.0, 0.0],
+             [0.0, 0.0, 0.0],
+             [0.0, 0.0, 0.0],
+         ]
+         indices = [
+             [1, 0, 2],
+             [0, 2, 1],
+         ]
+         updates = [
+             [1.0, 1.1, 1.2],
+             [2.0, 2.1, 2.2],
+         ]
+         output = [
+             [2.0, 1.1, 0.0]
+             [1.0, 0.0, 2.2]
+             [0.0, 2.1, 1.2]
+         ]
+
     Example 2:
-    ```
-      data = [[1.0, 2.0, 3.0, 4.0, 5.0]]
-      indices = [[1, 3]]
-      updates = [[1.1, 2.1]]
-      axis = 1
-      output = [[1.0, 1.1, 3.0, 2.1, 5.0]]
-    ```
+
+    ::
+
+         data = [[1.0, 2.0, 3.0, 4.0, 5.0]]
+         indices = [[1, 3]]
+         updates = [[1.1, 2.1]]
+         axis = 1
+         output = [[1.0, 1.1, 3.0, 2.1, 5.0]]
 
     Parameters
     ==========
@@ -11323,13 +12818,16 @@ def scatter(
         Tensor of rank r >= 1.
     indices
         Type Tind.
-        Tensor of int32/int64 indices, of r >= 1 (same rank as input). All index values are expected to be within bounds [-s, s-1] along axis of size s. It is an error if any of the index values are out of bounds.
+        Tensor of int32/int64 indices, of r >= 1 (same rank as input). All index
+        values are expected to be within bounds [-s, s-1] along axis of size s.
+        It is an error if any of the index values are out of bounds.
     updates
         Type T.
         Tensor of rank r >=1 (same rank and shape as indices)
     axis
         Attribute.
-        Which axis to scatter on. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data).
+        Which axis to scatter on. Negative value means counting dimensions from
+        the back. Accepted range is [-r, r-1] where r = rank(data).
 
     Returns
     =======
@@ -11366,66 +12864,79 @@ def scatter_elements(
     reduction: str = "none",
 ) -> Var:
     r"""
-    ScatterElements takes three inputs `data`, `updates`, and `indices` of the same
-    rank r >= 1 and an optional attribute axis that identifies an axis of `data`
-    (by default, the outer-most axis, that is axis 0). The output of the operation
-    is produced by creating a copy of the input `data`, and then updating its value
-    to values specified by `updates` at specific index positions specified by
-    `indices`. Its output shape is the same as the shape of `data`.
-    For each entry in `updates`, the target index in `data` is obtained by combining
-    the corresponding entry in `indices` with the index of the entry itself: the
-    index-value for dimension = axis is obtained from the value of the corresponding
-    entry in `indices` and the index-value for dimension != axis is obtained from the
-    index of the entry itself.
-    `reduction` allows specification of an optional reduction operation, which is applied to all values in `updates`
-    tensor into `output` at the specified `indices`.
-    In cases where `reduction` is set to "none", indices should not have duplicate entries: that is, if idx1 != idx2,
-    then indices[idx1] != indices[idx2]. For instance, in a 2-D tensor case, the update
-    corresponding to the [i][j] entry is performed as below:
-    ```
-      output[indices[i][j]][j] = updates[i][j] if axis = 0,
-      output[i][indices[i][j]] = updates[i][j] if axis = 1,
-    ```
-    When `reduction` is set to "add", the update corresponding to the [i][j] entry is performed as below:
-    ```
-      output[indices[i][j]][j] += updates[i][j] if axis = 0,
-      output[i][indices[i][j]] += updates[i][j] if axis = 1,
-    ```
-    When `reduction` is set to "mul", the update corresponding to the [i][j] entry is performed as below:
-    ```
-      output[indices[i][j]][j] *= updates[i][j] if axis = 0,
-      output[i][indices[i][j]] *= updates[i][j] if axis = 1,
-    ```
-    This operator is the inverse of GatherElements. It is similar to Torch's Scatter operation.
-    Example 1:
-    ```
-      data = [
-          [0.0, 0.0, 0.0],
-          [0.0, 0.0, 0.0],
-          [0.0, 0.0, 0.0],
-      ]
-      indices = [
-          [1, 0, 2],
-          [0, 2, 1],
-      ]
-      updates = [
-          [1.0, 1.1, 1.2],
-          [2.0, 2.1, 2.2],
-      ]
-      output = [
-          [2.0, 1.1, 0.0]
-          [1.0, 0.0, 2.2]
-          [0.0, 2.1, 1.2]
-      ]
-    ```
+    ScatterElements takes three inputs ``data``, ``updates``, and
+    ``indices`` of the same rank r >= 1 and an optional attribute axis that
+    identifies an axis of ``data`` (by default, the outer-most axis, that is
+    axis 0). The output of the operation is produced by creating a copy of
+    the input ``data``, and then updating its value to values specified by
+    ``updates`` at specific index positions specified by ``indices``. Its
+    output shape is the same as the shape of ``data``. For each entry in
+    ``updates``, the target index in ``data`` is obtained by combining the
+    corresponding entry in ``indices`` with the index of the entry itself:
+    the index-value for dimension = axis is obtained from the value of the
+    corresponding entry in ``indices`` and the index-value for dimension !=
+    axis is obtained from the index of the entry itself. ``reduction``
+    allows specification of an optional reduction operation, which is
+    applied to all values in ``updates`` tensor into ``output`` at the
+    specified ``indices``. In cases where ``reduction`` is set to "none",
+    indices should not have duplicate entries: that is, if idx1 != idx2,
+    then indices[idx1] != indices[idx2]. For instance, in a 2-D tensor case,
+    the update corresponding to the [i][j] entry is performed as below:
+
+    ::
+
+         output[indices[i][j]][j] = updates[i][j] if axis = 0,
+         output[i][indices[i][j]] = updates[i][j] if axis = 1,
+
+    When ``reduction`` is set to "add", the update corresponding to the
+    [i][j] entry is performed as below:
+
+    ::
+
+         output[indices[i][j]][j] += updates[i][j] if axis = 0,
+         output[i][indices[i][j]] += updates[i][j] if axis = 1,
+
+    When ``reduction`` is set to "mul", the update corresponding to the
+    [i][j] entry is performed as below:
+
+    ::
+
+         output[indices[i][j]][j] *= updates[i][j] if axis = 0,
+         output[i][indices[i][j]] *= updates[i][j] if axis = 1,
+
+    This operator is the inverse of GatherElements. It is similar to Torch's
+    Scatter operation. Example 1:
+
+    ::
+
+         data = [
+             [0.0, 0.0, 0.0],
+             [0.0, 0.0, 0.0],
+             [0.0, 0.0, 0.0],
+         ]
+         indices = [
+             [1, 0, 2],
+             [0, 2, 1],
+         ]
+         updates = [
+             [1.0, 1.1, 1.2],
+             [2.0, 2.1, 2.2],
+         ]
+         output = [
+             [2.0, 1.1, 0.0]
+             [1.0, 0.0, 2.2]
+             [0.0, 2.1, 1.2]
+         ]
+
     Example 2:
-    ```
-      data = [[1.0, 2.0, 3.0, 4.0, 5.0]]
-      indices = [[1, 3]]
-      updates = [[1.1, 2.1]]
-      axis = 1
-      output = [[1.0, 1.1, 3.0, 2.1, 5.0]]
-    ```
+
+    ::
+
+         data = [[1.0, 2.0, 3.0, 4.0, 5.0]]
+         indices = [[1, 3]]
+         updates = [[1.1, 2.1]]
+         axis = 1
+         output = [[1.0, 1.1, 3.0, 2.1, 5.0]]
 
     Parameters
     ==========
@@ -11434,16 +12945,21 @@ def scatter_elements(
         Tensor of rank r >= 1.
     indices
         Type Tind.
-        Tensor of int32/int64 indices, of r >= 1 (same rank as input). All index values are expected to be within bounds [-s, s-1] along axis of size s. It is an error if any of the index values are out of bounds.
+        Tensor of int32/int64 indices, of r >= 1 (same rank as input). All index
+        values are expected to be within bounds [-s, s-1] along axis of size s.
+        It is an error if any of the index values are out of bounds.
     updates
         Type T.
         Tensor of rank r >=1 (same rank and shape as indices)
     axis
         Attribute.
-        Which axis to scatter on. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data).
+        Which axis to scatter on. Negative value means counting dimensions from
+        the back. Accepted range is [-r, r-1] where r = rank(data).
     reduction
         Attribute.
-        Type of reduction to apply: none (default), add, mul. 'none': no reduction applied. 'add':  reduction using the addition operation. 'mul': reduction using the multiplication operation.
+        Type of reduction to apply: none (default), add, mul. 'none': no
+        reduction applied. 'add': reduction using the addition operation. 'mul':
+        reduction using the multiplication operation.
 
     Returns
     =======
@@ -11480,68 +12996,76 @@ def scatter_nd(
     reduction: str = "none",
 ) -> Var:
     r"""
-    ScatterND takes three inputs `data` tensor of rank r >= 1, `indices` tensor of rank q >= 1,
-    and `updates` tensor of rank q + r - indices.shape[-1] - 1. The output of the operation
-    is produced by creating a copy of the input `data`, and then updating its value to values
-    specified by `updates` at specific index positions specified by `indices`. Its output shape
-    is the same as the shape of `data`.
-    `indices` is an integer tensor. Let k denote indices.shape[-1], the last dimension in the shape of `indices`.
-     `indices` is treated as a (q-1)-dimensional tensor of k-tuples, where each k-tuple is a partial-index into `data`.
-    Hence, k can be a value at most the rank of `data`. When k equals rank(data), each update entry specifies an
-    update to a single element of the tensor. When k is less than rank(data) each update entry specifies an
-    update to a slice of the tensor. Index values are allowed to be negative, as per the usual
-    convention for counting backwards from the end, but are expected in the valid range.
-    `updates` is treated as a (q-1)-dimensional tensor of replacement-slice-values. Thus, the
-    first (q-1) dimensions of updates.shape must match the first (q-1) dimensions of indices.shape.
-    The remaining dimensions of `updates` correspond to the dimensions of the
-    replacement-slice-values. Each replacement-slice-value is a (r-k) dimensional tensor,
-    corresponding to the trailing (r-k) dimensions of `data`.  Thus, the shape of `updates`
-    must equal indices.shape[0:q-1] ++ data.shape[k:r-1], where ++ denotes the concatenation
-    of shapes.
-    The `output` is calculated via the following equation:
-        output = np.copy(data)
-        update_indices = indices.shape[:-1]
-        for idx in np.ndindex(update_indices):
-            output[indices[idx]] = updates[idx]
-    The order of iteration in the above loop is not specified.
-    In particular, indices should not have duplicate entries: that is, if idx1 != idx2, then indices[idx1] != indices[idx2].
-    This ensures that the output value does not depend on the iteration order.
-    `reduction` allows specification of an optional reduction operation, which is applied to all values in `updates`
-    tensor into `output` at the specified `indices`.
-    In cases where `reduction` is set to "none", indices should not have duplicate entries: that is, if idx1 != idx2,
-    then indices[idx1] != indices[idx2]. This ensures that the output value does not depend on the iteration order.
-    When `reduction` is set to "add", `output` is calculated as follows:
-        output = np.copy(data)
-        update_indices = indices.shape[:-1]
-        for idx in np.ndindex(update_indices):
-            output[indices[idx]] += updates[idx]
-    When `reduction` is set to "mul", `output` is calculated as follows:
-        output = np.copy(data)
-        update_indices = indices.shape[:-1]
-        for idx in np.ndindex(update_indices):
-            output[indices[idx]] *= updates[idx]
-    This operator is the inverse of GatherND.
-    Example 1:
-    ```
-      data    = [1, 2, 3, 4, 5, 6, 7, 8]
-      indices = [[4], [3], [1], [7]]
-      updates = [9, 10, 11, 12]
-      output  = [1, 11, 3, 10, 9, 6, 7, 12]
-    ```
+    ScatterND takes three inputs ``data`` tensor of rank r >= 1, ``indices``
+    tensor of rank q >= 1, and ``updates`` tensor of rank q + r -
+    indices.shape[-1] - 1. The output of the operation is produced by
+    creating a copy of the input ``data``, and then updating its value to
+    values specified by ``updates`` at specific index positions specified by
+    ``indices``. Its output shape is the same as the shape of ``data``.
+
+    ``indices`` is an integer tensor. Let k denote indices.shape[-1], the
+    last dimension in the shape of ``indices``. ``indices`` is treated as a
+    (q-1)-dimensional tensor of k-tuples, where each k-tuple is a
+    partial-index into ``data``. Hence, k can be a value at most the rank of
+    ``data``. When k equals rank(data), each update entry specifies an
+    update to a single element of the tensor. When k is less than rank(data)
+    each update entry specifies an update to a slice of the tensor. Index
+    values are allowed to be negative, as per the usual convention for
+    counting backwards from the end, but are expected in the valid range.
+
+    ``updates`` is treated as a (q-1)-dimensional tensor of
+    replacement-slice-values. Thus, the first (q-1) dimensions of
+    updates.shape must match the first (q-1) dimensions of indices.shape.
+    The remaining dimensions of ``updates`` correspond to the dimensions of
+    the replacement-slice-values. Each replacement-slice-value is a (r-k)
+    dimensional tensor, corresponding to the trailing (r-k) dimensions of
+    ``data``. Thus, the shape of ``updates`` must equal indices.shape[0:q-1]
+    ++ data.shape[k:r-1], where ++ denotes the concatenation of shapes.
+
+    The ``output`` is calculated via the following equation: output =
+    np.copy(data) update_indices = indices.shape[:-1] for idx in
+    np.ndindex(update_indices): output[indices[idx]] = updates[idx] The
+    order of iteration in the above loop is not specified. In particular,
+    indices should not have duplicate entries: that is, if idx1 != idx2,
+    then indices[idx1] != indices[idx2]. This ensures that the output value
+    does not depend on the iteration order.
+
+    ``reduction`` allows specification of an optional reduction operation,
+    which is applied to all values in ``updates`` tensor into ``output`` at
+    the specified ``indices``. In cases where ``reduction`` is set to
+    "none", indices should not have duplicate entries: that is, if idx1 !=
+    idx2, then indices[idx1] != indices[idx2]. This ensures that the output
+    value does not depend on the iteration order. When ``reduction`` is set
+    to "add", ``output`` is calculated as follows: output = np.copy(data)
+    update_indices = indices.shape[:-1] for idx in
+    np.ndindex(update_indices): output[indices[idx]] += updates[idx] When
+    ``reduction`` is set to "mul", ``output`` is calculated as follows:
+    output = np.copy(data) update_indices = indices.shape[:-1] for idx in
+    np.ndindex(update_indices): output[indices[idx]] \*= updates[idx] This
+    operator is the inverse of GatherND. Example 1:
+
+    ::
+
+         data    = [1, 2, 3, 4, 5, 6, 7, 8]
+         indices = [[4], [3], [1], [7]]
+         updates = [9, 10, 11, 12]
+         output  = [1, 11, 3, 10, 9, 6, 7, 12]
+
     Example 2:
-    ```
-      data    = [[[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
-                 [[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
-                 [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]],
-                 [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]]]
-      indices = [[0], [2]]
-      updates = [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-                 [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]]]
-      output  = [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-                 [[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
-                 [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]],
-                 [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]]]
-    ```
+
+    ::
+
+         data    = [[[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
+                    [[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
+                    [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]],
+                    [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]]]
+         indices = [[0], [2]]
+         updates = [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+                    [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]]]
+         output  = [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+                    [[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
+                    [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]],
+                    [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]]]
 
     Parameters
     ==========
@@ -11556,7 +13080,9 @@ def scatter_nd(
         Tensor of rank q + r - indices_shape[-1] - 1.
     reduction
         Attribute.
-        Type of reduction to apply: none (default), add, mul. 'none': no reduction applied. 'add':  reduction using the addition operation. 'mul': reduction using the multiplication operation.
+        Type of reduction to apply: none (default), add, mul. 'none': no
+        reduction applied. 'add': reduction using the addition operation. 'mul':
+        reduction using the multiplication operation.
 
     Returns
     =======
@@ -11592,8 +13118,8 @@ def selu(
     r"""
     Selu takes one input data (Tensor<T>) and produces one output data
     (Tensor<T>) where the scaled exponential linear unit function,
-    `y = gamma * (alpha * e^x - alpha) for x <= 0`, `y = gamma * x for x > 0`,
-    is applied to the tensor elementwise.
+    ``y = gamma * (alpha * e^x - alpha) for x <= 0``,
+    ``y = gamma * x for x > 0``, is applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -11602,10 +13128,12 @@ def selu(
         Input tensor
     alpha
         Attribute.
-        Coefficient of SELU default to 1.67326319217681884765625 (i.e., float32 approximation of 1.6732632423543772848170429916717).
+        Coefficient of SELU default to 1.67326319217681884765625 (i.e., float32
+        approximation of 1.6732632423543772848170429916717).
     gamma
         Attribute.
-        Coefficient of SELU default to 1.05070102214813232421875 (i.e., float32 approximation of 1.0507009873554804934193349852946).
+        Coefficient of SELU default to 1.05070102214813232421875 (i.e., float32
+        approximation of 1.0507009873554804934193349852946).
 
     Returns
     =======
@@ -11637,8 +13165,9 @@ def sequence_at(
 ) -> Var:
     r"""
     Outputs a tensor copy from the tensor at 'position' in 'input_sequence'.
-    Accepted range for 'position' is in `[-n, n - 1]`, where `n` is the number of tensors in 'input_sequence'.
-    Negative value means counting positions from the back.
+    Accepted range for 'position' is in ``[-n, n - 1]``, where ``n`` is the
+    number of tensors in 'input_sequence'. Negative value means counting
+    positions from the back.
 
     Parameters
     ==========
@@ -11647,7 +13176,11 @@ def sequence_at(
         Input sequence.
     position
         Type I.
-        Position of the tensor in the sequence. Negative value means counting positions from the back. Accepted range in `[-n, n - 1]`, where `n` is the number of tensors in 'input_sequence'. It is an error if any of the index values are out of bounds. It must be a scalar(tensor of empty shape).
+        Position of the tensor in the sequence. Negative value means counting
+        positions from the back. Accepted range in ``[-n, n - 1]``, where ``n``
+        is the number of tensors in 'input_sequence'. It is an error if any of
+        the index values are out of bounds. It must be a scalar(tensor of empty
+        shape).
 
     Returns
     =======
@@ -11677,8 +13210,8 @@ def sequence_construct(
     inputs: Sequence[Var],
 ) -> Var:
     r"""
-    Construct a tensor sequence containing 'inputs' tensors.
-    All tensors in 'inputs' must have the same data type.
+    Construct a tensor sequence containing 'inputs' tensors. All tensors in
+    'inputs' must have the same data type.
 
     Parameters
     ==========
@@ -11719,7 +13252,8 @@ def sequence_empty(
     ==========
     dtype
         Attribute.
-        (Optional) The data type of the tensors in the output sequence. The default type is 'float'.
+        (Optional) The data type of the tensors in the output sequence. The
+        default type is 'float'.
 
     Returns
     =======
@@ -11747,10 +13281,11 @@ def sequence_erase(
     position: Optional[Var] = None,
 ) -> Var:
     r"""
-    Outputs a tensor sequence that removes the tensor at 'position' from 'input_sequence'.
-    Accepted range for 'position' is in `[-n, n - 1]`, where `n` is the number of tensors in 'input_sequence'.
-    Negative value means counting positions from the back.
-    'position' is optional, by default it erases the last tensor from 'input_sequence'.
+    Outputs a tensor sequence that removes the tensor at 'position' from
+    'input_sequence'. Accepted range for 'position' is in ``[-n, n - 1]``,
+    where ``n`` is the number of tensors in 'input_sequence'. Negative value
+    means counting positions from the back. 'position' is optional, by
+    default it erases the last tensor from 'input_sequence'.
 
     Parameters
     ==========
@@ -11759,7 +13294,11 @@ def sequence_erase(
         Input sequence.
     position
         Type I.
-        Position of the tensor in the sequence. Negative value means counting positions from the back. Accepted range in `[-n, n - 1]`, where `n` is the number of tensors in 'input_sequence'. It is an error if any of the index values are out of bounds. It must be a scalar(tensor of empty shape).
+        Position of the tensor in the sequence. Negative value means counting
+        positions from the back. Accepted range in ``[-n, n - 1]``, where ``n``
+        is the number of tensors in 'input_sequence'. It is an error if any of
+        the index values are out of bounds. It must be a scalar(tensor of empty
+        shape).
 
     Returns
     =======
@@ -11790,11 +13329,12 @@ def sequence_insert(
     position: Optional[Var] = None,
 ) -> Var:
     r"""
-    Outputs a tensor sequence that inserts 'tensor' into 'input_sequence' at 'position'.
-    'tensor' must have the same data type as 'input_sequence'.
-    Accepted range for 'position' is in `[-n, n]`, where `n` is the number of tensors in 'input_sequence'.
-    Negative value means counting positions from the back.
-    'position' is optional, by default it inserts 'tensor' to the back of 'input_sequence'.
+    Outputs a tensor sequence that inserts 'tensor' into 'input_sequence' at
+    'position'. 'tensor' must have the same data type as 'input_sequence'.
+    Accepted range for 'position' is in ``[-n, n]``, where ``n`` is the
+    number of tensors in 'input_sequence'. Negative value means counting
+    positions from the back. 'position' is optional, by default it inserts
+    'tensor' to the back of 'input_sequence'.
 
     Parameters
     ==========
@@ -11806,7 +13346,12 @@ def sequence_insert(
         Input tensor to be inserted into the input sequence.
     position
         Type I.
-        Position in the sequence where the new tensor is inserted. It is optional and default is to insert to the back of the sequence. Negative value means counting positions from the back. Accepted range in `[-n, n]`, where `n` is the number of tensors in 'input_sequence'. It is an error if any of the index values are out of bounds. It must be a scalar(tensor of empty shape).
+        Position in the sequence where the new tensor is inserted. It is
+        optional and default is to insert to the back of the sequence. Negative
+        value means counting positions from the back. Accepted range in
+        ``[-n, n]``, where ``n`` is the number of tensors in 'input_sequence'.
+        It is an error if any of the index values are out of bounds. It must be
+        a scalar(tensor of empty shape).
 
     Returns
     =======
@@ -11837,7 +13382,8 @@ def sequence_length(
     input_sequence: Var,
 ) -> Var:
     r"""
-    Produces a scalar(tensor of empty shape) containing the number of tensors in 'input_sequence'.
+    Produces a scalar(tensor of empty shape) containing the number of
+    tensors in 'input_sequence'.
 
     Parameters
     ==========
@@ -11875,16 +13421,21 @@ def sequence_map(
 ) -> Sequence[Var]:
     r"""
     Applies a sub-graph to each sample in the input sequence(s).
-    Inputs can be either tensors or sequences, with the exception of the first input which must
-    be a sequence. The length of the first input sequence will determine the number of samples in the
-    outputs. Any other sequence inputs should have the same number of samples. The number of inputs
-    and outputs, should match the one of the subgraph.
-    For each i-th element in the output, a sample will be extracted from the input sequence(s) at
-    the i-th position and the sub-graph will be applied to it.
-    The outputs will contain the outputs of the sub-graph for each sample, in the same order as in
-    the input.
-    This operator assumes that processing each sample is independent and could executed in parallel
-    or in any order. Users cannot expect any specific ordering in which each subgraph is computed.
+
+    Inputs can be either tensors or sequences, with the exception of the
+    first input which must be a sequence. The length of the first input
+    sequence will determine the number of samples in the outputs. Any other
+    sequence inputs should have the same number of samples. The number of
+    inputs and outputs, should match the one of the subgraph.
+
+    For each i-th element in the output, a sample will be extracted from the
+    input sequence(s) at the i-th position and the sub-graph will be applied
+    to it. The outputs will contain the outputs of the sub-graph for each
+    sample, in the same order as in the input.
+
+    This operator assumes that processing each sample is independent and
+    could be executed in parallel or in any order. Users cannot expect any
+    specific ordering in which each subgraph is computed.
 
     Parameters
     ==========
@@ -11896,7 +13447,9 @@ def sequence_map(
         Additional inputs to the graph
     body
         Attribute.
-        The graph to be run for each sample in the sequence(s). It should have as many inputs and outputs as inputs and outputs to the SequenceMap function.
+        The graph to be run for each sample in the sequence(s). It should have
+        as many inputs and outputs as inputs and outputs to the SequenceMap
+        function.
 
     Returns
     =======
@@ -11939,31 +13492,27 @@ def shape(
     start: int = 0,
 ) -> Var:
     r"""
-    Takes a tensor as input and outputs an 1D int64 tensor containing the shape of the input tensor.
-    Optional attributes start and end can be used to compute a slice of the input tensor's shape.
-    If start axis is omitted, the slice starts from axis 0.
-    The end axis, if specified, is exclusive (and the returned value will not include the size of that axis).
-    If the end axis is omitted, the axes upto the last one will be included.
-    Negative axes indicate counting back from the last axis.
-    Note that axes will be clamped to the range [0, r-1], where r is the
-    rank of the input tensor if they are out-of-range (after adding r in the case of
-    negative axis). Thus, specifying any end value > r is equivalent to specifying an end
-    value of r, and specifying any start value < -r is equivalent to specifying a start
-    value of 0.
-    For example:
-    Input tensor with shape: [2, 3, 4]
-    No attributes specified.
+    Takes a tensor as input and outputs a 1D int64 tensor containing the
+    shape of the input tensor. Optional attributes start and end can be used
+    to compute a slice of the input tensor's shape. If start axis is
+    omitted, the slice starts from axis 0. The end axis, if specified, is
+    exclusive (and the returned value will not include the size of that
+    axis). If the end axis is omitted, the axes up to the last one will be
+    included. Negative axes indicate counting back from the last axis. Note
+    that axes will be clamped to the range [0, r-1], where r is the rank of
+    the input tensor if they are out-of-range (after adding r in the case of
+    negative axis). Thus, specifying any end value > r is equivalent to
+    specifying an end value of r, and specifying any start value < -r is
+    equivalent to specifying a start value of 0.
+
+    For example: Input tensor with shape: [2, 3, 4] No attributes specified.
     Output: [2, 3, 4]
-    Input tensor with shape: [2, 3, 4]
-    start: -1
-    Output: [4]
-    Input tensor with shape: [2, 3, 4]
-    end: -1
-    Output: [2, 3]
-    Input tensor with shape: [2, 3, 4]
-    start: 1
-    end: 2
-    Output: [3]
+
+    Input tensor with shape: [2, 3, 4] start: -1 Output: [4]
+
+    Input tensor with shape: [2, 3, 4] end: -1 Output: [2, 3]
+
+    Input tensor with shape: [2, 3, 4] start: 1 end: 2 Output: [3]
 
     Parameters
     ==========
@@ -11972,10 +13521,13 @@ def shape(
         An input tensor.
     end
         Attribute.
-        (Optional) Ending axis for slicing the shape. Negative value means counting dimensions from the back. If omitted, sizes of all axes upto (including) the last one will be included.
+        (Optional) Ending axis for slicing the shape. Negative value means
+        counting dimensions from the back. If omitted, sizes of all axes up to
+        (including) the last one will be included.
     start
         Attribute.
-        (Optional) Starting axis for slicing the shape. Default value is 0.Negative value means counting dimensions from the back.
+        (Optional) Starting axis for slicing the shape. Default value is 0.
+        Negative value means counting dimensions from the back.
 
     Returns
     =======
@@ -12009,9 +13561,9 @@ def shrink(
     lambd: float = 0.5,
 ) -> Var:
     r"""
-    Shrink takes one input data (Tensor<numeric>) and produces one Tensor output,
-    having same datatype and shape with input. It has two attributes, lambd and
-    bias. The formula of this operator is: If x < -lambd, y = x + bias;
+    Shrink takes one input data (Tensor<numeric>) and produces one Tensor output,
+    having same datatype and shape with input. It has two attributes, lambd
+    and bias. The formula of this operator is: If x < -lambd, y = x + bias;
     If x > lambd, y = x - bias; Otherwise, y = 0.
 
     Parameters
@@ -12055,8 +13607,8 @@ def sigmoid(
 ) -> Var:
     r"""
     Sigmoid takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the sigmoid function, y = 1 / (1 + exp(-x)), is applied to the
-    tensor elementwise.
+    (Tensor<T>) where the sigmoid function, y = 1 / (1 + exp(-x)), is
+    applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -12089,8 +13641,8 @@ def sign(
     input: Var,
 ) -> Var:
     r"""
-    Calculate the sign of the given input tensor element-wise.
-    If input > 0, output 1. if input < 0, output -1. if input == 0, output 0.
+    Calculate the sign of the given input tensor element-wise. If input > 0,
+    output 1. if input < 0, output -1. if input == 0, output 0.
 
     Parameters
     ==========
@@ -12102,7 +13654,8 @@ def sign(
     =======
     output : Var
         Type T.
-        The sign of the input tensor computed element-wise. It has the same shape and type of the input.
+        The sign of the input tensor computed element-wise. It has the same
+        shape and type of the input.
 
     Notes
     =====
@@ -12189,7 +13742,8 @@ def size(
     data: Var,
 ) -> Var:
     r"""
-    Takes a tensor as input and outputs a int64 scalar that equals to the total number of elements of the input tensor.
+    Takes a tensor as input and outputs an int64 scalar that equals the
+    total number of elements of the input tensor.
 
     Parameters
     ==========
@@ -12227,51 +13781,48 @@ def slice(
     steps: Optional[Var] = None,
 ) -> Var:
     r"""
-    Produces a slice of the input tensor along multiple axes. Similar to numpy:
+    Produces a slice of the input tensor along multiple axes. Similar to
+    numpy:
     https://numpy.org/doc/stable/user/basics.indexing.html?highlight=slice#slicing-and-striding
-    Slice uses the `starts`, `ends`, `axes` and `steps` inputs to select a sub-tensor
-    of its input `data` tensor.
-    An effective `start[i]`, `end[i]`, and `step[i]` must be computed for each `i`
-    in `[0, ... r-1]` where `r = rank(input)` as follows:
-    If `axes` are omitted, they are set to `[0, ..., r-1]`.
-    If `steps` are omitted, they are set to `[1, ..., 1]` of length `len(starts)`
-    The effective values are initialized as `start[i] = 0`, `end[i] = dims[i]` where
-    `dims` are the dimensions of `input` and `step[i] = `1.
-    All negative elements of `axes` are made non-negatve by adding `r` to them, where
-    `r =rank(input)`.
-    All negative values in `starts[i]` and `ends[i]` have `dims[axes[i]]` added to them,
-    where `dims` are the dimensions of `input`. Then `start[axes[i]]` is the adjusted
-    `starts[i]` is clamped into the range `[0, dims[axes[i]]]` for positive stepping
-    and `[0, dims[axes[i]]-1]` for negative stepping.
-    The clamping for the adjusted `ends[i]` depends on the sign of `steps[i]` and must
-    accommodate copying 0 through `dims[axes[i]]` elements, so for positive stepping
-    `end[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it
-    is clamped to `[-1, dims[axes[i]]-1]`.
-    Finally, `step[axes[i]] = steps[i]`.
-    For slicing to the end of a dimension with unknown size, it is recommended to pass
-    in `INT_MAX` when slicing forward and 'INT_MIN' when slicing backward.
-    Example 1:
-      data = [
-          [1, 2, 3, 4],
-          [5, 6, 7, 8],
-      ]
-      axes = [0, 1]
-      starts = [1, 0]
-      ends = [2, 3]
-      steps = [1, 2]
-      result = [
-          [5, 7],
-      ]
-    Example 2:
-      data = [
-          [1, 2, 3, 4],
-          [5, 6, 7, 8],
-      ]
-      starts = [0, 1]
-      ends = [-1, 1000]
-      result = [
-          [2, 3, 4],
-      ]
+
+    Slice uses the ``starts``, ``ends``, ``axes`` and ``steps`` inputs to
+    select a sub-tensor of its input ``data`` tensor.
+
+    An effective ``start[i]``, ``end[i]``, and ``step[i]`` must be computed
+    for each ``i`` in ``[0, ... r-1]`` where ``r = rank(input)`` as follows:
+
+    If ``axes`` are omitted, they are set to ``[0, ..., r-1]``. If ``steps``
+    are omitted, they are set to ``[1, ..., 1]`` of length ``len(starts)``
+
+    The effective values are initialized as ``start[i] = 0``,
+    ``end[i] = dims[i]`` where ``dims`` are the dimensions of ``input`` and
+    ``step[i] = 1``.
+
+    All negative elements of ``axes`` are made non-negative by adding ``r``
+    to them, where ``r = rank(input)``.
+
+    All negative values in ``starts[i]`` and ``ends[i]`` have
+    ``dims[axes[i]]`` added to them, where ``dims`` are the dimensions of
+    ``input``. Then ``start[axes[i]]`` is the adjusted ``starts[i]`` is
+    clamped into the range ``[0, dims[axes[i]]]`` for positive stepping and
+    ``[0, dims[axes[i]]-1]`` for negative stepping.
+
+    The clamping for the adjusted ``ends[i]`` depends on the sign of
+    ``steps[i]`` and must accommodate copying 0 through ``dims[axes[i]]``
+    elements, so for positive stepping ``end[axes[i]]`` is clamped to
+    ``[0, dims[axes[i]]]``, while for negative stepping it is clamped to
+    ``[-1, dims[axes[i]]-1]``.
+
+    Finally, ``step[axes[i]] = steps[i]``.
+
+    For slicing to the end of a dimension with unknown size, it is
+    recommended to pass in ``INT_MAX`` when slicing forward and 'INT_MIN'
+    when slicing backward.
+
+    Example 1: data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] axes = [0, 1] starts =
+    [1, 0] ends = [2, 3] steps = [1, 2] result = [ [5, 7], ] Example 2: data
+    = [ [1, 2, 3, 4], [5, 6, 7, 8], ] starts = [0, 1] ends = [-1, 1000]
+    result = [ [2, 3, 4], ]
 
     Parameters
     ==========
@@ -12280,16 +13831,20 @@ def slice(
         Tensor of data to extract slices from.
     starts
         Type Tind.
-        1-D tensor of starting indices of corresponding axis in `axes`
+        1-D tensor of starting indices of corresponding axis in ``axes``
     ends
         Type Tind.
-        1-D tensor of ending indices (exclusive) of corresponding axis in `axes`
+        1-D tensor of ending indices (exclusive) of corresponding axis in
+        ``axes``
     axes
         Type Tind.
-        1-D tensor of axes that `starts` and `ends` apply to. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data). Behavior is undefined if an axis is repeated.
+        1-D tensor of axes that ``starts`` and ``ends`` apply to. Negative value
+        means counting dimensions from the back. Accepted range is [-r, r-1]
+        where r = rank(data). Behavior is undefined if an axis is repeated.
     steps
         Type Tind.
-        1-D tensor of slice step of corresponding axis in `axes`. Negative value means slicing backward. 'steps' cannot be 0. Defaults to 1s.
+        1-D tensor of slice step of corresponding axis in ``axes``. Negative
+        value means slicing backward. 'steps' cannot be 0. Defaults to 1s.
 
     Returns
     =======
@@ -12323,11 +13878,15 @@ def softmax(
     axis: int = -1,
 ) -> Var:
     r"""
-    The operator computes the normalized exponential values for the given input:
-     Softmax(input, axis) = Exp(input) / ReduceSum(Exp(input), axis=axis, keepdims=1)
-    The "axis" attribute indicates the dimension along which Softmax
-    will be performed. The output tensor has the same shape
-    and contains the Softmax values of the corresponding input.
+    The operator computes the normalized exponential values for the given
+    input:
+
+    Softmax(input, axis) = Exp(input) / ReduceSum(Exp(input), axis=axis,
+    keepdims=1)
+
+    The "axis" attribute indicates the dimension along which Softmax will be
+    performed. The output tensor has the same shape and contains the Softmax
+    values of the corresponding input.
 
     Parameters
     ==========
@@ -12336,9 +13895,9 @@ def softmax(
         The input tensor of rank >= axis.
     axis
         Attribute.
-        Describes the dimension Softmax will be performed on.
-        Negative value means counting dimensions
-        from the back. Accepted range is [-r, r-1] where r = rank(input).
+        Describes the dimension Softmax will be performed on. Negative value
+        means counting dimensions from the back. Accepted range is [-r, r-1]
+        where r = rank(input).
 
     Returns
     =======
@@ -12372,59 +13931,78 @@ def softmax_cross_entropy_loss(
     reduction: str = "mean",
 ) -> Tuple[Var, Var]:
     r"""
-    Loss function that measures the softmax cross entropy
-    between 'scores' and 'labels'.
-    This operator first computes a loss tensor whose shape is identical to the labels input.
-    If the input is 2-D with shape (N, C), the loss tensor may be a N-element vector L = (l_1, l_2, ..., l_N).
-    If the input is N-D tensor with shape (N, C, D1, D2, ..., Dk),
-    the loss tensor L may have (N, D1, D2, ..., Dk) as its shape and L[i,][j_1][j_2]...[j_k] denotes a scalar element in L.
-    After L is available, this operator can optionally do a reduction operator.
-    shape(scores): (N, C) where C is the number of classes, or (N, C, D1, D2,..., Dk),
-            with K >= 1 in case of K-dimensional loss.
-    shape(labels): (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk),
-            with K >= 1 in case of K-dimensional loss.
+    Loss function that measures the softmax cross entropy between 'scores'
+    and 'labels'. This operator first computes a loss tensor whose shape is
+    identical to the labels input. If the input is 2-D with shape (N, C),
+    the loss tensor may be a N-element vector L = (l_1, l_2, ..., l_N). If
+    the input is N-D tensor with shape (N, C, D1, D2, ..., Dk), the loss
+    tensor L may have (N, D1, D2, ..., Dk) as its shape and
+    L[i,][j_1][j_2]...[j_k] denotes a scalar element in L. After L is
+    available, this operator can optionally do a reduction operator.
+
+    shape(scores): (N, C) where C is the number of classes, or (N, C, D1,
+    D2,..., Dk), with K >= 1 in case of K-dimensional loss. shape(labels):
+    (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk),
+    with K >= 1 in case of K-dimensional loss.
+
     The loss for one sample, l_i, can caculated as follows:
-        l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of classes.
-    or
-        l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk] * weights[c], if 'weights' is provided.
+    l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of
+    classes, or l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk] \* weights[c],
+    if 'weights' is provided.
+
     loss is zero for the case when label-value equals ignore_index.
-        l[i][d1][d2]...[dk]  = 0, when labels[n][d1][d2]...[dk] = ignore_index
-    where:
-        p = Softmax(scores)
-        y = Log(p)
-        c = labels[i][d1][d2]...[dk]
-    Finally, L is optionally reduced:
-    If reduction = 'none', the output is L with shape (N, D1, D2, ..., Dk).
-    If reduction = 'sum', the output is scalar: Sum(L).
-    If reduction = 'mean', the output is scalar: ReduceMean(L), or if weight is provided: ReduceSum(L) / ReduceSum(W),
-    where tensor W is of shape (N, D1, D2, ..., Dk) and W[n][d1][d2]...[dk] = weights[labels[i][d1][d2]...[dk]].
+    l[i][d1][d2]...[dk] = 0, when labels[n][d1][d2]...[dk] = ignore_index
+
+    where: p = Softmax(scores), y = Log(p), c = labels[i][d1][d2]...[dk]
+
+    Finally, L is optionally reduced: If reduction = 'none', the output is L
+    with shape (N, D1, D2, ..., Dk). If reduction = 'sum', the output is
+    scalar: Sum(L). If reduction = 'mean', the output is scalar:
+    ReduceMean(L), or if weight is provided: ReduceSum(L) / ReduceSum(W),
+    where tensor W is of shape (N, D1, D2, ..., Dk) and W[n][d1][d2]...[dk]
+    = weights[labels[i][d1][d2]...[dk]].
 
     Parameters
     ==========
     scores
         Type T.
-        The predicted outputs with shape [batch_size, class_size], or [batch_size, class_size, D1, D2 , ..., Dk], where K is the number of dimensions.
+        The predicted outputs with shape [batch_size, class_size], or
+        [batch_size, class_size, D1, D2 , ..., Dk], where K is the number of
+        dimensions.
     labels
         Type Tind.
-        The ground truth output tensor, with shape [batch_size], or [batch_size, D1, D2, ..., Dk], where K is the number of dimensions. Labels element value shall be in range of [0, C). If ignore_index is specified, it may have a value outside [0, C) and the label values should either be in the range [0, C) or have the value ignore_index.
+        The ground truth output tensor, with shape [batch_size], or [batch_size,
+        D1, D2, ..., Dk], where K is the number of dimensions. Labels element
+        value shall be in range of [0, C). If ignore_index is specified, it may
+        have a value outside [0, C) and the label values should either be in the
+        range [0, C) or have the value ignore_index.
     weights
         Type T.
-        A manual rescaling weight given to each class. If given, it has to be a 1D Tensor assigning weight to each of the classes. Otherwise, it is treated as if having all ones.
+        A manual rescaling weight given to each class. If given, it has to be a
+        1D Tensor assigning weight to each of the classes. Otherwise, it is
+        treated as if having all ones.
     ignore_index
         Attribute.
-        Specifies a target value that is ignored and does not contribute to the input gradient. It's an optional value.
+        Specifies a target value that is ignored and does not contribute to the
+        input gradient. It's an optional value.
     reduction
         Attribute.
-        Type of reduction to apply to loss: none, sum, mean(default). 'none': no reduction will be applied, 'sum': the output will be summed. 'mean': the sum of the output will be divided by the number of elements in the output.
+        Type of reduction to apply to loss: none, sum, mean(default). 'none': no
+        reduction will be applied, 'sum': the output will be summed. 'mean': the
+        sum of the output will be divided by the number of elements in the
+        output.
 
     Returns
     =======
     output : Var
         Type T.
-        Weighted loss float Tensor. If reduction is 'none', this has the shape of [batch_size], or [batch_size, D1, D2, ..., Dk] in case of K-dimensional loss. Otherwise, it is a scalar.
+        Weighted loss float Tensor. If reduction is 'none', this has the shape
+        of [batch_size], or [batch_size, D1, D2, ..., Dk] in case of
+        K-dimensional loss. Otherwise, it is a scalar.
     log_prob : Var
         Type T.
-        Log probability tensor. If the output of softmax is prob, its value is log(prob).
+        Log probability tensor. If the output of softmax is prob, its value is
+        log(prob).
 
     Notes
     =====
@@ -12452,8 +14030,8 @@ def softplus(
 ) -> Var:
     r"""
     Softplus takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the softplus function, y = ln(exp(x) + 1), is applied to
-    the tensor elementwise.
+    (Tensor<T>) where the softplus function, y = ln(exp(x) + 1), is applied
+    to the tensor elementwise.
 
     Parameters
     ==========
@@ -12486,7 +14064,8 @@ def softsign(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the softsign (x/(1+|x|)) of the given input tensor element-wise.
+    Calculates the softsign (x/(1+|x|)) of the given input tensor
+    element-wise.
 
     Parameters
     ==========
@@ -12498,7 +14077,8 @@ def softsign(
     =======
     output : Var
         Type T.
-        The softsign (x/(1+|x|)) values of the input tensor computed element-wise
+        The softsign (x/(1+|x|)) values of the input tensor computed
+        element-wise
 
     Notes
     =====
@@ -12521,15 +14101,16 @@ def space_to_depth(
     blocksize: int,
 ) -> Var:
     r"""
-    SpaceToDepth rearranges blocks of spatial data into depth. More specifically,
-    this op outputs a copy of the input tensor where values from the height and width dimensions
-    are moved to the depth dimension.
+    SpaceToDepth rearranges blocks of spatial data into depth. More
+    specifically, this op outputs a copy of the input tensor where values
+    from the height and width dimensions are moved to the depth dimension.
 
     Parameters
     ==========
     input
         Type T.
-        Input tensor of [N,C,H,W], where N is the batch axis, C is the channel or depth, H is the height and W is the width.
+        Input tensor of [N,C,H,W], where N is the batch axis, C is the channel
+        or depth, H is the height and W is the width.
     blocksize
         Attribute.
         Blocks of [blocksize, blocksize] are moved.
@@ -12538,7 +14119,8 @@ def space_to_depth(
     =======
     output : Var
         Type T.
-        Output tensor of [N, C * blocksize * blocksize, H/blocksize, W/blocksize].
+        Output tensor of [N, C \* blocksize \* blocksize, H/blocksize,
+        W/blocksize].
 
     Notes
     =====
@@ -12565,9 +14147,9 @@ def split(
     axis: int = 0,
 ) -> Sequence[Var]:
     r"""
-    Split a tensor into a list of tensors, along the specified
-    'axis'. Lengths of the parts can be specified using input 'split'.
-    Otherwise, the tensor is split to equal sized parts.
+    Split a tensor into a list of tensors, along the specified 'axis'.
+    Lengths of the parts can be specified using input 'split'. Otherwise,
+    the tensor is split to equal sized parts.
 
     Parameters
     ==========
@@ -12576,10 +14158,12 @@ def split(
         The tensor to split
     split
         Type tensor(int64).
-        Optional length of each output. Values should be >= 0.Sum of the values must be equal to the dim value at 'axis' specified.
+        Optional length of each output. Values should be >= 0. Sum of the values
+        must be equal to the dim value at 'axis' specified.
     axis
         Attribute.
-        Which axis to split on. A negative value means counting dimensions from the back. Accepted range is [-rank, rank-1] where r = rank(input).
+        Which axis to split on. A negative value means counting dimensions from
+        the back. Accepted range is [-rank, rank-1] where r = rank(input).
     outputs_count
         Specifies the number of variadic outputs of this operator.
         Non-standard parameter created by the opset generator, as inference (a solution) it was not implemented or is impossible.
@@ -12617,16 +14201,16 @@ def split_to_sequence(
     keepdims: int = 1,
 ) -> Var:
     r"""
-    Split a tensor into a sequence of tensors, along the specified
-    'axis'. Lengths of the parts can be specified using argument 'split'.
-    'split' must contain only positive numbers.
-    'split' is either a scalar (tensor of empty shape), or a 1-D tensor.
-    If 'split' is a scalar, then 'input' will be split into equally sized chunks(if possible).
-    Last chunk will be smaller if the 'input' size along the given axis 'axis' is not divisible
-    by 'split'.
-    Otherwise, the tensor is split into 'size(split)' chunks, with lengths of the parts on 'axis'
-    specified in 'split'. In this scenario, the sum of entries in 'split' must be equal to the
-    dimension size of input tensor on 'axis'.
+    Split a tensor into a sequence of tensors, along the specified 'axis'.
+    Lengths of the parts can be specified using argument 'split'. 'split'
+    must contain only positive numbers. 'split' is either a scalar (tensor
+    of empty shape), or a 1-D tensor. If 'split' is a scalar, then 'input'
+    will be split into equally sized chunks(if possible). Last chunk will be
+    smaller if the 'input' size along the given axis 'axis' is not divisible
+    by 'split'. Otherwise, the tensor is split into 'size(split)' chunks,
+    with lengths of the parts on 'axis' specified in 'split'. In this
+    scenario, the sum of entries in 'split' must be equal to the dimension
+    size of input tensor on 'axis'.
 
     Parameters
     ==========
@@ -12635,13 +14219,16 @@ def split_to_sequence(
         The tensor to split
     split
         Type I.
-        Length of each output. It can be either a scalar(tensor of empty shape), or a 1-D tensor. All values must be >= 0.
+        Length of each output. It can be either a scalar(tensor of empty shape),
+        or a 1-D tensor. All values must be >= 0.
     axis
         Attribute.
-        Which axis to split on. A negative value means counting dimensions from the back. Accepted range is [-rank, rank-1].
+        Which axis to split on. A negative value means counting dimensions from
+        the back. Accepted range is [-rank, rank-1].
     keepdims
         Attribute.
-        Keep the split dimension or not. Default 1, which means we keep split dimension. If input 'split' is specified, this attribute is ignored.
+        Keep the split dimension or not. Default 1, which means we keep split
+        dimension. If input 'split' is specified, this attribute is ignored.
 
     Returns
     =======
@@ -12674,9 +14261,9 @@ def sqrt(
     X: Var,
 ) -> Var:
     r"""
-    Square root takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the square root is, y = x^0.5, is applied to
-    the tensor elementwise. If x is negative, then it will return NaN.
+    Square root takes one input data (Tensor<T>) and produces one output
+    data (Tensor<T>) where the square root is, y = x^0.5, is applied to the
+    tensor elementwise. If x is negative, then it will return NaN.
 
     Parameters
     ==========
@@ -12710,10 +14297,11 @@ def squeeze(
     axes: Optional[Var] = None,
 ) -> Var:
     r"""
-    Remove single-dimensional entries from the shape of a tensor.
-    Takes an input `axes` with a list of axes to squeeze.
-    If `axes` is not provided, all the single dimensions will be removed from
-    the shape. If an axis is selected with shape entry not equal to one, an error is raised.
+    Remove single-dimensional entries from the shape of a tensor. Takes an
+    input ``axes`` with a list of axes to squeeze. If ``axes`` is not
+    provided, all the single dimensions will be removed from the shape. If
+    an axis is selected with shape entry not equal to one, an error is
+    raised.
 
     Parameters
     ==========
@@ -12722,7 +14310,9 @@ def squeeze(
         Tensors with at least max(dims) dimensions.
     axes
         Type tensor(int64).
-        List of integers indicating the dimensions to squeeze. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data).
+        List of integers indicating the dimensions to squeeze. Negative value
+        means counting dimensions from the back. Accepted range is [-r, r-1]
+        where r = rank(data).
 
     Returns
     =======
@@ -12755,15 +14345,15 @@ def string_normalizer(
     stopwords: Optional[Iterable[str]] = None,
 ) -> Var:
     r"""
-    StringNormalization performs string operations for basic cleaning.
-    This operator has only one input (denoted by X) and only one output
-    (denoted by Y). This operator first examines the elements in the X,
-    and removes elements specified in "stopwords" attribute.
-    After removing stop words, the intermediate result can be further lowercased,
-    uppercased, or just returned depending the "case_change_action" attribute.
-    This operator only accepts [C]- and [1, C]-tensor.
-    If all elements in X are dropped, the output will be the empty value of string tensor with shape [1]
-    if input shape is [C] and shape [1, 1] if input shape is [1, C].
+    StringNormalization performs string operations for basic cleaning. This
+    operator has only one input (denoted by X) and only one output (denoted
+    by Y). This operator first examines the elements in the X, and removes
+    elements specified in "stopwords" attribute. After removing stop words,
+    the intermediate result can be further lowercased, uppercased, or just
+    returned depending on the "case_change_action" attribute. This operator
+    only accepts [C]- and [1, C]-tensor. If all elements in X are dropped,
+    the output will be the empty value of string tensor with shape [1] if
+    input shape is [C] and shape [1, 1] if input shape is [1, C].
 
     Parameters
     ==========
@@ -12772,13 +14362,17 @@ def string_normalizer(
         UTF-8 strings to normalize
     case_change_action
         Attribute.
-        string enum that cases output to be lowercased/uppercases/unchanged. Valid values are "LOWER", "UPPER", "NONE". Default is "NONE"
+        string enum that causes output to be lowercased/uppercased/unchanged.
+        Valid values are "LOWER", "UPPER", "NONE". Default is "NONE"
     is_case_sensitive
         Attribute.
-        Boolean. Whether the identification of stop words in X is case-sensitive. Default is false
+        Boolean. Whether the identification of stop words in X is
+        case-sensitive. Default is false
     locale
         Attribute.
-        Environment dependent string that denotes the locale according to which output strings needs to be upper/lowercased.Default en_US or platform specific equivalent as decided by the implementation.
+        Environment dependent string that denotes the locale according to which
+        output strings needs to be upper/lowercased. Default en_US or platform
+        specific equivalent as decided by the implementation.
     stopwords
         Attribute.
         List of stop words. If not set, no word would be removed from X.
@@ -12812,9 +14406,15 @@ def sub(
     B: Var,
 ) -> Var:
     r"""
-    Performs element-wise binary subtraction (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
-    (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
+    Performs element-wise binary subtraction (with Numpy-style broadcasting
+    support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
+
+    (Opset 14 change): Extend supported types to include uint8, int8,
+    uint16, and int16.
 
     Parameters
     ==========
@@ -12851,9 +14451,11 @@ def sum(
     data_0: Sequence[Var],
 ) -> Var:
     r"""
-    Element-wise sum of each of the input tensors (with Numpy-style broadcasting support).
-    All inputs and outputs must have the same data type.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Element-wise sum of each of the input tensors (with Numpy-style
+    broadcasting support). All inputs and outputs must have the same data
+    type. This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
@@ -12919,7 +14521,8 @@ def tanh(
     input: Var,
 ) -> Var:
     r"""
-    Calculates the hyperbolic tangent of the given input tensor element-wise.
+    Calculates the hyperbolic tangent of the given input tensor
+    element-wise.
 
     Parameters
     ==========
@@ -12962,29 +14565,47 @@ def tf_idf_vectorizer(
     weights: Optional[Iterable[float]] = None,
 ) -> Var:
     r"""
-    This transform extracts n-grams from the input sequence and save them as a vector. Input can
-    be either a 1-D or 2-D tensor. For 1-D input, output is the n-gram representation of that input.
-    For 2-D input, the output is also a  2-D tensor whose i-th row is the n-gram representation of the i-th input row.
-    More specifically, if input shape is [C], the corresponding output shape would be [max(ngram_indexes) + 1].
-    If input shape is [N, C], this operator produces a [N, max(ngram_indexes) + 1]-tensor.
-    In contrast to standard n-gram extraction, here, the indexes of extracting an n-gram from the original
-    sequence are not necessarily consecutive numbers. The discontinuity between indexes are controlled by the number of skips.
-    If the number of skips is 2, we should skip two tokens when scanning through the original sequence.
-    Let's consider an example. Assume that input sequence is [94, 17, 36, 12, 28] and the number of skips is 2.
-    The associated 2-grams are [94, 12] and [17, 28] respectively indexed by [0, 3] and [1, 4].
-    If the number of skips becomes 0, the 2-grams generated are [94, 17], [17, 36], [36, 12], [12, 28]
-    indexed by [0, 1], [1, 2], [2, 3], [3, 4], respectively.
+    This transform extracts n-grams from the input sequence and save them as
+    a vector. Input can be either a 1-D or 2-D tensor. For 1-D input, output
+    is the n-gram representation of that input. For 2-D input, the output is
+    also a 2-D tensor whose i-th row is the n-gram representation of the
+    i-th input row. More specifically, if input shape is [C], the
+    corresponding output shape would be [max(ngram_indexes) + 1]. If input
+    shape is [N, C], this operator produces a [N, max(ngram_indexes) +
+    1]-tensor.
+
+    In contrast to standard n-gram extraction, here, the indexes of
+    extracting an n-gram from the original sequence are not necessarily
+    consecutive numbers. The discontinuity between indexes are controlled by
+    the number of skips. If the number of skips is 2, we should skip two
+    tokens when scanning through the original sequence. Let's consider an
+    example. Assume that input sequence is [94, 17, 36, 12, 28] and the
+    number of skips is 2. The associated 2-grams are [94, 12] and [17, 28]
+    respectively indexed by [0, 3] and [1, 4]. If the number of skips
+    becomes 0, the 2-grams generated are [94, 17], [17, 36], [36, 12], [12,
+    28] indexed by [0, 1], [1, 2], [2, 3], [3, 4], respectively.
+
     The output vector (denoted by Y) stores the count of each n-gram;
-    Y[ngram_indexes[i]] indicates the times that the i-th n-gram is found. The attribute ngram_indexes is used to determine the mapping
-    between index i and the corresponding n-gram's output coordinate. If pool_int64s is [94, 17, 17, 36], ngram_indexes is [1, 0],
-    ngram_counts=[0, 0], then the Y[0] (first element in Y) and Y[1] (second element in Y) are the counts of [17, 36] and [94, 17],
-    respectively. An n-gram which cannot be found in pool_strings/pool_int64s should be ignored and has no effect on the output.
-    Note that we may consider all skips up to S when generating the n-grams.
-    The examples used above are true if mode is "TF". If mode is "IDF", all the counts larger than 1 would be truncated to 1 and
-    the i-th element in weights would be used to scale (by multiplication) the count of the i-th n-gram in pool. If mode is "TFIDF",
-    this operator first computes the counts of all n-grams and then scale them by the associated values in the weights attribute.
-    Only one of pool_strings and pool_int64s can be set. If pool_int64s is set, the input should be an integer tensor.
-    If pool_strings is set, the input must be a string tensor.
+    Y[ngram_indexes[i]] indicates the times that the i-th n-gram is found.
+    The attribute ngram_indexes is used to determine the mapping between
+    index i and the corresponding n-gram's output coordinate. If pool_int64s
+    is [94, 17, 17, 36], ngram_indexes is [1, 0], ngram_counts=[0, 0], then
+    the Y[0] (first element in Y) and Y[1] (second element in Y) are the
+    counts of [17, 36] and [94, 17], respectively. An n-gram which cannot be
+    found in pool_strings/pool_int64s should be ignored and has no effect on
+    the output. Note that we may consider all skips up to S when generating
+    the n-grams.
+
+    The examples used above are true if mode is "TF". If mode is "IDF", all
+    the counts larger than 1 would be truncated to 1 and the i-th element in
+    weights would be used to scale (by multiplication) the count of the i-th
+    n-gram in pool. If mode is "TFIDF", this operator first computes the
+    counts of all n-grams and then scale them by the associated values in
+    the weights attribute.
+
+    Only one of pool_strings and pool_int64s can be set. If pool_int64s is
+    set, the input should be an integer tensor. If pool_strings is set, the
+    input must be a string tensor.
 
     Parameters
     ==========
@@ -12993,31 +14614,57 @@ def tf_idf_vectorizer(
         Input for n-gram extraction
     max_gram_length
         Attribute.
-        Maximum n-gram length. If this value is 3, 3-grams will be used to generate the output.
+        Maximum n-gram length. If this value is 3, 3-grams will be used to
+        generate the output.
     max_skip_count
         Attribute.
-        Maximum number of items (integers/strings) to be skipped when constructing an n-gram from X. If max_skip_count=1, min_gram_length=2, max_gram_length=3, this operator may generate 2-grams with skip_count=0 and skip_count=1, and 3-grams with skip_count=0 and skip_count=1
+        Maximum number of items (integers/strings) to be skipped when
+        constructing an n-gram from X. If max_skip_count=1, min_gram_length=2,
+        max_gram_length=3, this operator may generate 2-grams with skip_count=0
+        and skip_count=1, and 3-grams with skip_count=0 and skip_count=1
     min_gram_length
         Attribute.
-        Minimum n-gram length. If this value is 2 and max_gram_length is 3, output may contain counts of 2-grams and 3-grams.
+        Minimum n-gram length. If this value is 2 and max_gram_length is 3,
+        output may contain counts of 2-grams and 3-grams.
     mode
         Attribute.
-        The weighting criteria. It can be one of "TF" (term frequency), "IDF" (inverse document frequency), and "TFIDF" (the combination of TF and IDF)
+        The weighting criteria. It can be one of "TF" (term frequency), "IDF"
+        (inverse document frequency), and "TFIDF" (the combination of TF and
+        IDF)
     ngram_counts
         Attribute.
-        The starting indexes of 1-grams, 2-grams, and so on in pool. It is useful when determining the boundary between two consecutive collections of n-grams. For example, if ngram_counts is [0, 17, 36], the first index (zero-based) of 1-gram/2-gram/3-gram in pool are 0/17/36. This format is essentially identical to CSR (or CSC) sparse matrix format, and we choose to use this due to its popularity.
+        The starting indexes of 1-grams, 2-grams, and so on in pool. It is
+        useful when determining the boundary between two consecutive collections
+        of n-grams. For example, if ngram_counts is [0, 17, 36], the first index
+        (zero-based) of 1-gram/2-gram/3-gram in pool are 0/17/36. This format is
+        essentially identical to CSR (or CSC) sparse matrix format, and we
+        choose to use this due to its popularity.
     ngram_indexes
         Attribute.
-        list of int64s (type: AttributeProto::INTS). This list is parallel to the specified 'pool_*' attribute. The i-th element in ngram_indexes indicate the coordinate of the i-th n-gram in the output tensor.
+        list of int64s (type: AttributeProto::INTS). This list is parallel to
+        the specified 'pool_*' attribute. The i-th element in ngram_indexes
+        indicate the coordinate of the i-th n-gram in the output tensor.
     pool_int64s
         Attribute.
-        List of int64 n-grams learned from the training set. Either this or pool_strings attributes must be present but not both. It's an 1-D tensor starting with the collections of all 1-grams and ending with the collections of n-grams. The i-th element in pool stores the n-gram that should be mapped to coordinate ngram_indexes[i] in the output vector.
+        List of int64 n-grams learned from the training set. Either this or
+        pool_strings attributes must be present but not both. It's an 1-D tensor
+        starting with the collections of all 1-grams and ending with the
+        collections of n-grams. The i-th element in pool stores the n-gram that
+        should be mapped to coordinate ngram_indexes[i] in the output vector.
     pool_strings
         Attribute.
-        List of strings n-grams learned from the training set. Either this or pool_int64s attributes must be present but not both. It's an 1-D tensor starting with the collections of all 1-grams and ending with the collections of n-grams. The i-th element in pool stores the n-gram that should be mapped to coordinate ngram_indexes[i] in the output vector.
+        List of strings n-grams learned from the training set. Either this or
+        pool_int64s attributes must be present but not both. It's an 1-D tensor
+        starting with the collections of all 1-grams and ending with the
+        collections of n-grams. The i-th element in pool stores the n-gram that
+        should be mapped to coordinate ngram_indexes[i] in the output vector.
     weights
         Attribute.
-        list of floats. This attribute stores the weight of each n-gram in pool. The i-th element in weights is the weight of the i-th n-gram in pool. Its length equals to the size of ngram_indexes. By default, weights is an all-one tensor.This attribute is used when mode is "IDF" or "TFIDF" to scale the associated word counts.
+        list of floats. This attribute stores the weight of each n-gram in pool.
+        The i-th element in weights is the weight of the i-th n-gram in pool.
+        Its length equals to the size of ngram_indexes. By default, weights is
+        an all-one tensor. This attribute is used when mode is "IDF" or "TFIDF"
+        to scale the associated word counts.
 
     Returns
     =======
@@ -13057,9 +14704,9 @@ def thresholded_relu(
     alpha: float = 1.0,
 ) -> Var:
     r"""
-    ThresholdedRelu takes one input data (Tensor<T>) and produces one output data
-    (Tensor<T>) where the rectified linear function, y = x for x > alpha, y = 0 otherwise,
-    is applied to the tensor elementwise.
+    ThresholdedRelu takes one input data (Tensor<T>) and produces one output
+    data (Tensor<T>) where the rectified linear function, y = x for x >
+    alpha, y = 0 otherwise, is applied to the tensor elementwise.
 
     Parameters
     ==========
@@ -13098,9 +14745,9 @@ def tile(
     repeats: Var,
 ) -> Var:
     r"""
-    Constructs a tensor by tiling a given tensor.
-    This is the same as function `tile` in Numpy, but no broadcast.
-    For example A = [[1, 2], [3, 4]], B = [1, 2], tile(A, B) = [[1, 2, 1, 2], [3, 4, 3, 4]]
+    Constructs a tensor by tiling a given tensor. This is the same as
+    function ``tile`` in Numpy, but no broadcast. For example A = [[1, 2],
+    [3, 4]], B = [1, 2], tile(A, B) = [[1, 2, 1, 2], [3, 4, 3, 4]]
 
     Parameters
     ==========
@@ -13109,13 +14756,15 @@ def tile(
         Input tensor of any shape.
     repeats
         Type T1.
-        1D int64 tensor of the same length as input's dimension number, includes numbers of repeated copies along input's dimensions.
+        1D int64 tensor of the same length as input's dimension number, includes
+        numbers of repeated copies along input's dimensions.
 
     Returns
     =======
     output : Var
         Type T.
-        Output tensor of the same dimensions and type as tensor input. output_dim[i] = input_dim[i] * repeats[i]
+        Output tensor of the same dimensions and type as tensor input.
+        output_dim[i] = input_dim[i] \* repeats[i]
 
     Notes
     =====
@@ -13143,18 +14792,22 @@ def top_k(
     sorted: int = 1,
 ) -> Tuple[Var, Var]:
     r"""
-    Retrieve the top-K largest or smallest elements along a specified axis. Given an input tensor of
-    shape [a_1, a_2, ..., a_n, r] and integer argument k, return two outputs:
-      -Value tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n]
-        which contains the values of the top k elements along the specified axis
-      -Index tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which
-       contains the indices of the top k elements (original indices from the input
-       tensor).
-    If "largest" is 1 (the default value) then the k largest elements are returned.
-    If "sorted" is 1 (the default value) then the resulting k elements will be sorted.
-    If "sorted" is 0, order of returned 'Values' and 'Indices' are undefined.
-    Given two equivalent values, this operator uses the indices along the axis as
-     a tiebreaker. That is, the element with the lower index will appear first.
+    Retrieve the top-K largest or smallest elements along a specified axis.
+    Given an input tensor of shape [a_1, a_2, ..., a_n, r] and integer
+    argument k, return two outputs: -Value tensor of shape [a_1, a_2, ...,
+    a_{axis-1}, k, a_{axis+1}, ... a_n] which contains the values of the top
+    k elements along the specified axis -Index tensor of shape [a_1, a_2,
+    ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which contains the indices of
+    the top k elements (original indices from the input tensor).
+
+    If "largest" is 1 (the default value) then the k largest elements are
+    returned. If "sorted" is 1 (the default value) then the resulting k
+    elements will be sorted. If "sorted" is 0, order of returned 'Values'
+    and 'Indices' are undefined.
+
+    Given two equivalent values, this operator uses the indices along the
+    axis as a tiebreaker. That is, the element with the lower index will
+    appear first.
 
     Parameters
     ==========
@@ -13163,10 +14816,13 @@ def top_k(
         Tensor of shape [a_1, a_2, ..., a_n, r]
     K
         Type tensor(int64).
-        A 1-D tensor containing a single positive value corresponding to the number of top elements to retrieve
+        A 1-D tensor containing a single positive value corresponding to the
+        number of top elements to retrieve
     axis
         Attribute.
-        Dimension on which to do the sort. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+        Dimension on which to do the sort. Negative value means counting
+        dimensions from the back. Accepted range is [-r, r-1] where r =
+        rank(input).
     largest
         Attribute.
         Whether to return the top-K largest or smallest elements.
@@ -13178,10 +14834,12 @@ def top_k(
     =======
     Values : Var
         Type T.
-        Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] containing top K values from the input tensor
+        Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n]
+        containing top K values from the input tensor
     Indices : Var
         Type I.
-        Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] containing the corresponding input tensor indices for the top K values.
+        Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n]
+        containing the corresponding input tensor indices for the top K values.
 
     Notes
     =====
@@ -13211,8 +14869,8 @@ def transpose(
 ) -> Var:
     r"""
     Transpose the input tensor similar to numpy.transpose. For example, when
-    perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape
-    will be (2, 1, 3).
+    perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output
+    shape will be (2, 1, 3).
 
     Parameters
     ==========
@@ -13221,7 +14879,8 @@ def transpose(
         An input tensor.
     perm
         Attribute.
-        A list of integers. By default, reverse the dimensions, otherwise permute the axes according to the values given.
+        A list of integers. By default, reverse the dimensions, otherwise
+        permute the axes according to the values given.
 
     Returns
     =======
@@ -13253,18 +14912,23 @@ def trilu(
     upper: int = 1,
 ) -> Var:
     r"""
-    Given a 2-D matrix or batches of 2-D matrices, returns the upper or lower triangular part of the tensor(s).
-    The attribute "upper" determines whether the upper or lower part is retained. If set to true,
-    the upper triangular matrix is retained. Lower triangular matrix is retained otherwise.
-    Default value for the "upper" attribute is true.
-    Trilu takes one input tensor of shape [*, N, M], where * is zero or more batch dimensions. The upper triangular part consists
-    of the elements on and above the given diagonal (k). The lower triangular part consists of elements on and below the diagonal.
-    All other elements in the matrix are set to zero.
-    If k = 0, the triangular part on and above/below the main diagonal is retained.
-    If upper is set to true, a positive k retains the upper triangular matrix excluding the main diagonal and (k-1) diagonals above it.
-    A negative k value retains the main diagonal and |k| diagonals below it.
-    If upper is set to false, a positive k retains the lower triangular matrix including the main diagonal and k diagonals above it.
-    A negative k value excludes the main diagonal and (|k|-1) diagonals below it.
+    Given a 2-D matrix or batches of 2-D matrices, returns the upper or
+    lower triangular part of the tensor(s). The attribute "upper" determines
+    whether the upper or lower part is retained. If set to true, the upper
+    triangular matrix is retained. Lower triangular matrix is retained
+    otherwise. Default value for the "upper" attribute is true. Trilu takes
+    one input tensor of shape [\*, N, M], where \* is zero or more batch
+    dimensions. The upper triangular part consists of the elements on and
+    above the given diagonal (k). The lower triangular part consists of
+    elements on and below the diagonal. All other elements in the matrix are
+    set to zero. If k = 0, the triangular part on and above/below the main
+    diagonal is retained. If upper is set to true, a positive k retains the
+    upper triangular matrix excluding the main diagonal and (k-1) diagonals
+    above it. A negative k value retains the main diagonal and \|k\|
+    diagonals below it. If upper is set to false, a positive k retains the
+    lower triangular matrix including the main diagonal and k diagonals
+    above it. A negative k value excludes the main diagonal and (\|k\|-1)
+    diagonals below it.
 
     Parameters
     ==========
@@ -13273,10 +14937,13 @@ def trilu(
         Input tensor of rank 2 or higher.
     k
         Type tensor(int64).
-        A 0-D tensor containing a single value corresponding to the number diagonals above or below the main diagonal to exclude or include. Default value is 0 if it's not specified.
+        A 0-D tensor containing a single value corresponding to the number
+        diagonals above or below the main diagonal to exclude or include.
+        Default value is 0 if it's not specified.
     upper
         Attribute.
-        Boolean. Indicates whether upper or lower part of matrix is retained. Default is true.
+        Boolean. Indicates whether upper or lower part of matrix is retained.
+        Default is true.
 
     Returns
     =======
@@ -13309,66 +14976,62 @@ def unique(
     sorted: int = 1,
 ) -> Tuple[Var, Var, Var, Var]:
     r"""
-    Find the unique elements of a tensor. When an optional attribute 'axis' is provided, unique subtensors sliced along the 'axis' are returned.
-    Otherwise the input tensor is flattened and unique values of the flattened tensor are returned.
-    This operator returns the unique values or sliced unique subtensors of the input tensor and three optional outputs.
-    The first output tensor 'Y' contains all unique values or subtensors of the input.
-    The second optional output tensor 'indices' contains indices of 'Y' elements' first occurance in 'X'..
-    The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'. ".
-    The fourth optional output tensor 'counts' contains the count of each element of 'Y' in the input.
-    Outputs are either sorted in ascending order or optionally in the order of the first occurrence of the values in the input.
+    Find the unique elements of a tensor. When an optional attribute 'axis'
+    is provided, unique subtensors sliced along the 'axis' are returned.
+    Otherwise the input tensor is flattened and unique values of the
+    flattened tensor are returned.
+
+    This operator returns the unique values or sliced unique subtensors of
+    the input tensor and three optional outputs. The first output tensor 'Y'
+    contains all unique values or subtensors of the input. The second
+    optional output tensor 'indices' contains indices of 'Y' elements' first
+    occurrence in 'X'. The third optional output tensor 'inverse_indices'
+    contains, for elements of 'X', its corresponding indices in 'Y'. The
+    fourth optional output tensor 'counts' contains the count of each
+    element of 'Y' in the input.
+
+    Outputs are either sorted in ascending order or optionally in the order
+    of the first occurrence of the values in the input.
+
     https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
-    Example 1:
-      input_X = [2, 1, 1, 3, 4, 3]
-      attribute_sorted = 0
-      attribute_axis = None
-      output_Y = [2, 1, 3, 4]
-      output_indices = [0, 1, 3, 4]
-      output_inverse_indices = [0, 1, 1, 2, 3, 2]
-      output_counts = [1, 2, 2, 1]
-    Example 2:
-      input_X = [[1, 3], [2, 3]]
-      attribute_sorted = 1
-      attribute_axis = None
-      output_Y = [1, 2, 3]
-      output_indices = [0, 2, 1]
-      output_inverse_indices = [0, 2, 1, 2]
-      output_counts = [1, 1, 2]
-    Example 3:
-      input_X = [[1, 0, 0], [1, 0, 0], [2, 3, 4]]
-      attribute_sorted = 1
-      attribute_axis = 0
-      output_Y = [[1, 0, 0], [2, 3, 4]]
-      output_indices = [0, 2]
-      output_inverse_indices = [0, 0, 1]
-      output_counts = [2, 1]
-    Example 4:
-      input_x = [[[1., 1.], [0., 1.], [2., 1.], [0., 1.]],
-                 [[1., 1.], [0., 1.], [2., 1.], [0., 1.]]]
-      attribute_sorted = 1
-      attribute_axis = 1
-      intermediate data are presented below for better understanding:
-      there are 4 subtensors sliced along axis 1 of input_x (shape = (2, 4, 2)):
-      A: [[1, 1], [1, 1]],
-         [[0, 1], [0, 1]],
-         [[2, 1], [2, 1]],
-         [[0, 1], [0, 1]].
-      there are 3 unique subtensors:
-      [[1, 1], [1, 1]],
-      [[0, 1], [0, 1]],
-      [[2, 1], [2, 1]].
-      sorted unique subtensors:
-      B: [[0, 1], [0, 1]],
-         [[1, 1], [1, 1]],
-         [[2, 1], [2, 1]].
-      output_Y is constructed from B:
-      [[[0. 1.], [1. 1.], [2. 1.]],
-       [[0. 1.], [1. 1.], [2. 1.]]]
-      output_indices is to map from B to A:
-      [1, 0, 2]
-      output_inverse_indices is to map from A to B:
-      [1, 0, 2, 0]
-      output_counts = [2 1 1]
+
+    Example 1: input_X = [2, 1, 1, 3, 4, 3] attribute_sorted = 0
+    attribute_axis = None output_Y = [2, 1, 3, 4] output_indices = [0, 1, 3,
+    4] output_inverse_indices = [0, 1, 1, 2, 3, 2] output_counts = [1, 2, 2,
+    1]
+
+    Example 2: input_X = [[1, 3], [2, 3]] attribute_sorted = 1
+    attribute_axis = None output_Y = [1, 2, 3] output_indices = [0, 2, 1]
+    output_inverse_indices = [0, 2, 1, 2] output_counts = [1, 1, 2]
+
+    Example 3: input_X = [[1, 0, 0], [1, 0, 0], [2, 3, 4]] attribute_sorted
+    = 1 attribute_axis = 0 output_Y = [[1, 0, 0], [2, 3, 4]] output_indices
+    = [0, 2] output_inverse_indices = [0, 0, 1] output_counts = [2, 1]
+
+    Example 4: input_x = [[[1., 1.], [0., 1.], [2., 1.], [0., 1.]], [[1.,
+    1.], [0., 1.], [2., 1.], [0., 1.]]] attribute_sorted = 1 attribute_axis
+    = 1
+
+    intermediate data are presented below for better understanding:
+
+    there are 4 subtensors sliced along axis 1 of input_x (shape = (2, 4,
+    2)): A: [[1, 1], [1, 1]], [[0, 1], [0, 1]], [[2, 1], [2, 1]], [[0, 1],
+    [0, 1]].
+
+    there are 3 unique subtensors: [[1, 1], [1, 1]], [[0, 1], [0, 1]], [[2,
+    1], [2, 1]].
+
+    sorted unique subtensors: B: [[0, 1], [0, 1]], [[1, 1], [1, 1]], [[2,
+    1], [2, 1]].
+
+    output_Y is constructed from B: [[[0. 1.], [1. 1.], [2. 1.]], [[0. 1.],
+    [1. 1.], [2. 1.]]]
+
+    output_indices is to map from B to A: [1, 0, 2]
+
+    output_inverse_indices is to map from A to B: [1, 0, 2, 0]
+
+    output_counts = [2 1 1]
 
     Parameters
     ==========
@@ -13377,25 +15040,38 @@ def unique(
         A N-D input tensor that is to be processed.
     axis
         Attribute.
-        (Optional) The dimension to apply unique. If not specified, the unique elements of the flattened input are returned. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+        (Optional) The dimension to apply unique. If not specified, the unique
+        elements of the flattened input are returned. Negative value means
+        counting dimensions from the back. Accepted range is [-r, r-1] where r =
+        rank(input).
     sorted
         Attribute.
-        (Optional) Whether to sort the unique elements in ascending order before returning as output. Must be one of 0, or 1 (default).
+        (Optional) Whether to sort the unique elements in ascending order before
+        returning as output. Must be one of 0, or 1 (default).
 
     Returns
     =======
     Y : Var
         Type T.
-        A tensor of the same type as 'X' containing all the unique values or subtensors sliced along a provided 'axis' in 'X', either sorted or maintained in the same order they occur in input 'X'
+        A tensor of the same type as 'X' containing all the unique values or
+        subtensors sliced along a provided 'axis' in 'X', either sorted or
+        maintained in the same order they occur in input 'X'
     indices : Var
         Type tensor(int64).
-        A 1-D INT64 tensor containing indices of 'Y' elements' first occurance in 'X'. When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. When 'axis' is not provided, it contains indices to values in the flattened input tensor.
+        A 1-D INT64 tensor containing indices of 'Y' elements' first occurrence
+        in 'X'. When 'axis' is provided, it contains indices to subtensors in
+        input 'X' on the 'axis'. When 'axis' is not provided, it contains
+        indices to values in the flattened input tensor.
     inverse_indices : Var
         Type tensor(int64).
-        A 1-D INT64 tensor containing, for elements of 'X', its corresponding indices in 'Y'. When 'axis' is provided, it contains indices to subtensors in output 'Y' on the 'axis'. When 'axis' is not provided, it contains indices to values in output 'Y'.
+        A 1-D INT64 tensor containing, for elements of 'X', its corresponding
+        indices in 'Y'. When 'axis' is provided, it contains indices to
+        subtensors in output 'Y' on the 'axis'. When 'axis' is not provided, it
+        contains indices to values in output 'Y'.
     counts : Var
         Type tensor(int64).
-        A 1-D INT64 tensor containing the count of each element of 'Y' in input 'X'
+        A 1-D INT64 tensor containing the count of each element of 'Y' in input
+        'X'
 
     Notes
     =====
@@ -13420,15 +15096,21 @@ def unsqueeze(
     axes: Var,
 ) -> Var:
     r"""
-    Insert single-dimensional entries to the shape of an input tensor (`data`).
-    Takes one required input `axes` - which contains a list of dimension indices and this operator will insert a dimension of value `1` into the corresponding index of the output tensor (`expanded`).
-    For example:
-      Given an input tensor (`data`) of shape [3, 4, 5], then
-      Unsqueeze(data, axes=[0, 4]) outputs a tensor (`expanded`) containing same data as `data` but with shape [1, 3, 4, 5, 1].
-    The input `axes` should not contain any duplicate entries. It is an error if it contains duplicates.
-    The rank of the output tensor (`output_rank`) is the rank of the input tensor (`data`) plus the number of values in `axes`.
-    Each value in `axes` should be within the (inclusive) range [-output_rank , output_rank - 1].
-    The order of values in `axes` does not matter and can come in any order.
+    Insert single-dimensional entries to the shape of an input tensor
+    (``data``). Takes one required input ``axes`` - which contains a list of
+    dimension indices and this operator will insert a dimension of value
+    ``1`` into the corresponding index of the output tensor (``expanded``).
+
+    For example: Given an input tensor (``data``) of shape [3, 4, 5], then
+    Unsqueeze(data, axes=[0, 4]) outputs a tensor (``expanded``) containing
+    same data as ``data`` but with shape [1, 3, 4, 5, 1].
+
+    The input ``axes`` should not contain any duplicate entries. It is an
+    error if it contains duplicates. The rank of the output tensor
+    (``output_rank``) is the rank of the input tensor (``data``) plus the
+    number of values in ``axes``. Each value in ``axes`` should be within
+    the (inclusive) range [-output_rank , output_rank - 1]. The order of
+    values in ``axes`` does not matter and can come in any order.
 
     Parameters
     ==========
@@ -13437,7 +15119,9 @@ def unsqueeze(
         Original tensor
     axes
         Type tensor(int64).
-        List of integers indicating the dimensions to be inserted. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(expanded).
+        List of integers indicating the dimensions to be inserted. Negative
+        value means counting dimensions from the back. Accepted range is [-r,
+        r-1] where r = rank(expanded).
 
     Returns
     =======
@@ -13468,9 +15152,8 @@ def upsample(
     mode: str = "nearest",
 ) -> Var:
     r"""
-    Upsample the input tensor.
-    Each dimension value of the output tensor is:
-      output_dimension = floor(input_dimension * scale).
+    Upsample the input tensor. Each dimension value of the output tensor is:
+    output_dimension = floor(input_dimension \* scale).
 
     Parameters
     ==========
@@ -13479,10 +15162,13 @@ def upsample(
         N-D tensor
     scales
         Type tensor(float).
-        The scale array along each dimension. It takes value greater than or equal to 1. The number of elements of 'scales' should be the same as the rank of input 'X'.
+        The scale array along each dimension. It takes value greater than or
+        equal to 1. The number of elements of 'scales' should be the same as the
+        rank of input 'X'.
     mode
         Attribute.
-        Two interpolation modes: nearest (default), and linear (including bilinear, trilinear, etc)
+        Two interpolation modes: nearest (default), and linear (including
+        bilinear, trilinear, etc)
 
     Returns
     =======
@@ -13514,13 +15200,19 @@ def where(
     Y: Var,
 ) -> Var:
     r"""
-    Return elements, either from X or Y, depending on condition.
-    Where behaves like
-    numpy.where (https://docs.scipy.org/doc/numpy/reference/generated/numpy.where.html)
+    Return elements, either from X or Y, depending on condition. Where
+    behaves like
+    `numpy.where <https://docs.scipy.org/doc/numpy/reference/generated/numpy.where.html>`__
     with three parameters.
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
+
     **History**
-    - Version 16 adds bfloat16 to the types allowed (for the second and third parameter).
+
+    -  Version 16 adds bfloat16 to the types allowed (for the second and
+       third parameter).
 
     Parameters
     ==========
@@ -13563,9 +15255,13 @@ def xor(
     B: Var,
 ) -> Var:
     r"""
-    Returns the tensor resulted from performing the `xor` logical operation
-    elementwise on the input tensors `A` and `B` (with Numpy-style broadcasting support).
-    This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check the doc (Broadcasting.md).
+    Returns the tensor resulted from performing the ``xor`` logical
+    operation elementwise on the input tensors ``A`` and ``B`` (with
+    Numpy-style broadcasting support).
+
+    This operator supports **multidirectional (i.e., Numpy-style)
+    broadcasting**; for more details please check `the
+    doc <https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md>`__.
 
     Parameters
     ==========
diff --git a/src/templates/constructor.jinja2 b/src/templates/constructor.jinja2
index 62f26e4c..63bf2cb8 100644
--- a/src/templates/constructor.jinja2
+++ b/src/templates/constructor.jinja2
@@ -20,11 +20,11 @@ endfor %}
     %}{{ attr.name }}_input_types: Iterable[Type], {% endif %}
 {% endfor %}
 ) -> {{ get_constructor_return(schema) }}:
-{% filter indent(width=4) %}
 {% if gen_docstrings %}
     r"""
 {%+ include "docstring.jinja2" %}
     """
 {% endif %}
-{%+ include "construct.jinja2" %}
+{% filter indent(width=4) %}
+    {%+ include "construct.jinja2" %}
 {% endfilter %}