diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 340b8e607e3..bf0211e8e26 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -227,6 +227,7 @@ jobs: - name: Free space uses: jlumbroso/free-disk-space@main + continue-on-error: true with: tool-cache: true android: true @@ -305,6 +306,7 @@ jobs: steps: - name: Free space uses: jlumbroso/free-disk-space@main + continue-on-error: true with: tool-cache: true android: true @@ -335,6 +337,7 @@ jobs: steps: - name: Free space uses: jlumbroso/free-disk-space@main + continue-on-error: true with: tool-cache: true android: true @@ -375,6 +378,7 @@ jobs: steps: - name: Free space uses: jlumbroso/free-disk-space@main + continue-on-error: true with: tool-cache: true android: true @@ -493,6 +497,7 @@ jobs: steps: - name: Free space uses: jlumbroso/free-disk-space@main + continue-on-error: true with: tool-cache: true android: true diff --git a/.gitignore b/.gitignore index 92697fb2dd2..57ae0b01788 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,6 @@ docs/html cmake-build*/ build*/ +# Recommended location to install rbuild dependencies from README.md +depend + diff --git a/rbuild.ini b/rbuild.ini index 4c0973f265f..3eb2fef6247 100755 --- a/rbuild.ini +++ b/rbuild.ini @@ -29,3 +29,12 @@ define = CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache MIGRAPHX_ENABLE_CPU=On BUILD_DEV=On + +[cibuild] +cxx = ${rocm_path}/llvm/bin/clang++ +cc = ${rocm_path}/llvm/bin/clang +deps = + -f dev-requirements.txt +define = + CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache + CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f22d89aa04f..a00242f4de8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -197,6 +197,7 @@ register_migraphx_ops( reduce_sum relu reshape + reshape_lazy reverse rnn rnn_last_cell_output diff --git a/src/auto_contiguous.cpp b/src/auto_contiguous.cpp index 39ccc4c184e..8ccb8739776 100644 --- a/src/auto_contiguous.cpp +++ b/src/auto_contiguous.cpp @@ -25,7 +25,6 @@ #include #include #include - #include namespace migraphx { diff --git a/src/include/migraphx/instruction.hpp b/src/include/migraphx/instruction.hpp index d9fe7794aab..377804e7b75 100644 --- a/src/include/migraphx/instruction.hpp +++ b/src/include/migraphx/instruction.hpp @@ -81,6 +81,7 @@ struct MIGRAPHX_EXPORT instruction const std::vector& module_inputs() const; + /// Where this instruction is used as an input to another instruction const std::vector& outputs() const; friend bool operator==(const instruction& x, const instruction& y); diff --git a/src/include/migraphx/op/reshape.hpp b/src/include/migraphx/op/reshape.hpp index 2ec0bcec9f1..90843c3ecb1 100644 --- a/src/include/migraphx/op/reshape.hpp +++ b/src/include/migraphx/op/reshape.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,8 @@ #include #include #include -#include + +#include namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { @@ -45,8 +46,6 @@ struct reshape return pack(f(self.dims, "dims")); } - value attributes() const { return {{"require_std_shape", true}}; } - std::string name() const { return "reshape"; } shape dyn_compute_shape(shape s0) const @@ -110,27 +109,9 @@ struct reshape return it; } - template - static auto can_strides_merge(DimIterator dim_start, - DimIterator dim_last, - StrideIterator stride_start, - StrideIterator stride_last) - { - assert(std::distance(dim_start, dim_last) == std::distance(stride_start, stride_last)); - auto cstride = *std::prev(stride_last); - return std::equal(std::make_reverse_iterator(dim_last), - std::make_reverse_iterator(dim_start + 1), - std::make_reverse_iterator(stride_last - 1), - std::make_reverse_iterator(stride_start), - [&](auto dim, auto stride) { - cstride *= dim; - return stride == cstride; - }); - } - - // This will reshape the dimesions of the input shape to use the lens of - `rdims`. If this can't be done without changing memory layout then it - will return nullopt + // This will reshape the dimensions of the input shape to the lens of + // `rdims`. Unlike reshape_lazy, this operator may change the memory layout by + // copying, so inputs that the aliasing logic would reject with nullopt are + // still handled here static optional reshape_dims(const shape& input, const std::vector& rdims) { if(input.standard()) @@ -155,13 +136,8 @@ struct reshape { auto start = idims.begin() + i; auto it = compute_end_dim(start, idims.end(), rdim); - if(it == start) - return nullopt; auto n = it - start; assert((i + n) <= istrides.size()); - if(not can_strides_merge( - start, it + 1, istrides.begin() + i, istrides.begin() + i + n + 1)) - return nullopt; i += n; rstrides.push_back(istrides[i]); } @@ -170,8 +146,7 @@ struct reshape { auto start = rdims.begin() + i; auto it = compute_end_dim(start, rdims.end(), idim); - if(it == start) - return nullopt; + auto n = it - start; assert((r + n) <= rdims.size()); auto stride = istrides[i] * idim; @@ -191,15 +166,11 @@ struct reshape auto stride = rstrides.back(); for(auto d : range(rdims.begin() + rstrides.size(), rdims.end())) { - if(d != 1) - return nullopt; + (void)d; rstrides.push_back(stride); } } - if(rdims.size() != rstrides.size()) - return nullopt; - return shape{input.type(), rdims, rstrides}; } @@ -233,25 +204,24 @@ struct reshape } auto s = reshape_dims(inputs.front(), rdims); - if(not s.has_value()) - MIGRAPHX_THROW("Reshape on axis that is not packed."); if(s->elements() != inputs.front().elements()) - MIGRAPHX_THROW("Reshape: Wrong number of elements for reshape: reshape has " + + MIGRAPHX_THROW("reshape: Wrong number of elements for reshape: reshape has " + std::to_string(s->elements()) + " elements whereas the input has " + std::to_string(inputs.front().elements())); - assert(s->bytes() == inputs.front().bytes()); return *s; } shape compute_shape(std::vector inputs) const { check_shapes{inputs, *this, true}.has(1); + auto n_neg_dims = std::count(dims.begin(), dims.end(), -1); if(n_neg_dims > 1) - MIGRAPHX_THROW("Reshape: Dimensions for reshape can only have one -1 dim"); - auto s0 = inputs[0]; + MIGRAPHX_THROW("reshape: Dimensions for reshape can only have one -1 dim"); + + auto s0 = inputs.front(); if(s0.dynamic()) { return 
dyn_compute_shape(s0); @@ -264,10 +234,14 @@ struct reshape argument compute(const dyn_output& dyn_out, std::vector args) const { - return args[0].reshape(dyn_out.computed_shape); - } + assert(dyn_out.computed_shape.standard()); + argument result{dyn_out.computed_shape}; - std::ptrdiff_t output_alias(const std::vector&) const { return 0; } + visit_all(result, args[0])([&](auto output, auto input) { + std::copy(input.begin(), input.end(), output.begin()); + }); + return result; + } }; } // namespace op diff --git a/src/include/migraphx/op/reshape_lazy.hpp b/src/include/migraphx/op/reshape_lazy.hpp new file mode 100644 index 00000000000..7263e8006d4 --- /dev/null +++ b/src/include/migraphx/op/reshape_lazy.hpp @@ -0,0 +1,279 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_OPERATORS_RESHAPE_LAZY_HPP +#define MIGRAPHX_GUARD_OPERATORS_RESHAPE_LAZY_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace op { + +struct reshape_lazy +{ + std::vector dims; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.dims, "dims")); + } + + value attributes() const { return {{"require_std_shape", true}}; } + + std::string name() const { return "reshape_lazy"; } + + shape dyn_compute_shape(shape s0) const + { + auto dyn_dims = s0.dyn_dims(); + auto num_not_fixed = std::count_if( + dyn_dims.cbegin(), dyn_dims.cend(), [](auto dd) { return not dd.is_fixed(); }); + if(num_not_fixed != 1) + { + MIGRAPHX_THROW("reshape_lazy: Only supports one non-fixed dynamic_dimension"); + } + // track number of fixed elements in input and output + std::size_t num_dims_ele = 1; + std::size_t num_dd_ele = 1; + for(std::size_t i = 0; i < dyn_dims.size(); ++i) + { + if(dyn_dims[i].is_fixed()) + { + num_dims_ele *= dims[i]; + num_dd_ele *= dyn_dims[i].min; + } + else + { + if(dims[i] != 0 and dims[i] != -1) + { + MIGRAPHX_THROW( + "reshape_lazy: Non-fixed dynamic_dimension doesn't match with 0 or -1 " + "output dimension"); + } + } + } + if(num_dims_ele != num_dd_ele) + { + MIGRAPHX_THROW("reshape_lazy: Number of fixed elements must match. 
Input: " + + std::to_string(num_dd_ele) + " Output: " + std::to_string(num_dims_ele)); + } + // construct output dynamic shape from dims attribute + std::vector output_dyn_dims(dims.size()); + std::transform(dims.cbegin(), + dims.cend(), + dyn_dims.cbegin(), + output_dyn_dims.begin(), + [](std::size_t dim, auto dyn_dim) { + if(not dyn_dim.is_fixed()) + return dyn_dim; + return shape::dynamic_dimension{dim, dim}; + }); + return {s0.type(), output_dyn_dims}; + } + + template + static auto compute_end_dim(Iterator start, Iterator last, std::size_t dim) + { + std::size_t x = 1; + auto it = std::find_if(start, last, [&](auto i) { + x *= i; + return x >= dim; + }); + if(x != dim) + return start; + return it; + } + + template + static auto can_strides_merge(DimIterator dim_start, + DimIterator dim_last, + StrideIterator stride_start, + StrideIterator stride_last) + { + assert(std::distance(dim_start, dim_last) == std::distance(stride_start, stride_last)); + auto cstride = *std::prev(stride_last); + return std::equal(std::make_reverse_iterator(dim_last), + std::make_reverse_iterator(dim_start + 1), + std::make_reverse_iterator(stride_last - 1), + std::make_reverse_iterator(stride_start), + [&](auto dim, auto stride) { + cstride *= dim; + return stride == cstride; + }); + } + + // This will attempt to alias the dimensions of the input shape to the lens of + // `rdims`. If this can't be done without changing memory layout then it + // will return nullopt + static optional reshape_lazy_dims(const shape& input, + const std::vector& rdims) + { + if(input.standard()) + return shape{input.type(), rdims}; + + const auto& idims = input.lens(); + const auto& istrides = input.strides(); + + std::vector rstrides; + std::size_t i = 0; + std::size_t r = 0; + while(i < idims.size() and r < rdims.size()) + { + auto idim = idims[i]; + auto rdim = rdims[r]; + if(rdim == idim) + { + rstrides.push_back(istrides[i]); + } + // squeeze + else if(rdim > idim) + { + auto start = idims.begin() + i; + auto it = compute_end_dim(start, idims.end(), rdim); + if(it == start) + return nullopt; + auto n = it - start; + assert((i + n) <= istrides.size()); + if(not can_strides_merge( + start, it + 1, istrides.begin() + i, istrides.begin() + i + n + 1)) + return nullopt; + i += n; + rstrides.push_back(istrides[i]); + } + // unsqueeze + else // if(rdim < idim) + { + auto start = rdims.begin() + i; + auto it = compute_end_dim(start, rdims.end(), idim); + if(it == start) + return nullopt; + auto n = it - start; + assert((r + n) <= rdims.size()); + auto stride = istrides[i] * idim; + std::for_each(start, it + 1, [&](auto dim) { + stride /= dim; + rstrides.push_back(stride); + }); + r += n; + } + i++; + r++; + } + + // Handle trailing 1s + if(rstrides.size() < rdims.size() and not rstrides.empty()) + { + auto stride = rstrides.back(); + for(auto d : range(rdims.begin() + rstrides.size(), rdims.end())) + { + if(d != 1) + return nullopt; + rstrides.push_back(stride); + } + } + + if(rdims.size() != rstrides.size()) + return nullopt; + + return shape{input.type(), rdims, rstrides}; + } + + shape static_compute_shape(std::vector inputs, std::size_t n_neg_dims) const + { + check_shapes{inputs, *this}.has(1); + auto&& idims = inputs.front().lens(); + std::vector rdims(dims.begin(), dims.end()); + + for(std::size_t i = 0; i < dims.size(); i++) + { + if(dims[i] == 0) + rdims[i] = idims[i]; + + // since rdims using size_t type, -1 is the max value + // is size_t that cause later compuation incorrect + if(dims[i] == -1) + rdims[i] = 1; + } + + 
if(n_neg_dims > 0) + { + size_t missing_dim = + inputs.front().elements() / + std::accumulate(rdims.begin(), rdims.end(), 1, std::multiplies()); + for(std::size_t i = 0; i < rdims.size(); i++) + { + if(dims[i] == -1) + rdims[i] = missing_dim; + } + } + + auto s = reshape_lazy_dims(inputs.front(), rdims); + if(not s.has_value()) + MIGRAPHX_THROW("reshape_lazy on axis that is not packed."); + + if(s->elements() != inputs.front().elements()) + MIGRAPHX_THROW( + "reshape_lazy: Wrong number of elements for reshape_lazy: reshape_lazy has " + + std::to_string(s->elements()) + " elements whereas the input has " + + std::to_string(inputs.front().elements())); + + assert(s->bytes() == inputs.front().bytes()); + return *s; + } + + shape compute_shape(std::vector inputs) const + { + check_shapes{inputs, *this, true}.has(1); + auto n_neg_dims = std::count(dims.begin(), dims.end(), -1); + if(n_neg_dims > 1) + MIGRAPHX_THROW("reshape_lazy: Dimensions for reshape_lazy can only have one -1 dim"); + auto s0 = inputs[0]; + if(s0.dynamic()) + { + return dyn_compute_shape(s0); + } + else + { + return static_compute_shape(inputs, n_neg_dims); + } + } + + argument compute(const dyn_output& dyn_out, std::vector args) const + { + return args[0].reshape(dyn_out.computed_shape); + } + + std::ptrdiff_t output_alias(const std::vector&) const { return 0; } +}; + +} // namespace op +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/src/rewrite_pooling.cpp b/src/rewrite_pooling.cpp index d4b2fa8a2f4..b381a8a73f9 100644 --- a/src/rewrite_pooling.cpp +++ b/src/rewrite_pooling.cpp @@ -43,9 +43,7 @@ void rewrite_pooling::apply(module& m) const continue; if(ins->inputs().empty()) continue; - auto&& s = ins->inputs().front()->get_shape(); - if(not s.standard()) - continue; + auto&& s = ins->inputs().front()->get_shape(); auto&& op = any_cast(ins->get_operator()); if(not std::all_of(op.padding.begin(), op.padding.end(), [](auto i) { return i == 0; })) continue; @@ -54,27 +52,18 @@ void rewrite_pooling::apply(module& m) const auto lens = s.lens(); if(not std::equal(lens.begin() + 2, lens.end(), op.lengths.begin(), op.lengths.end())) continue; - std::int64_t n = s.lens()[0]; - std::int64_t c = s.lens()[1]; - auto reshape = m.insert_instruction( - ins, make_op("reshape", {{"dims", {n * c, -1}}}), ins->inputs().front()); - instruction_ref pooling{}; - + std::vector axes(lens.size() - 2); + std::iota(axes.begin(), axes.end(), 2); // average pooling if(op.mode == op::pooling_mode::average) { - pooling = m.insert_instruction(ins, make_op("reduce_mean", {{"axes", {1}}}), reshape); + m.replace_instruction(ins, make_op("reduce_mean", {{"axes", axes}}), ins->inputs()); } // max pooling else { - pooling = m.insert_instruction(ins, make_op("reduce_max", {{"axes", {1}}}), reshape); + m.replace_instruction(ins, make_op("reduce_max", {{"axes", axes}}), ins->inputs()); } - - std::vector rsp_lens(lens.size(), 1); - rsp_lens[0] = n; - rsp_lens[1] = c; - m.replace_instruction(ins, make_op("reshape", {{"dims", rsp_lens}}), pooling); } } diff --git a/src/simplify_reshapes.cpp b/src/simplify_reshapes.cpp index 08ea498e720..cd27b1157f2 100644 --- a/src/simplify_reshapes.cpp +++ b/src/simplify_reshapes.cpp @@ -122,6 +122,11 @@ struct find_nop_reshapes reshapes.insert("pad"); reshapes.insert("slice"); reshapes.insert("transpose"); + reshapes.insert("reduce_mean"); + reshapes.insert("reduce_max"); + reshapes.insert("reduce_min"); + reshapes.insert("reduce_sum"); + reshapes.insert("reduce_prod"); return 
match::name(reshapes)(match::same_shape(match::arg(0))); } diff --git a/src/targets/gpu/fuse_mlir.cpp b/src/targets/gpu/fuse_mlir.cpp index f2f2ccc8015..e40b31ddd1d 100644 --- a/src/targets/gpu/fuse_mlir.cpp +++ b/src/targets/gpu/fuse_mlir.cpp @@ -327,12 +327,12 @@ struct find_mlir_standalone_op struct find_mlir_standalone_convolution_op : find_mlir_standalone_op { - auto matcher() const { return match::name("convolution"); } + auto matcher() const { return is_mlir_conv; } }; struct find_mlir_standalone_dot_op : find_mlir_standalone_op { - auto matcher() const { return match::name("dot"); } + auto matcher() const { return match::any_of(match::name("dot"), match::name("quant_dot")); } }; /** @@ -365,7 +365,7 @@ bool is_enabled(std::string_view op_name, context* ctx) { return true; } - else if(op_name == "convolution") + else if(op_name == "convolution" or op_name == "quant_convolution") { if(ctx == nullptr) { diff --git a/src/targets/gpu/fuse_ops.cpp b/src/targets/gpu/fuse_ops.cpp index d0155fa7707..21ea9498c7c 100644 --- a/src/targets/gpu/fuse_ops.cpp +++ b/src/targets/gpu/fuse_ops.cpp @@ -790,22 +790,26 @@ struct find_layernorm_pointwise { auto matcher() const { - return precompile_name("pointwise")(match::arg(0)( + return precompile_name("pointwise")(match::any_of[match::inputs()]( precompile_name("gpu::prelayernorm", "gpu::preadd_layernorm").bind("layernorm"))); } void apply(module& m, const match::matcher_result& r) const { - auto ins = r.result; + auto pw_ins = r.result; auto layernorm = r.instructions["layernorm"]; if(not layernorm->module_inputs().empty()) return; - auto* pm = ins->module_inputs().front(); + auto* pm = pw_ins->module_inputs().front(); + auto pw_inputs = pw_ins->inputs(); + auto ln_pos = std::find(pw_inputs.begin(), pw_inputs.end(), layernorm); + assert(ln_pos != pw_inputs.end()); + pw_inputs.erase(ln_pos); auto inputs = layernorm->inputs(); inputs.pop_back(); - inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end()); + inputs.insert(inputs.end(), pw_inputs.begin(), pw_inputs.end()); - m.replace_instruction(ins, layernorm->get_operator(), inputs, {pm}); + m.replace_instruction(pw_ins, layernorm->get_operator(), inputs, {pm}); } }; diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 5a35464ec76..d0357f06c63 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -89,7 +90,6 @@ struct miopen_apply offload_copy = (mod == mpm->get_root_module()) ? pass->offload_copy : false; add_generic_op("contiguous"); - add_extend_op("argmax"); add_extend_op("argmin"); add_extend_op("logsoftmax"); @@ -115,6 +115,7 @@ struct miopen_apply add_neg_op(); add_nms_op(); add_select_module_op(); + add_reshape_lazy_op(); } void copy_params() const @@ -376,6 +377,32 @@ struct miopen_apply return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs()); }); } + + /** + * Adds reshape lazy to reshape ops that can be aliased instead of copied. 
+ * `gpu::contiguous` are added before and after the reshape; these contiguous + * instructions can be removed by the eliminate_contiguous pass. + */ + void add_reshape_lazy_op() + { + apply_map.emplace("reshape", [=](instruction_ref ins) { + std::vector before_contiguous_args = ins->inputs(); + auto before_alloc = insert_allocation(ins, std::prev(ins)->get_shape()); + before_contiguous_args.push_back(before_alloc); + auto before_contig = + mod->insert_instruction(ins, make_op("gpu::contiguous"), {before_contiguous_args}); + + auto new_lazy_reshape = mod->insert_instruction( + ins, + make_op("reshape_lazy", {{"dims", {ins->get_operator().to_value().at("dims")}}}), + before_contig); + + std::vector after_contiguous_args = {new_lazy_reshape}; + auto after_alloc = insert_allocation(new_lazy_reshape, new_lazy_reshape->get_shape()); + after_contiguous_args.push_back(after_alloc); + return mod->replace_instruction(ins, make_op("gpu::contiguous"), after_contiguous_args); + }); + } }; void lowering::apply(module_pass_manager& mpm) const diff --git a/src/targets/gpu/mlir.cpp b/src/targets/gpu/mlir.cpp index 94b3761d0b6..347f93333ea 100644 --- a/src/targets/gpu/mlir.cpp +++ b/src/targets/gpu/mlir.cpp @@ -24,6 +24,7 @@ #include "migraphx/make_op.hpp" #include #include +#include #ifdef MIGRAPHX_MLIR #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3 #warning "Incompatible version of rocMLIR library used, disabling" @@ -180,13 +182,85 @@ std::string mlir_print(F f, T x) return ss.str(); } +struct mlir_logger +{ + std::stringstream ss; + mlir_context* ctx; + std::optional id; + + mlir_logger() : ctx(nullptr), id(std::nullopt) {} + + mlir_logger(mlir_context* context) : ctx(context) + { + id = + mlirContextAttachDiagnosticHandler(ctx->get(), mlir_diagnostic_print_cb, this, nullptr); + } + + ~mlir_logger() + { + if(id.has_value()) + mlirContextDetachDiagnosticHandler(ctx->get(), *id); + } + + mlir_logger(const mlir_logger& other) = delete; + mlir_logger& operator=(const mlir_logger& other) = delete; + + mlir_logger(mlir_logger&& other) noexcept + : ss(std::move(other.ss)), ctx(other.ctx), id(other.id) + { + other.ctx = nullptr; + other.id = std::nullopt; + } + + mlir_logger& operator=(mlir_logger other) noexcept + { + std::swap(ss, other.ss); + std::swap(ctx, other.ctx); + std::swap(id, other.id); + return *this; + } + + std::string str() const { return ss.str(); } + + void clear() { ss = std::stringstream{}; } + + static MlirLogicalResult mlir_diagnostic_print_cb(MlirDiagnostic diag, void* logger); + + MlirLogicalResult handle(MlirDiagnostic diag); +}; + +MlirLogicalResult mlir_logger::mlir_diagnostic_print_cb(MlirDiagnostic diag, void* logger) +{ + return reinterpret_cast(logger)->handle(diag); +} + +MlirLogicalResult mlir_logger::handle(MlirDiagnostic diag) +{ + MlirDiagnosticSeverity sev = mlirDiagnosticGetSeverity(diag); + switch(sev) + { + case MlirDiagnosticSeverity::MlirDiagnosticError: ss << "Error: "; break; + case MlirDiagnosticSeverity::MlirDiagnosticWarning: ss << "Warning: "; break; + case MlirDiagnosticSeverity::MlirDiagnosticNote: ss << "Note: "; break; + case MlirDiagnosticSeverity::MlirDiagnosticRemark: ss << "Remark: "; break; + } + mlir_print(mlirDiagnosticPrint, diag, [&](auto s) { ss << s; }); + ss << std::endl; + for(intptr_t i = 0, e = mlirDiagnosticGetNumNotes(diag); i < e; ++i) + { + (void)handle(mlirDiagnosticGetNote(diag, i)); + } + return mlirLogicalResultSuccess(); 
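+ // Returning success marks the diagnostic as fully handled, so it is not
+ // forwarded to the default handler; the text buffered in `ss` is only
+ // surfaced later, when one of the pass-manager runs below fails and
+ // rethrows it through MIGRAPHX_THROW.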
+} + struct mlir_program { mlir_program() : ctx(mlirContextCreateWithRegistry(get_dialect_registry().get(), /*threadingEnable=*/false)), location(mlirLocationUnknownGet(ctx.get())), - mmodule(mlirModuleCreateEmpty(location)) + mmodule(mlirModuleCreateEmpty(location)), + logger(&ctx) { mlirContextSetThreadPool(ctx.get(), get_thread_pool().get()); mlirContextLoadAllAvailableDialects(ctx.get()); @@ -614,21 +688,49 @@ struct mlir_program } } - void run_high_level_pipeline() MIGRAPHX_TIDY_CONST + void run_high_level_pipeline() { mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())}; mlirMIGraphXAddHighLevelPipeline(pm_front.get()); - mlirPassManagerRunOnOp(pm_front.get(), mlirModuleGetOperation(mmodule.get())); + logger.clear(); + if(mlirLogicalResultIsFailure( + mlirPassManagerRunOnOp(pm_front.get(), mlirModuleGetOperation(mmodule.get())))) + { + std::string error = "Invalid MLIR created: " + logger.str(); + if(enabled(MIGRAPHX_TRACE_MLIR{})) + { + std::cout << error << std::endl; + } + MIGRAPHX_THROW(error); + } } - void run_backend_pipeline() MIGRAPHX_TIDY_CONST + void run_backend_pipeline() { mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())}; mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str()); - mlirPassManagerRunOnOp(pm_back.get(), mlirModuleGetOperation(mmodule.get())); + logger.clear(); + const size_t trace = value_of(MIGRAPHX_TRACE_MLIR{}); + static std::mutex mutex; + auto mod_op = mlirModuleGetOperation(mmodule.get()); + if(trace >= 2) + { + const std::lock_guard lock(mutex); + std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl; + } + + if(mlirLogicalResultIsFailure(mlirPassManagerRunOnOp(pm_back.get(), mod_op))) + { + std::string error = "MLIR backend compilation failed: " + logger.str(); + if(enabled(MIGRAPHX_TRACE_MLIR{})) + { + std::cout << error << std::endl; + } + MIGRAPHX_THROW(error); + } } - code_object_op compile(const value& solution) MIGRAPHX_TIDY_CONST + code_object_op compile(const value& solution) { // 1st pipeline to call run_high_level_pipeline(); @@ -682,7 +784,7 @@ struct mlir_program MIGRAPHX_THROW("Failed setting tuning key: " + *str); } - tuning_config get_tuning_config(bool exhaustive) MIGRAPHX_TIDY_CONST + tuning_config get_tuning_config(bool exhaustive) { tuning_config tc; run_high_level_pipeline(); @@ -702,7 +804,8 @@ struct mlir_program if(perf_key_bytes > perf_key.size()) MIGRAPHX_THROW("Tuning perf key was " + std::to_string(perf_key_bytes) + " bytes and thus too long"); - tc.solutions.emplace_back(perf_key.begin(), perf_key.begin() + perf_key_bytes); + tc.solutions.emplace_back( + std::string(perf_key.begin(), perf_key.begin() + perf_key_bytes)); } std::array tuning_key; size_t tuning_key_bytes = @@ -809,6 +912,7 @@ struct mlir_program mlir_context ctx; MlirLocation location; mlir_module mmodule; + mlir_logger logger; problem_params pp; std::deque strings{}; std::string target_arch = ""; diff --git a/test/auto_contiguous_test.cpp b/test/auto_contiguous_test.cpp index a70e2ec1d0e..b3f401dd9d1 100644 --- a/test/auto_contiguous_test.cpp +++ b/test/auto_contiguous_test.cpp @@ -158,6 +158,31 @@ TEST_CASE(two_transpose_gather) EXPECT(m1 == m2); } +TEST_CASE(standard_reshape_lazy) +{ + migraphx::module m1; + { + auto data = m1.add_parameter("2x2", {migraphx::shape::float_type, {2, 3, 4, 5}}); + auto add = m1.add_instruction(migraphx::make_op("add"), data, data); + auto r = + m1.add_instruction(migraphx::make_op("reshape_lazy", {{"dims", {2, 1, 12, 5}}}), add); + m1.add_return({r}); + } + run_pass(m1); + + 
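+ // The pass is expected to insert a `contiguous` in front of the
+ // `reshape_lazy` (m2 below): reshape_lazy still carries the
+ // require_std_shape attribute, whereas plain `reshape` (next test) no
+ // longer does and is left alone.
+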
migraphx::module m2; + { + auto data = m2.add_parameter("2x2", {migraphx::shape::float_type, {2, 3, 4, 5}}); + auto add = m2.add_instruction(migraphx::make_op("add"), data, data); + auto ca = m2.add_instruction(migraphx::make_op("contiguous"), add); + auto r = + m2.add_instruction(migraphx::make_op("reshape_lazy", {{"dims", {2, 1, 12, 5}}}), ca); + m2.add_return({r}); + } + + EXPECT(m1 == m2); +} + TEST_CASE(standard_reshape) { migraphx::module m1; @@ -173,8 +198,7 @@ TEST_CASE(standard_reshape) { auto data = m2.add_parameter("2x2", {migraphx::shape::float_type, {2, 3, 4, 5}}); auto add = m2.add_instruction(migraphx::make_op("add"), data, data); - auto ca = m2.add_instruction(migraphx::make_op("contiguous"), add); - auto r = m2.add_instruction(migraphx::make_op("reshape", {{"dims", {2, 1, 12, 5}}}), ca); + auto r = m2.add_instruction(migraphx::make_op("reshape", {{"dims", {2, 1, 12, 5}}}), add); m2.add_return({r}); } diff --git a/test/gpu/fuse_ops.cpp b/test/gpu/fuse_ops.cpp new file mode 100644 index 00000000000..42b3db25526 --- /dev/null +++ b/test/gpu/fuse_ops.cpp @@ -0,0 +1,107 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "make_precompile_op.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void run_pass(migraphx::program& p) +{ + migraphx::run_passes(p, {migraphx::gpu::fuse_ops{}, migraphx::dead_code_elimination{}}); +} + +TEST_CASE(layernorm_pointwise) +{ + migraphx::shape s{migraphx::shape::float_type, {2, 3, 4}}; + auto create_program = [=](bool first_arg_layernorm) { + migraphx::program p; + auto* mm = p.get_main_module(); + auto x = mm->add_parameter("x", s); + auto y = mm->add_parameter("y", s); + auto z = mm->add_parameter("z", s); + auto alloc = migraphx::make_op("allocate", {{"shape", to_value(s)}}); + auto alloc_ins = mm->add_instruction(alloc); + auto* pw_add1 = + create_pointwise_module(p, "main:pointwise0", {x, y}, single_pointwise("add")); + auto add1 = + mm->add_instruction(make_precompile_op("pointwise"), {x, y, alloc_ins}, {pw_add1}); + auto alloc_ins2 = mm->add_instruction(alloc); + auto layernorm_ins = + mm->add_instruction(make_precompile_op("gpu::prelayernorm"), add1, alloc_ins2); + std::vector pw_inputs = {layernorm_ins, z}; + if(not first_arg_layernorm) + { + pw_inputs = {z, layernorm_ins}; + } + auto* pw_add2 = + create_pointwise_module(p, "main:pointwise1", pw_inputs, single_pointwise("add")); + auto alloc_ins3 = mm->add_instruction(alloc); + pw_inputs.push_back(alloc_ins3); + auto add2 = mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add2}); + mm->add_return({add2}); + return p; + }; + + auto create_fused_program = [=]() { + migraphx::program p; + auto* mm = p.get_main_module(); + auto x = mm->add_parameter("x", s); + auto y = mm->add_parameter("y", s); + auto z = mm->add_parameter("z", s); + auto alloc = migraphx::make_op("allocate", {{"shape", to_value(s)}}); + auto alloc_ins = mm->add_instruction(alloc); + auto* pw_add1 = + create_pointwise_module(p, "main:pointwise0", {x, y}, single_pointwise("add")); + auto add1 = + mm->add_instruction(make_precompile_op("pointwise"), {x, y, alloc_ins}, {pw_add1}); + auto alloc_ins2 = mm->add_instruction(alloc); + auto* pw_add2 = + create_pointwise_module(p, "main:pointwise1", {x, z}, single_pointwise("add")); + auto layernorm_ins = mm->add_instruction( + make_precompile_op("gpu::prelayernorm"), {add1, z, alloc_ins2}, {pw_add2}); + mm->add_return({layernorm_ins}); + return p; + }; + + { + migraphx::program p1 = create_program(true); + run_pass(p1); + migraphx::program p2 = create_fused_program(); + EXPECT(p1 == p2); + } + { + migraphx::program p1 = create_program(false); + run_pass(p1); + migraphx::program p2 = create_fused_program(); + EXPECT(p1 == p2); + } +} + +int main(int argc, const char* argv[]) { test::run(argc, argv); } diff --git a/test/include/pointwise.hpp b/test/include/pointwise.hpp index 03892d60acd..bd5316939c0 100644 --- a/test/include/pointwise.hpp +++ b/test/include/pointwise.hpp @@ -24,16 +24,16 @@ #ifndef MIGRAPHX_GUARD_TEST_INCLUDE_POINTWISE_HPP #define MIGRAPHX_GUARD_TEST_INCLUDE_POINTWISE_HPP +#include #include #include #include template -migraphx::instruction_ref add_pointwise(migraphx::program& p, - migraphx::module_ref mm, - const std::string& name, - std::vector inputs, - F f) +migraphx::module_ref create_pointwise_module(migraphx::program& p, + const std::string& name, + std::vector inputs, + F f) { auto* pm = p.create_module(name); pm->set_bypass(); @@ -44,6 +44,17 @@ migraphx::instruction_ref add_pointwise(migraphx::program& p, }); auto r = f(pm, params); pm->add_return({r}); + return pm; +} + +template +migraphx::instruction_ref 
add_pointwise(migraphx::program& p, + migraphx::module_ref mm, + const std::string& name, + std::vector inputs, + F f) +{ + auto* pm = create_pointwise_module(p, name, inputs, f); return mm->add_instruction(migraphx::make_op("pointwise"), inputs, {pm}); } diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp index 0d649690665..ae0abd027e5 100644 --- a/test/op_shape_test.cpp +++ b/test/op_shape_test.cpp @@ -2524,13 +2524,21 @@ TEST_CASE(reshape_shape) migraphx::shape output{migraphx::shape::float_type, lens}; expect_shape(output, migraphx::make_op("reshape", {{"dims", new_shape}}), input); } +} +TEST_CASE(reshape_shape_invalid) +{ + migraphx::shape input{migraphx::shape::float_type, {24, 1, 1, 1}}; for(auto&& new_shape : std::vector>{{8, 3, 2, 2}, {1, 3, -1, -1}, {3, 0}, {3, 2}}) { throws_shape(migraphx::make_op("reshape", {{"dims", new_shape}}), input); } +} +TEST_CASE(reshape_shape_minus1_reshapes) +{ + migraphx::shape input{migraphx::shape::float_type, {24, 1, 1, 1}}; std::vector, migraphx::shape>> minus1_tests{ {{2, -1, 3}, {migraphx::shape::float_type, {2, 4, 3}}}, {{0, -1, 0}, {migraphx::shape::float_type, {24, 1, 1}}}, @@ -2654,11 +2662,11 @@ TEST_CASE(reshape_broadcast_squeeze) expect_shape(output, migraphx::make_op("reshape", {{"dims", output.lens()}}), input); } -TEST_CASE(reshape_broadcast_squeeze_error) +TEST_CASE(reshape_broadcast_squeeze_memlayout_change) { migraphx::shape input{migraphx::shape::float_type, {2, 16, 16, 1280}, {0, 0, 0, 1}}; - std::vector new_shape = {2, 16, 20480}; - throws_shape(migraphx::make_op("reshape", {{"dims", new_shape}}), input); + migraphx::shape output{migraphx::shape::float_type, {2, 16, 256, 80}, {0, 0, 0, 16}}; + expect_shape(output, migraphx::make_op("reshape", {{"dims", output.lens()}}), input); } TEST_CASE(reshape_dyn_shape) @@ -2706,6 +2714,199 @@ TEST_CASE(reshape_non_fixed_not_matching_error) throws_shape(migraphx::make_op("reshape", {{"dims", new_shape}}), input); } +TEST_CASE(reshape_lazy_shape) +{ + migraphx::shape input{migraphx::shape::float_type, {24, 1, 1, 1}}; + for(auto&& new_shape : + std::vector>{{8, 3, 1, 1}, {1, 3, 4, 2}, {1, 3, 4, 2}}) + { + std::vector lens(new_shape.size()); + std::copy(new_shape.begin(), new_shape.end(), lens.begin()); + migraphx::shape output{migraphx::shape::float_type, lens}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); + } + + for(auto&& new_shape : + std::vector>{{8, 3, 2, 2}, {1, 3, -1, -1}, {3, 0}, {3, 2}}) + { + throws_shape(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); + } + + std::vector, migraphx::shape>> minus1_tests{ + {{2, -1, 3}, {migraphx::shape::float_type, {2, 4, 3}}}, + {{0, -1, 0}, {migraphx::shape::float_type, {24, 1, 1}}}, + {{2, -1, 0}, {migraphx::shape::float_type, {2, 12, 1}}}, + {{0, 0, -1}, {migraphx::shape::float_type, {24, 1, 1}}}, + {{2, 0, -1}, {migraphx::shape::float_type, {2, 1, 12}}}, + {{-1, 2, 3}, {migraphx::shape::float_type, {4, 2, 3}}}, + {{-1, 0, 3}, {migraphx::shape::float_type, {8, 1, 3}}}, + {{-1, 0, 0}, {migraphx::shape::float_type, {24, 1, 1}}}, + {{-1, 3, 0}, {migraphx::shape::float_type, {8, 3, 1}}}}; + + for(auto& it : minus1_tests) + { + expect_shape(it.second, migraphx::make_op("reshape_lazy", {{"dims", it.first}}), input); + } +} + +// This uses the permutation to compute the reshape_lazy since its simpler than +// trying to calculate strides. As we collapse or expand dimensions, we +// remove the collapsed dimensions or duplicate the expanded dimensions in +// the permutation. 
Then we renumber the permutation. So for dimensions of 4, +// 24, 1, 1, 1 with a permutation of 1, 0, 2, 3, 4 that reshape_lazys to 4, 1, 3, +// 4, 2, we first remove the collapsed dimensions or duplicate the expanded +// dimensions which gives 1, 0, 0, 0, 0. Then after renumbering we get a +// final permutation of 4, 0, 1, 2, 3. +TEST_CASE(reshape_lazy_nonstandard) +{ + auto input = migraphx::shape::from_permutation(migraphx::shape::float_type, + {4, 24, 1, 1, 1}, + migraphx::invert_permutation({1, 0, 2, 3, 4})); + std::vector, std::vector>> tests{ + {{4, 24}, {1, 0}}, + {{4, 24, 1, 1, 1, 1}, {1, 0, 2, 3, 4, 5}}, + {{4, 8, 3, 1, 1}, {2, 0, 1, 3, 4}}, + {{4, 1, 3, 4, 2}, {4, 0, 1, 2, 3}}, + {{4, 1, 4, 3, 2}, {4, 0, 1, 2, 3}}, + {{4, 2, 4, 3}, {3, 0, 1, 2}}, + {{4, 2, 12, 1}, {2, 0, 1, 3}}, + {{4, 2, 1, 12}, {3, 0, 1, 2}}, + {{4, 4, 2, 3}, {3, 0, 1, 2}}, + {{4, 8, 1, 3}, {3, 0, 1, 2}}, + {{4, 8, 3, 1}, {2, 0, 1, 3}}}; + + for(const auto& [dims, perm] : tests) + { + migraphx::shape output = migraphx::shape::from_permutation( + migraphx::shape::float_type, dims, migraphx::invert_permutation(perm)); + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", dims}}), input); + } +} + +TEST_CASE(reshape_lazy_nonstandard_squeeze) +{ + auto input = migraphx::shape::from_permutation( + migraphx::shape::float_type, {2, 16, 16, 1280}, migraphx::invert_permutation({0, 2, 3, 1})); + std::vector lens = {2, 256, 1280}; + migraphx::shape output = migraphx::shape::from_permutation( + migraphx::shape::float_type, lens, migraphx::invert_permutation({0, 2, 1})); + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", lens}}), input); +} + +TEST_CASE(reshape_lazy_nonstandard_error) +{ + auto input = migraphx::shape::from_permutation(migraphx::shape::float_type, + {4, 24, 1, 1, 1}, + migraphx::invert_permutation({1, 0, 2, 3, 4})); + for(auto&& new_shape : std::vector>{{4, 8, 3, 2, 2}, + {1}, + {4, 8, 4}, + {4, 24, 1, 1, 1, 1, 2}, + {8, 4, 4}, + {4, 1, 3, -1, -1}, + {4, 3, 0}, + {4, 3, 2}, + {3, 0}, + {3, 2}}) + { + throws_shape(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); + } +} + +TEST_CASE(reshape_lazy_nonpacked_unsqueeze1) +{ + migraphx::shape input{migraphx::shape::float_type, {4, 16}, {32, 2}}; + migraphx::shape output{migraphx::shape::float_type, {4, 2, 8}, {32, 16, 2}}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", output.lens()}}), input); +} + +TEST_CASE(reshape_lazy_nonpacked_unsqueeze2) +{ + migraphx::shape input{migraphx::shape::float_type, {4, 16}, {32, 2}}; + migraphx::shape output{migraphx::shape::float_type, {2, 2, 16}, {64, 32, 2}}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", output.lens()}}), input); +} + +TEST_CASE(reshape_lazy_nonpacked_squeeze) +{ + migraphx::shape input{migraphx::shape::float_type, {4, 16}, {32, 2}}; + migraphx::shape output{migraphx::shape::float_type, {64}, {2}}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", output.lens()}}), input); +} + +TEST_CASE(reshape_lazy_broadcast_unsqueeze1) +{ + migraphx::shape input{migraphx::shape::float_type, {2, 256, 1280}, {0, 0, 1}}; + migraphx::shape output{migraphx::shape::float_type, {2, 16, 16, 1280}, {0, 0, 0, 1}}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", output.lens()}}), input); +} + +TEST_CASE(reshape_lazy_broadcast_unsqueeze2) +{ + migraphx::shape input{migraphx::shape::float_type, {2, 256, 1280}, {0, 0, 1}}; + migraphx::shape output{migraphx::shape::float_type, {2, 256, 16, 80}, {0, 0, 80, 1}}; + 
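+ // The packed axis of 1280 elements is split into 16 x 80 with strides
+ // {80, 1}, while the broadcast axes keep stride 0, so the alias succeeds
+ // without any data movement.
+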
expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", output.lens()}}), input); +} + +TEST_CASE(reshape_lazy_broadcast_squeeze) +{ + migraphx::shape input{migraphx::shape::float_type, {2, 16, 16, 1280}, {0, 0, 0, 1}}; + migraphx::shape output{migraphx::shape::float_type, {2, 256, 1280}, {0, 0, 1}}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", output.lens()}}), input); +} + +TEST_CASE(reshape_lazy_broadcast_squeeze_error) +{ + migraphx::shape input{migraphx::shape::float_type, {2, 16, 16, 1280}, {0, 0, 0, 1}}; + std::vector new_shape = {2, 16, 20480}; + throws_shape(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); +} + +TEST_CASE(reshape_lazy_dyn_shape) +{ + migraphx::shape input{migraphx::shape::float_type, {{1, 4}, {24, 24}, {1, 1}, {1, 1}}}; + for(auto&& new_shape : std::vector>{ + {-1, 1, 1, 24}, {0, 8, 3, 1}, {-1, 3, 4, 2}, {0, 2, 4, 3}}) + { + std::vector out_dyn_dims{}; + for(std::size_t i = 0; i < new_shape.size(); ++i) + { + if(new_shape[i] == 0 or new_shape[i] == -1) + { + out_dyn_dims.push_back(input.dyn_dims().at(i)); + } + else + { + std::size_t d = new_shape[i]; + out_dyn_dims.push_back({d, d}); + } + } + migraphx::shape output{migraphx::shape::float_type, out_dyn_dims}; + expect_shape(output, migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); + } +} + +TEST_CASE(reshape_lazy_multiple_non_fixed_error) +{ + migraphx::shape input{migraphx::shape::float_type, {{1, 4}, {24, 24}, {10, 20}, {1, 1}}}; + std::vector new_shape = {0, 1, 0, 24}; + throws_shape(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); +} + +TEST_CASE(reshape_lazy_fixed_ele_not_matching_error) +{ + migraphx::shape input{migraphx::shape::float_type, {{1, 4}, {24, 24}, {10, 10}, {1, 1}}}; + std::vector new_shape = {0, 1, 5, 24}; + throws_shape(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); +} + +TEST_CASE(reshape_lazy_non_fixed_not_matching_error) +{ + migraphx::shape input{migraphx::shape::float_type, {{1, 4}, {24, 24}, {1, 1}, {1, 1}}}; + std::vector new_shape = {2, 1, 1, 24}; + throws_shape(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); +} + TEST_CASE(return_shape_tuple) { using migraphx::shape; diff --git a/test/ref/reshape.cpp b/test/ref/reshape.cpp index f18a0063f09..440fb8cfe96 100644 --- a/test/ref/reshape.cpp +++ b/test/ref/reshape.cpp @@ -30,6 +30,78 @@ #include +TEST_CASE(reshape_lazy_test0) +{ + migraphx::shape a_shape{migraphx::shape::float_type, {24, 1, 1, 1}}; + std::vector data(24); + std::iota(data.begin(), data.end(), -3); + migraphx::program p; + auto* mm = p.get_main_module(); + auto l = mm->add_literal(migraphx::literal{a_shape, data}); + std::vector new_shape = {8, 3, 1, 1}; + mm->add_instruction(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), l); + p.compile(migraphx::make_target("ref")); + auto result = p.eval({}).back(); + std::vector results_vector{}; + result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); }); + EXPECT(migraphx::verify::verify_range(results_vector, data)); +} + +TEST_CASE(reshape_lazy_test1) +{ + migraphx::shape a_shape{migraphx::shape::float_type, {24, 1, 1, 1}}; + std::vector data(24); + std::iota(data.begin(), data.end(), -3); + migraphx::program p; + auto* mm = p.get_main_module(); + auto l = mm->add_literal(migraphx::literal{a_shape, data}); + std::vector new_shape = {1, 3, 4, 2}; + mm->add_instruction(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), l); + p.compile(migraphx::make_target("ref")); + auto 
result = p.eval({}).back(); + std::vector results_vector{}; + result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); }); + EXPECT(migraphx::verify::verify_range(results_vector, data)); +} + +TEST_CASE(reshape_lazy_test2) +{ + migraphx::shape a_shape{migraphx::shape::float_type, {24, 1, 1, 1}}; + std::vector data(24); + std::iota(data.begin(), data.end(), -3); + migraphx::program p; + auto* mm = p.get_main_module(); + auto l = mm->add_literal(migraphx::literal{a_shape, data}); + std::vector new_shape = {1, 2, 3, 4}; + mm->add_instruction(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), l); + p.compile(migraphx::make_target("ref")); + auto result = p.eval({}).back(); + std::vector results_vector{}; + result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); }); + EXPECT(migraphx::verify::verify_range(results_vector, data)); +} + +TEST_CASE(reshape_lazy_dyn_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape s{migraphx::shape::float_type, {{1, 4}, {24, 24}, {1, 1}, {1, 1}}}; + std::vector new_shape = {0, 8, 3, 1}; + auto input = mm->add_parameter("X", s); + mm->add_instruction(migraphx::make_op("reshape_lazy", {{"dims", new_shape}}), input); + p.compile(migraphx::make_target("ref")); + + std::vector data(48); + std::iota(data.begin(), data.end(), -3); + migraphx::parameter_map params; + migraphx::shape input_fixed_shape{migraphx::shape::float_type, {2, 24, 1, 1}}; + params["X"] = migraphx::argument(input_fixed_shape, data.data()); + auto result = p.eval(params).back(); + std::vector results_vector{}; + result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); }); + EXPECT(migraphx::verify::verify_range(results_vector, data)); +} + TEST_CASE(reshape_test0) { migraphx::shape a_shape{migraphx::shape::float_type, {24, 1, 1, 1}}; diff --git a/test/rewrite_pooling_test.cpp b/test/rewrite_pooling_test.cpp index f6d85d9679b..802b26e9f98 100644 --- a/test/rewrite_pooling_test.cpp +++ b/test/rewrite_pooling_test.cpp @@ -62,11 +62,8 @@ TEST_CASE(rewrite_pooling_test) auto opt_program = [&](const migraphx::operation& reduce_op) { migraphx::module m; auto input = m.add_parameter("x", s); - auto rsp = m.add_instruction(migraphx::make_op("reshape", {{"dims", {4, -1}}}), input); - auto rdm = m.add_instruction(reduce_op, rsp); - auto ret = - m.add_instruction(migraphx::make_op("reshape", {{"dims", {2, 2, 1, 1, 1}}}), rdm); - m.add_return({ret}); + auto rdm = m.add_instruction(reduce_op, input); + m.add_return({rdm}); return m; }; @@ -78,8 +75,9 @@ TEST_CASE(rewrite_pooling_test) }; test_rewrite(migraphx::op::pooling_mode::average, - migraphx::make_op("reduce_mean", {{"axes", {1}}})); - test_rewrite(migraphx::op::pooling_mode::max, migraphx::make_op("reduce_max", {{"axes", {1}}})); + migraphx::make_op("reduce_mean", {{"axes", {2, 3, 4}}})); + test_rewrite(migraphx::op::pooling_mode::max, + migraphx::make_op("reduce_max", {{"axes", {2, 3, 4}}})); } TEST_CASE(rewrite_avepooling_na1_test) diff --git a/test/verify/test_reduce_add.cpp b/test/verify/test_reduce_add.cpp new file mode 100644 index 00000000000..e7c1b56b6c1 --- /dev/null +++ b/test/verify/test_reduce_add.cpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include +#include + +struct test_reduce_add : verify_program +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape s{migraphx::shape::float_type, {4, 1000, 2, 2}}; + migraphx::shape bs{migraphx::shape::half_type, {1, 32, 128}}; + auto x = mm->add_parameter("x", s); + auto reduce_mean = + mm->add_instruction(migraphx::make_op("reduce_mean", {{"axes", {2, 3}}}), x); + auto reduce_max = + mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {2, 3}}}), x); + auto add = mm->add_instruction(migraphx::make_op("add"), reduce_mean, reduce_max); + mm->add_return({add}); + return p; + }; +}; diff --git a/test/verify/test_reduce_noop_add.cpp b/test/verify/test_reduce_noop_add.cpp new file mode 100644 index 00000000000..090c2a737d7 --- /dev/null +++ b/test/verify/test_reduce_noop_add.cpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "verify_program.hpp" +#include +#include +#include +#include + +struct test_reduce_noop_add : verify_program +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape s{migraphx::shape::float_type, {4, 1000, 1, 1}}; + migraphx::shape bs{migraphx::shape::half_type, {1, 32, 128}}; + auto x = mm->add_parameter("x", s); + auto reduce_mean = + mm->add_instruction(migraphx::make_op("reduce_mean", {{"axes", {2, 3}}}), x); + auto reduce_max = + mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {2, 3}}}), x); + auto add = mm->add_instruction(migraphx::make_op("add"), reduce_mean, reduce_max); + mm->add_return({add}); + return p; + }; +};