From e48e094541875d65a7f1b71b7f39bf07cfb77fcb Mon Sep 17 00:00:00 2001
From: SZUwishion <2559916473@qq.com>
Date: Sun, 1 Sep 2024 16:39:21 +0800
Subject: [PATCH 01/16] pnnx print flops memops count

---
 tools/pnnx/src/ir.cpp | 27 +++++++++++++++++++++++++++
 tools/pnnx/src/ir.h | 3 +++
 tools/pnnx/src/main.cpp | 3 +++
 3 files changed, 33 insertions(+)

diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 8b2b6dfd2d7f..994371954d6d 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include "storezip.h"
 #include "utils.h"
@@ -1441,6 +1442,32 @@ static std::string make_index_expression(const Operator* op)
 return index_expr;
 }
+int Graph::calculate_flops()
+{
+ int flops = 0;
+ for(auto op:ops) {
+ if(expand_expression(op) == "*")
+ {
+ int m = op->inputs[0]->shape[0];
+ int k = op->inputs[0]->shape[1];
+ int n = op->inputs[1]->shape[1];
+ flops += 2 * m * k * n;
+ }
+ else if(expand_expression(op) == "+") {
+ int m = op->inputs[0]->shape[0];
+ int n = op->inputs[0]->shape[1];
+ flops += m * n;
+ }
+ }
+ return flops;
+}
+
+int Graph::calculate_memops()
+{
+ int mem = sizeof(Operator) * ops.size() + sizeof(Operand) * operands.size();
+ return mem;
+}
+
 int Graph::python(const std::string& pypath, const std::string& pnnxbinpath)
 {
 FILE* pyfp = fopen(pypath.c_str(), "wb");
diff --git a/tools/pnnx/src/ir.h b/tools/pnnx/src/ir.h
index 779c2eec9f10..91e0e2a69fe3 100644
--- a/tools/pnnx/src/ir.h
+++ b/tools/pnnx/src/ir.h
@@ -346,6 +346,9 @@ class Graph
 std::vector<Operator*> ops;
 std::vector<Operand*> operands;
+ int calculate_flops();
+ int calculate_memops();
+
 private:
 Graph(const Graph& rhs);
 Graph& operator=(const Graph& rhs);
diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp
index c25128032d9e..dda54b1932dd 100644
--- a/tools/pnnx/src/main.cpp
+++ b/tools/pnnx/src/main.cpp
@@ -361,6 +361,9 @@ int main(int argc, char** argv)
 pnnx_graph.save(pnnxparampath, pnnxbinpath);
+ fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops());
+ fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops());
+
 pnnx_graph.python(pnnxpypath, pnnxbinpath);
 #if BUILD_PNNX2ONNX

From db91abd606ff00d7ebf9bdb2d3349720eb07fd21 Mon Sep 17 00:00:00 2001
From: SZUwishion <2559916473@qq.com>
Date: Sun, 1 Sep 2024 16:57:58 +0800
Subject: [PATCH 02/16] pnnx print flops memops count

---
 tools/pnnx/src/ir.cpp | 28 ++++++++++++++++++++++------
 tools/pnnx/src/ir.h | 4 ++--
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 994371954d6d..7c5923acac1e 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -1442,9 +1442,9 @@ static std::string make_index_expression(const Operator* op)
 return index_expr;
 }
-int Graph::calculate_flops()
+int Graph::calculate_flops_M()
 {
- int flops = 0;
+ long long flops = 0;
 for(auto op:ops) {
 if(expand_expression(op) == "*")
 {
 int m = op->inputs[0]->shape[0];
 int k = op->inputs[0]->shape[1];
 int n = op->inputs[1]->shape[1];
 flops += 2 * m * k * n;
 }
 else if(expand_expression(op) == "+") {
 int m = op->inputs[0]->shape[0];
 int n = op->inputs[0]->shape[1];
 flops += m * n;
 }
 }
- return flops;
+ return int(flops / 1e6);
 }
-int Graph::calculate_memops()
+int Graph::calculate_memops_M()
 {
- int mem = sizeof(Operator) * ops.size() + sizeof(Operand) * operands.size();
- return mem;
+ long long mem = 0;
+ for(auto op : ops)
+ {
+ if(expand_expression(op) == "*")
+ {
+ int m = op->inputs[0]->shape[0];
+ int k = op->inputs[0]->shape[1];
+ int n = op->inputs[1]->shape[1];
+ mem += m * k + k * n + m * n;
+ }
+ else if(expand_expression(op) == "+")
+ {
+ int m = op->inputs[0]->shape[0];
+ int n = op->inputs[0]->shape[1];
+ mem += 3 * m * n;
+ }
+ }
+ return int(mem / 1e6);
 }
 int Graph::python(const std::string& pypath, const std::string& pnnxbinpath)
diff --git a/tools/pnnx/src/ir.h b/tools/pnnx/src/ir.h
index 91e0e2a69fe3..bc1f0089591d 100644
--- a/tools/pnnx/src/ir.h
+++ b/tools/pnnx/src/ir.h
@@ -346,8 +346,8 @@ class Graph
 std::vector<Operator*> ops;
 std::vector<Operand*> operands;
- int calculate_flops();
- int calculate_memops();
+ int calculate_flops_M();
+ int calculate_memops_M();
 private:
 Graph(const Graph& rhs);

From 4af97a8c0cb75e0ed171e264b176d77f099850dc Mon Sep 17 00:00:00 2001
From: SZUwishion <2559916473@qq.com>
Date: Mon, 2 Sep 2024 10:49:01 +0800
Subject: [PATCH 03/16] pnnx print flops memops count

---
 tools/pnnx/src/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp
index dda54b1932dd..32e628be8d6f 100644
--- a/tools/pnnx/src/main.cpp
+++ b/tools/pnnx/src/main.cpp
@@ -361,8 +361,8 @@ int main(int argc, char** argv)
 pnnx_graph.save(pnnxparampath, pnnxbinpath);
- fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops());
- fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops());
+ fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M());
+ fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M());
 pnnx_graph.python(pnnxpypath, pnnxbinpath);

From a4fd3191d66a8d96e679eb7ae8fdd8b8a6e80d49 Mon Sep 17 00:00:00 2001
From: SZUwishion <2559916473@qq.com>
Date: Mon, 2 Sep 2024 17:47:02 +0800
Subject: [PATCH 04/16] pnnx print flops memops count

---
 tools/pnnx/src/main.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp
index 32e628be8d6f..5ef47b2409ac 100644
--- a/tools/pnnx/src/main.cpp
+++ b/tools/pnnx/src/main.cpp
@@ -313,6 +313,8 @@ int main(int argc, char** argv)
 std::string foldable_constants_zippath = ptbase + ".foldable_constants.zip";
 pnnx::Graph pnnx_graph;
+ fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M());
+ fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M());
 #if BUILD_ONNX2PNNX
 if (!model_file_maybe_torchscript(ptpath))
 {
@@ -361,9 +363,6 @@ int main(int argc, char** argv)
 pnnx_graph.save(pnnxparampath, pnnxbinpath);
- fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M());
- fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M());
-
 pnnx_graph.python(pnnxpypath, pnnxbinpath);
 #if BUILD_PNNX2ONNX

From 54659e042f843547f18b4b908da300ae2af89f8d Mon Sep 17 00:00:00 2001
From: SZUwishion <2559916473@qq.com>
Date: Mon, 2 Sep 2024 17:53:29 +0800
Subject: [PATCH 05/16] pnnx print flops memops count

---
 tools/pnnx/src/main.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp
index 5ef47b2409ac..23fdc0102224 100644
--- a/tools/pnnx/src/main.cpp
+++ b/tools/pnnx/src/main.cpp
@@ -313,8 +313,6 @@ int main(int argc, char** argv)
 std::string foldable_constants_zippath = ptbase + ".foldable_constants.zip";
 pnnx::Graph pnnx_graph;
- fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M());
- fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M());
 #if BUILD_ONNX2PNNX
 if (!model_file_maybe_torchscript(ptpath))
 {
@@ -384,6 +382,7 @@ int main(int argc, char** argv)
 // pnnx_graph2.load("pnnx.param", "pnnx.bin");
 // pnnx_graph2.save("pnnx2.param", "pnnx2.bin");
-
+ fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M());
+ fprintf(stderr, "memory ops = %dM\n",
pnnx_graph.calculate_memops_M()); return 0; } From 2c80f272cbbe9e2e0ae09321bb1af7eac19e01b0 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Tue, 3 Sep 2024 08:38:27 +0800 Subject: [PATCH 06/16] pnnx print flops memops count --- tools/pnnx/src/ir.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 7c5923acac1e..6dd429ebbfb4 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1446,14 +1446,14 @@ int Graph::calculate_flops_M() { long long flops = 0; for(auto op:ops) { - if(expand_expression(op) == "*") + if(op->type == "aten::matmul") { int m = op->inputs[0]->shape[0]; int k = op->inputs[0]->shape[1]; int n = op->inputs[1]->shape[1]; flops += 2 * m * k * n; } - else if(expand_expression(op) == "+") { + else if(op->type == "aten::add") { int m = op->inputs[0]->shape[0]; int n = op->inputs[0]->shape[1]; flops += m * n; @@ -1467,14 +1467,14 @@ int Graph::calculate_memops_M() long long mem = 0; for(auto op : ops) { - if(expand_expression(op) == "*") + if(op->type == "aten::matmul") { int m = op->inputs[0]->shape[0]; int k = op->inputs[0]->shape[1]; int n = op->inputs[1]->shape[1]; mem += m * k + k * n + m * n; } - else if(expand_expression(op) == "+") + else if(op->type == "aten::add") { int m = op->inputs[0]->shape[0]; int n = op->inputs[0]->shape[1]; From a89c0f7ce11c58fdef3c292bb198e6c0c741c8cb Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Tue, 3 Sep 2024 09:39:43 +0800 Subject: [PATCH 07/16] pnnx print flops memops count --- tools/pnnx/src/ir.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 6dd429ebbfb4..985cf47a3b97 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1459,7 +1459,7 @@ int Graph::calculate_flops_M() flops += m * n; } } - return int(flops / 1e6); + return int(flops); } int Graph::calculate_memops_M() @@ -1481,7 +1481,7 @@ int Graph::calculate_memops_M() mem += 3 * m * n; } } - return int(mem / 1e6); + return int(mem); } int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) From d247bb9aba088946b0ea86b0fff0bb62fa0f6c82 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Tue, 3 Sep 2024 09:42:58 +0800 Subject: [PATCH 08/16] pnnx print flops memops count --- tools/pnnx/src/ir.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 985cf47a3b97..59f26b17f054 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -14,6 +14,7 @@ #include "ir.h" +#include #include #include #include @@ -1467,6 +1468,7 @@ int Graph::calculate_memops_M() long long mem = 0; for(auto op : ops) { + fprintf(stderr, "%s\n", op->type.c_str()); if(op->type == "aten::matmul") { int m = op->inputs[0]->shape[0]; From b10faaf661c085179e3a62b33c24cf4826c12e38 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Wed, 11 Sep 2024 17:14:53 +0800 Subject: [PATCH 09/16] test --- tools/pnnx/src/ir.cpp | 148 ++++++++++++++++++++++++++++++++-------- tools/pnnx/src/main.cpp | 5 +- 2 files changed, 123 insertions(+), 30 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 59f26b17f054..6fd139627ad7 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1446,21 +1446,75 @@ static std::string make_index_expression(const Operator* op) int Graph::calculate_flops_M() { long long flops = 0; - for(auto op:ops) { - if(op->type == "aten::matmul") - { - int m = 
op->inputs[0]->shape[0]; - int k = op->inputs[0]->shape[1]; - int n = op->inputs[1]->shape[1]; - flops += 2 * m * k * n; - } - else if(op->type == "aten::add") { - int m = op->inputs[0]->shape[0]; - int n = op->inputs[0]->shape[1]; - flops += m * n; - } - } - return int(flops); + for(auto op:ops) + { + fprintf(stderr, "op->type %s\n", op->type.c_str()); + if(op->type[0] == 'F') + { + std::string sub_type = op->type.substr(2); + if(sub_type == "adaptive_avg_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int o = op->params.at("output_size").ai[0]; + flops += n * c * l * o; + } + else if(sub_type == "adaptive_avg_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int oh = op->params.at("output_size").ai[0]; + int ow = op->params.at("output_size").ai[1]; + flops += n * c * h * w * oh * ow; + } + else if(sub_type == "adaptive_avg_pool3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int od = op->params.at("output_size").ai[0]; + int oh = op->params.at("output_size").ai[1]; + int ow = op->params.at("output_size").ai[2]; + flops += n * c * d * h * w * od * oh * ow; + } + else if(sub_type == "adaptive_max_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int o = op->params.at("output_size").ai[0]; + flops += n * c * l * o; + } + else if(sub_type == "adaptive_max_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int oh = op->params.at("output_size").ai[0]; + int ow = op->params.at("output_size").ai[1]; + flops += n * c * h * w * oh * ow; + } + else if(sub_type == "adaptive_max_pool3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int od = op->params.at("output_size").ai[0]; + int oh = op->params.at("output_size").ai[1]; + int ow = op->params.at("output_size").ai[2]; + flops += n * c * d * h * w * od * oh * ow; + } + } + } + return int(flops / 1e6); } int Graph::calculate_memops_M() @@ -1468,22 +1522,60 @@ int Graph::calculate_memops_M() long long mem = 0; for(auto op : ops) { - fprintf(stderr, "%s\n", op->type.c_str()); - if(op->type == "aten::matmul") + if(op->type[0] == 'F') { - int m = op->inputs[0]->shape[0]; - int k = op->inputs[0]->shape[1]; - int n = op->inputs[1]->shape[1]; - mem += m * k + k * n + m * n; - } - else if(op->type == "aten::add") - { - int m = op->inputs[0]->shape[0]; - int n = op->inputs[0]->shape[1]; - mem += 3 * m * n; + std::string sub_type = op->type.substr(2); + if(sub_type == "adaptive_avg_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int o = op->params.at("output_size").ai[0]; + mem += n * c * l * o; + } + else if(sub_type == "adaptive_avg_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int oh = op->params.at("output_size").ai[0]; + int ow = op->params.at("output_size").ai[1]; + mem += n * c * h * w * oh * ow; + } + else if(sub_type == "adaptive_avg_pool3d") + { + int n = 
op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int od = op->params.at("output_size").ai[0]; + int oh = op->params.at("output_size").ai[1]; + int ow = op->params.at("output_size").ai[2]; + mem += n * c * d * h * w * od * oh * ow; + } + else if(sub_type == "adaptive_max_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int o = op->params.at("output_size").ai[0]; + mem += n * c * l * o; + } + else if(sub_type == "adaptive_max_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int oh = op->params.at("output_size").ai[0]; + int ow = op->params.at("output_size").ai[1]; + mem += n * c * h * w * oh * ow; + } } } - return int(mem); + return int(mem / 1e6); } int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp index 23fdc0102224..5f5cb3aa7fcd 100644 --- a/tools/pnnx/src/main.cpp +++ b/tools/pnnx/src/main.cpp @@ -362,6 +362,9 @@ int main(int argc, char** argv) pnnx_graph.save(pnnxparampath, pnnxbinpath); pnnx_graph.python(pnnxpypath, pnnxbinpath); + + fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M()); + fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M()); #if BUILD_PNNX2ONNX pnnx::save_onnx(pnnx_graph, pnnxonnxpath.c_str(), fp16); @@ -382,7 +385,5 @@ int main(int argc, char** argv) // pnnx_graph2.load("pnnx.param", "pnnx.bin"); // pnnx_graph2.save("pnnx2.param", "pnnx2.bin"); - fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M()); - fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M()); return 0; } From ef1e8dfcfd82b201c6c811870577bc309df147d8 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Wed, 11 Sep 2024 20:17:11 +0800 Subject: [PATCH 10/16] test --- tools/pnnx/src/ir.cpp | 66 ++++++++--------------------------------- tools/pnnx/src/ir.h | 5 ++-- tools/pnnx/src/main.cpp | 5 ++-- 3 files changed, 19 insertions(+), 57 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 6fd139627ad7..c01b66b2c31a 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1443,12 +1443,10 @@ static std::string make_index_expression(const Operator* op) return index_expr; } -int Graph::calculate_flops_M() +void Graph::flops_memops_sum() { - long long flops = 0; for(auto op:ops) { - fprintf(stderr, "op->type %s\n", op->type.c_str()); if(op->type[0] == 'F') { std::string sub_type = op->type.substr(2); @@ -1459,6 +1457,7 @@ int Graph::calculate_flops_M() int l = op->inputs[0]->shape[2]; int o = op->params.at("output_size").ai[0]; flops += n * c * l * o; + memops += n * c * l + n * c * o; } else if(sub_type == "adaptive_avg_pool2d") { @@ -1469,6 +1468,7 @@ int Graph::calculate_flops_M() int oh = op->params.at("output_size").ai[0]; int ow = op->params.at("output_size").ai[1]; flops += n * c * h * w * oh * ow; + memops += n * c * h * w + n * c * oh * ow; } else if(sub_type == "adaptive_avg_pool3d") { @@ -1481,6 +1481,7 @@ int Graph::calculate_flops_M() int oh = op->params.at("output_size").ai[1]; int ow = op->params.at("output_size").ai[2]; flops += n * c * d * h * w * od * oh * ow; + memops += n * c * d * h * w + n * c * od * oh * ow; } else if(sub_type == "adaptive_max_pool1d") { @@ -1489,6 +1490,7 @@ int 
Graph::calculate_flops_M()
 int l = op->inputs[0]->shape[2];
 int o = op->params.at("output_size").ai[0];
 flops += n * c * l * o;
+ memops += n * c * l + n * c * o;
 }
 else if(sub_type == "adaptive_max_pool2d")
 {
@@ -1499,6 +1501,7 @@ int Graph::calculate_flops_M()
 int oh = op->params.at("output_size").ai[0];
 int ow = op->params.at("output_size").ai[1];
 flops += n * c * h * w * oh * ow;
+ memops += n * c * h * w + n * c * oh * ow;
 }
 else if(sub_type == "adaptive_max_pool3d")
 {
@@ -1511,71 +1514,28 @@ int Graph::calculate_flops_M()
 int oh = op->params.at("output_size").ai[1];
 int ow = op->params.at("output_size").ai[2];
 flops += n * c * d * h * w * od * oh * ow;
+ memops += n * c * d * h * w + n * c * od * oh * ow;
 }
- }
- return int(flops);
-}
-
-int Graph::calculate_memops_M()
-{
- long long mem = 0;
- for(auto op : ops)
- {
- fprintf(stderr, "%s\n", op->type.c_str());
- if(op->type[0] == 'F')
- {
- std::string sub_type = op->type.substr(2);
- if(sub_type == "adaptive_avg_pool1d")
- {
- int n = op->inputs[0]->shape[0];
- int c = op->inputs[0]->shape[1];
- int l = op->inputs[0]->shape[2];
- int o = op->params.at("output_size").ai[0];
- mem += n * c * l * o;
- }
- else if(sub_type == "adaptive_avg_pool2d")
+ else if(sub_type == "celu")
 {
 int n = op->inputs[0]->shape[0];
 int c = op->inputs[0]->shape[1];
 int h = op->inputs[0]->shape[2];
 int w = op->inputs[0]->shape[3];
- int oh = op->params.at("output_size").ai[0];
- int ow = op->params.at("output_size").ai[1];
- mem += n * c * h * w * oh * ow;
- }
- else if(sub_type == "adaptive_avg_pool3d")
- {
- int n = op->inputs[0]->shape[0];
- int c = op->inputs[0]->shape[1];
- int d = op->inputs[0]->shape[2];
- int h = op->inputs[0]->shape[3];
- int w = op->inputs[0]->shape[4];
- int od = op->params.at("output_size").ai[0];
- int oh = op->params.at("output_size").ai[1];
- int ow = op->params.at("output_size").ai[2];
- mem += n * c * d * h * w * od * oh * ow;
+ flops += n * c * h * w;
+ memops += 2 * n * c * h * w;
 }
- else if(sub_type == "adaptive_max_pool1d")
- {
- int n = op->inputs[0]->shape[0];
- int c = op->inputs[0]->shape[1];
- int l = op->inputs[0]->shape[2];
- int o = op->params.at("output_size").ai[0];
- mem += n * c * l * o;
- }
- else if(sub_type == "adaptive_max_pool2d")
+ else if(sub_type == "elu")
 {
 int n = op->inputs[0]->shape[0];
 int c = op->inputs[0]->shape[1];
 int h = op->inputs[0]->shape[2];
 int w = op->inputs[0]->shape[3];
- int oh = op->params.at("output_size").ai[0];
- int ow = op->params.at("output_size").ai[1];
- mem += n * c * h * w * oh * ow;
+ flops += n * c * h * w;
+ memops += 2 * n * c * h * w;
 }
 }
 }
diff --git a/tools/pnnx/src/ir.h b/tools/pnnx/src/ir.h
index bc1f0089591d..c66141d7324c 100644
--- a/tools/pnnx/src/ir.h
+++ b/tools/pnnx/src/ir.h
@@ -346,8 +346,9 @@ class Graph
 std::vector<Operator*> ops;
 std::vector<Operand*> operands;
- int calculate_flops_M();
- int calculate_memops_M();
+ long long flops = 0;
+ long long memops = 0;
+ void flops_memops_sum();
 private:
 Graph(const Graph& rhs);
diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp
index 5f5cb3aa7fcd..f75af022cdeb 100644
--- a/tools/pnnx/src/main.cpp
+++ b/tools/pnnx/src/main.cpp
@@ -363,8 +363,9 @@ int main(int argc, char** argv)
 pnnx_graph.python(pnnxpypath, pnnxbinpath);
- fprintf(stderr, "float ops = %dM\n", pnnx_graph.calculate_flops_M());
- fprintf(stderr, "memory ops = %dM\n", pnnx_graph.calculate_memops_M());
+ pnnx_graph.flops_memops_sum();
+ fprintf(stderr, "float ops = %.3fM\n",
double(pnnx_graph.flops) / 1e6); + fprintf(stderr, "mem ops = %.3fM\n", double(pnnx_graph.memops) / 1e6); #if BUILD_PNNX2ONNX pnnx::save_onnx(pnnx_graph, pnnxonnxpath.c_str(), fp16); From b977f730f5ea119315711d891f22234ec91b3251 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Wed, 11 Sep 2024 20:35:12 +0800 Subject: [PATCH 11/16] test --- tools/pnnx/src/ir.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index c01b66b2c31a..64495974a639 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1447,6 +1447,7 @@ void Graph::flops_memops_sum() { for(auto op:ops) { + fprintf(stderr, "op->type: %s\n", op->type.c_str()); if(op->type[0] == 'F') { std::string sub_type = op->type.substr(2); From 9f4180002f838924958fed329d5831b8b248ba1a Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Thu, 12 Sep 2024 17:35:19 +0800 Subject: [PATCH 12/16] test --- tools/pnnx/src/ir.cpp | 565 ++++++++++++++++++++++++++++++++++++++-- tools/pnnx/src/ir.h | 6 +- tools/pnnx/src/main.cpp | 2 + 3 files changed, 548 insertions(+), 25 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 64495974a639..5125ac1d1ad7 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include "storezip.h" #include "utils.h" @@ -1445,13 +1444,13 @@ static std::string make_index_expression(const Operator* op) void Graph::flops_memops_sum() { - for(auto op:ops) + for (auto op : ops) { fprintf(stderr, "op->type: %s\n", op->type.c_str()); - if(op->type[0] == 'F') + if (op->type[0] == 'F') { std::string sub_type = op->type.substr(2); - if(sub_type == "adaptive_avg_pool1d") + if (sub_type == "adaptive_avg_pool1d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; @@ -1460,7 +1459,7 @@ void Graph::flops_memops_sum() flops += n * c * l * o; memops += n * c * l + n * c * o; } - else if(sub_type == "adaptive_avg_pool2d") + else if (sub_type == "adaptive_avg_pool2d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; @@ -1471,7 +1470,7 @@ void Graph::flops_memops_sum() flops += n * c * h * w * oh * ow; memops += n * c * h * w + n * c * oh * ow; } - else if(sub_type == "adaptive_avg_pool3d") + else if (sub_type == "adaptive_avg_pool3d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; @@ -1484,7 +1483,58 @@ void Graph::flops_memops_sum() flops += n * c * d * h * w * od * oh * ow; memops += n * c * d * h * w + n * c * od * oh * ow; } - else if(sub_type == "adaptive_max_pool1d") + else if (sub_type == "avg_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->params.at("kernel_size").ai[0]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int o = (l + 2 * p - k) / s + 1; + flops += n * c * l * k; + memops += n * c * l + n * c * o; + } + else if (sub_type == "avg_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int kh = op->params.at("kernel_size").ai[0]; + int kw = op->params.at("kernel_size").ai[1]; + int sh = op->params.at("stride").ai[0]; + int sw = op->params.at("stride").ai[1]; + int ph = op->params.at("padding").ai[0]; + int pw = op->params.at("padding").ai[1]; + int oh = (h + 2 * ph - kh) / sh + 1; + int ow = (w + 2 * pw - kw) / sw + 1; + flops += n * c * h * w * kh * kw; + memops += n * c * h * 
w + n * c * oh * ow; + } + else if (sub_type == "avg_pool3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->params.at("kernel_size").ai[0]; + int kh = op->params.at("kernel_size").ai[1]; + int kw = op->params.at("kernel_size").ai[2]; + int sd = op->params.at("stride").ai[0]; + int sh = op->params.at("stride").ai[1]; + int sw = op->params.at("stride").ai[2]; + int pd = op->params.at("padding").ai[0]; + int ph = op->params.at("padding").ai[1]; + int pw = op->params.at("padding").ai[2]; + int od = (d + 2 * pd - kd) / sd + 1; + int oh = (h + 2 * ph - kh) / sh + 1; + int ow = (w + 2 * pw - kw) / sw + 1; + flops += n * c * d * h * w * kd * kh * kw; + memops += n * c * d * h * w + n * c * od * oh * ow; + } + else if (sub_type == "adaptive_max_pool1d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; @@ -1493,7 +1543,7 @@ void Graph::flops_memops_sum() flops += n * c * l * o; memops += n * c * l + n * c * o; } - else if(sub_type == "adaptive_max_pool2d") + else if (sub_type == "adaptive_max_pool2d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; @@ -1504,7 +1554,7 @@ void Graph::flops_memops_sum() flops += n * c * h * w * oh * ow; memops += n * c * h * w + n * c * oh * ow; } - else if(sub_type == "adaptive_max_pool3d") + else if (sub_type == "adaptive_max_pool3d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; @@ -1517,23 +1567,492 @@ void Graph::flops_memops_sum() flops += n * c * d * h * w * od * oh * ow; memops += n * c * d * h * w + n * c * od * oh * ow; } - else if(sub_type == "celu") + else if (sub_type == "max_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->params.at("kernel_size").ai[0]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int o = (l + 2 * p - k) / s + 1; + flops += n * c * l * k; + memops += n * c * l + n * c * o; + } + else if (sub_type == "max_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int kh = op->params.at("kernel_size").ai[0]; + int kw = op->params.at("kernel_size").ai[1]; + int sh = op->params.at("stride").ai[0]; + int sw = op->params.at("stride").ai[1]; + int ph = op->params.at("padding").ai[0]; + int pw = op->params.at("padding").ai[1]; + int oh = (h + 2 * ph - kh) / sh + 1; + int ow = (w + 2 * pw - kw) / sw + 1; + flops += n * c * h * w * kh * kw; + memops += n * c * h * w + n * c * oh * ow; + } + else if (sub_type == "max_pool3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->params.at("kernel_size").ai[0]; + int kh = op->params.at("kernel_size").ai[1]; + int kw = op->params.at("kernel_size").ai[2]; + int sd = op->params.at("stride").ai[0]; + int sh = op->params.at("stride").ai[1]; + int sw = op->params.at("stride").ai[2]; + int pd = op->params.at("padding").ai[0]; + int ph = op->params.at("padding").ai[1]; + int pw = op->params.at("padding").ai[2]; + int od = (d + 2 * pd - kd) / sd + 1; + int oh = (h + 2 * ph - kh) / sh + 1; + int ow = (w + 2 * pw - kw) / sw + 1; + flops += n * c * d * h * w * kd * kh * kw; + memops += n * c * d * h * w + n * c * od * oh * ow; + } + else if (sub_type == 
"lp_pool1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->params.at("kernel_size").i; + int p = op->params.at("p").i; + if (p == 1) + { + extra_flops += 2 * n * c * l * k; + } + else if (p == 2) + { + extra_flops += 3 * n * c * l * k; + } + extra_memops += 2 * n * c * l; + } + else if (sub_type == "lp_pool2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int kh = op->params.at("kernel_size").ai[0]; + int kw = op->params.at("kernel_size").ai[1]; + int p = op->params.at("p").i; + if (p == 1) + { + extra_flops += 2 * n * c * h * w * kh * kw; + } + else if (p == 2) + { + extra_flops += 3 * n * c * h * w * kh * kw; + } + extra_memops += 2 * n * c * h * w; + } + else if (sub_type == "lp_pool3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->params.at("kernel_size").ai[0]; + int kh = op->params.at("kernel_size").ai[1]; + int kw = op->params.at("kernel_size").ai[2]; + int p = op->params.at("p").i; + if (p == 1) + { + extra_flops += 2 * n * c * d * h * w * kd * kh * kw; + } + else if (p == 2) + { + extra_flops += 3 * n * c * d * h * w * kd * kh * kw; + } + extra_memops += 2 * n * c * d * h * w; + } + else if ( + sub_type == "elu" || + sub_type == "celu" || + sub_type == "gelu" || + sub_type == "glu" || + sub_type == "hardshrink" || + sub_type == "hardsigmoid" || + sub_type == "hardswish" || + sub_type == "hardtanh" || + sub_type == "leaky_relu" || + sub_type == "prelu" || + sub_type == "relu" || + sub_type == "relu6" || + sub_type == "rrelu" || + sub_type == "mish" || + sub_type == "normalize" || + sub_type == "batch_norm" || + sub_type == "group_norm" || + sub_type == "instance_norm" || + sub_type == "layer_norm" + ) + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int num_elements = 1; + for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + { + num_elements *= op->inputs[0]->shape[i]; + } + if(sub_type == "elu") + { + extra_flops += 2 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "celu") + { + extra_flops += 3 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "gelu") + { + extra_flops += 3 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "glu") + { + int l = op->inputs[0]->shape[2]; + int o = op->outputs[0]->shape[2]; + extra_flops += n * c * l * o; + extra_memops += 2 * n * c * l + n * o; + } + else if(sub_type == "hardshrink") + { + extra_flops += 2 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "hardsigmoid") + { + extra_flops += 6 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "hardswish") + { + extra_flops += 5 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "hardtanh") + { + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "leaky_relu") + { + extra_flops += 2 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "prelu") + { + extra_flops += 2 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "relu") + { + extra_flops += n * c * num_elements; + extra_memops += n * c 
* num_elements; + } + else if(sub_type == "relu6") + { + extra_memops += n * c * num_elements; + } + else if(sub_type == "rrelu") + { + extra_flops += n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "mish") + { + extra_flops += 2 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if(sub_type == "normalize") + { + extra_flops += 7 * n * c * num_elements + 3; + extra_memops += 2 * n * c * num_elements; + } + else if( + sub_type == "batch_norm" || + sub_type == "group_norm" || + sub_type == "instance_norm" || + sub_type == "layer_norm" + ) + { + extra_flops += 7 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + } + else if (sub_type == "conv1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->inputs[1]->shape[0]; + int o = op->outputs[0]->shape[2]; + flops += 2 * n * c * l * k * o; + memops += 2 * n * c * l * k + n * o; + } + else if (sub_type == "conv2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int kh = op->inputs[1]->shape[2]; + int kw = op->inputs[1]->shape[3]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * h * w * kh * kw * o / g; + memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; + } + else if (sub_type == "conv3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->inputs[1]->shape[2]; + int kh = op->inputs[1]->shape[3]; + int kw = op->inputs[1]->shape[4]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; + memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; + } + else if (sub_type == "conv_transpose1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->inputs[1]->shape[0]; + int o = op->outputs[0]->shape[2]; + flops += 2 * n * c * l * k * o; + memops += 2 * n * c * l * k + n * o; + } + else if (sub_type == "conv_transpose2d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int h = op->inputs[0]->shape[2]; + int w = op->inputs[0]->shape[3]; + int kh = op->inputs[1]->shape[2]; + int kw = op->inputs[1]->shape[3]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * h * w * kh * kw * o / g; + memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; + } + else if (sub_type == "conv_transpose3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->inputs[1]->shape[2]; + int kh = op->inputs[1]->shape[3]; + int kw = op->inputs[1]->shape[4]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; + memops += 2 * n * c * d * h * w * kd * kh * kw / 
g + n * o * d * h * w; + } + else if (sub_type == "embedding") + { + int n = op->inputs[0]->shape[0]; + int l = op->inputs[0]->shape[1]; + int c = op->params.at("num_embeddings").i; + int e = op->params.at("embedding_dim").i; + extra_flops += n * l * e; + extra_memops += n * l + n * e; + } + else if (sub_type == "linear") + { + int n = op->inputs[0]->shape[0]; + int i = op->inputs[0]->shape[1]; + int o = op->outputs[0]->shape[1]; + flops += 2 * n * i * o; + memops += 2 * n * i + n * o; + } + else if (sub_type == "log_softmax") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + extra_flops += 2 * n * c * l; + extra_memops += 2 * n * c * l; + } + else if (sub_type == "logsigmoid") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + extra_flops += 2 * n * c * l; + extra_memops += 2 * n * c * l; + } + else if (sub_type == "scaled_dot_product_attention") + { + int n = op->inputs[0]->shape[0]; + int l = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + flops += 2 * n * l * l + n * l * d + n * l * l * d; + memops += 2 * n * l * d + 3 * n * l * l + n * l; + } + } + + else if (op->type.substr(0, 2) == "nn") + { + std::string sub_type = op->type.substr(3); + if ( + sub_type == "BatchNorm1d" || + sub_type == "BatchNorm2d" || + sub_type == "BatchNorm3d" + ) + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int num_elements = 1; + for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + { + num_elements *= op->inputs[0]->shape[i]; + } + extra_flops += 7 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if (sub_type == "Conv1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->inputs[1]->shape[0]; + int o = op->outputs[0]->shape[2]; + flops += 2 * n * c * l * k * o; + memops += 2 * n * c * l * k + n * o; + } + else if (sub_type == "Conv2d") { int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; int h = op->inputs[0]->shape[2]; int w = op->inputs[0]->shape[3]; - flops += n * c * h * w; - memops += 2 * n * c * h * w; + int kh = op->inputs[1]->shape[2]; + int kw = op->inputs[1]->shape[3]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * h * w * kh * kw * o / g; + memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; + } + else if (sub_type == "Conv3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->inputs[1]->shape[2]; + int kh = op->inputs[1]->shape[3]; + int kw = op->inputs[1]->shape[4]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; + memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; + } + else if (sub_type == "ConvTranspose1d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int l = op->inputs[0]->shape[2]; + int k = op->inputs[1]->shape[0]; + int o = op->outputs[0]->shape[2]; + flops += 2 * n * c * l * k * o; + memops += 2 * n * c * l * k + n * o; } - else if(sub_type == "elu") + else if (sub_type == "ConvTranspose2d") { int n = 
op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; int h = op->inputs[0]->shape[2]; int w = op->inputs[0]->shape[3]; - flops += n * c * h * w; - memops += 2 * n * c * h * w; + int kh = op->inputs[1]->shape[2]; + int kw = op->inputs[1]->shape[3]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * h * w * kh * kw * o / g; + memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; + } + else if (sub_type == "PReLU") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int num_elements = 1; + for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + { + num_elements *= op->inputs[0]->shape[i]; + } + extra_flops += 2 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; + } + else if (sub_type == "ConvTranspose3d") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int d = op->inputs[0]->shape[2]; + int h = op->inputs[0]->shape[3]; + int w = op->inputs[0]->shape[4]; + int kd = op->inputs[1]->shape[2]; + int kh = op->inputs[1]->shape[3]; + int kw = op->inputs[1]->shape[4]; + int o = op->outputs[0]->shape[2]; + int s = op->params.at("stride").ai[0]; + int p = op->params.at("padding").ai[0]; + int g = op->params.at("groups").i; + flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; + memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; + } + else if (sub_type == "Embedding") + { + int n = op->inputs[0]->shape[0]; + int l = op->inputs[0]->shape[1]; + int c = op->params.at("num_embeddings").i; + int e = op->params.at("embedding_dim").i; + extra_flops += 2 * n * l * e; + extra_memops += 2 * n * l + n * e; + } + else if (sub_type == "GroupNorm" || sub_type == "InstanceNorm" || sub_type == "LayerNorm") + { + int n = op->inputs[0]->shape[0]; + int c = op->inputs[0]->shape[1]; + int num_elements = 1; + for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + { + num_elements *= op->inputs[0]->shape[i]; + } + + extra_flops += 7 * n * c * num_elements; + extra_memops += 2 * n * c * num_elements; } } } @@ -1630,10 +2149,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "nn.AdaptiveAvgPool2d" - || op->type == "nn.AdaptiveAvgPool3d" - || op->type == "nn.AdaptiveMaxPool2d" - || op->type == "nn.AdaptiveMaxPool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "nn.AdaptiveAvgPool3d" + || op->type == "nn.AdaptiveMaxPool2d" + || op->type == "nn.AdaptiveMaxPool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } @@ -2386,10 +2905,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "F.adaptive_avg_pool2d" - || op->type == "F.adaptive_avg_pool3d" - || op->type == "F.adaptive_max_pool2d" - || op->type == "F.adaptive_max_pool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "F.adaptive_avg_pool3d" + || op->type == "F.adaptive_max_pool2d" + || op->type == "F.adaptive_max_pool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } diff --git a/tools/pnnx/src/ir.h b/tools/pnnx/src/ir.h index c66141d7324c..37ee81e0a6b5 100644 --- a/tools/pnnx/src/ir.h +++ b/tools/pnnx/src/ir.h @@ -346,8 +346,10 @@ class Graph std::vector ops; std::vector operands; - long long flops = 0; - long long memops = 0; + unsigned long 
long flops = 0; + unsigned long long memops = 0; + unsigned long long extra_flops = 0; + unsigned long long extra_memops = 0; void flops_memops_sum(); private: diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp index f75af022cdeb..949680faab82 100644 --- a/tools/pnnx/src/main.cpp +++ b/tools/pnnx/src/main.cpp @@ -366,6 +366,8 @@ int main(int argc, char** argv) pnnx_graph.flops_memops_sum(); fprintf(stderr, "float ops = %.3fM\n", double(pnnx_graph.flops) / 1e6); fprintf(stderr, "mem ops = %.3fM\n", double(pnnx_graph.memops) / 1e6); + fprintf(stderr, "extra float ops = %.3fM\n", double(pnnx_graph.extra_flops) / 1e6); + fprintf(stderr, "extra mem ops = %.3fM\n", double(pnnx_graph.extra_memops) / 1e6); #if BUILD_PNNX2ONNX pnnx::save_onnx(pnnx_graph, pnnxonnxpath.c_str(), fp16); From a91dc5ce90bcd677a92f92916b70f094dbdfc23b Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Mon, 16 Sep 2024 20:51:36 +0800 Subject: [PATCH 13/16] nn part finished --- tools/pnnx/src/ir.cpp | 722 +++++++++++++++++++++++------------------- 1 file changed, 388 insertions(+), 334 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 5125ac1d1ad7..6cbf320acf41 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -14,8 +14,11 @@ #include "ir.h" +#include #include +#include #include +#include #include #include #include @@ -23,6 +26,7 @@ #include #include #include +#include #include "storezip.h" #include "utils.h" @@ -1488,9 +1492,9 @@ void Graph::flops_memops_sum() int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; int l = op->inputs[0]->shape[2]; - int k = op->params.at("kernel_size").ai[0]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; + int k = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; + int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; int o = (l + 2 * p - k) / s + 1; flops += n * c * l * k; memops += n * c * l + n * c * o; @@ -1501,12 +1505,12 @@ void Graph::flops_memops_sum() int c = op->inputs[0]->shape[1]; int h = op->inputs[0]->shape[2]; int w = op->inputs[0]->shape[3]; - int kh = op->params.at("kernel_size").ai[0]; - int kw = op->params.at("kernel_size").ai[1]; - int sh = op->params.at("stride").ai[0]; - int sw = op->params.at("stride").ai[1]; - int ph = op->params.at("padding").ai[0]; - int pw = op->params.at("padding").ai[1]; + int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; + int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; + int sh = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int sw = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; + int ph = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int pw = op->has_param("padding") ? 
op->params.at("padding").ai[1] : 0; int oh = (h + 2 * ph - kh) / sh + 1; int ow = (w + 2 * pw - kw) / sw + 1; flops += n * c * h * w * kh * kw; @@ -1519,15 +1523,15 @@ void Graph::flops_memops_sum() int d = op->inputs[0]->shape[2]; int h = op->inputs[0]->shape[3]; int w = op->inputs[0]->shape[4]; - int kd = op->params.at("kernel_size").ai[0]; - int kh = op->params.at("kernel_size").ai[1]; - int kw = op->params.at("kernel_size").ai[2]; - int sd = op->params.at("stride").ai[0]; - int sh = op->params.at("stride").ai[1]; - int sw = op->params.at("stride").ai[2]; - int pd = op->params.at("padding").ai[0]; - int ph = op->params.at("padding").ai[1]; - int pw = op->params.at("padding").ai[2]; + int kd = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; + int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; + int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[2] : 1; + int sd = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int sh = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; + int sw = op->has_param("stride") ? op->params.at("stride").ai[2] : 1; + int pd = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int ph = op->has_param("padding") ? op->params.at("padding").ai[1] : 0; + int pw = op->has_param("padding") ? op->params.at("padding").ai[2] : 0; int od = (d + 2 * pd - kd) / sd + 1; int oh = (h + 2 * ph - kh) / sh + 1; int ow = (w + 2 * pw - kw) / sw + 1; @@ -1572,9 +1576,9 @@ void Graph::flops_memops_sum() int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; int l = op->inputs[0]->shape[2]; - int k = op->params.at("kernel_size").ai[0]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; + int k = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; + int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; int o = (l + 2 * p - k) / s + 1; flops += n * c * l * k; memops += n * c * l + n * c * o; @@ -1585,12 +1589,12 @@ void Graph::flops_memops_sum() int c = op->inputs[0]->shape[1]; int h = op->inputs[0]->shape[2]; int w = op->inputs[0]->shape[3]; - int kh = op->params.at("kernel_size").ai[0]; - int kw = op->params.at("kernel_size").ai[1]; - int sh = op->params.at("stride").ai[0]; - int sw = op->params.at("stride").ai[1]; - int ph = op->params.at("padding").ai[0]; - int pw = op->params.at("padding").ai[1]; + int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; + int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; + int sh = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int sw = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; + int ph = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int pw = op->has_param("padding") ? 
op->params.at("padding").ai[1] : 0; int oh = (h + 2 * ph - kh) / sh + 1; int ow = (w + 2 * pw - kw) / sw + 1; flops += n * c * h * w * kh * kw; @@ -1603,192 +1607,24 @@ void Graph::flops_memops_sum() int d = op->inputs[0]->shape[2]; int h = op->inputs[0]->shape[3]; int w = op->inputs[0]->shape[4]; - int kd = op->params.at("kernel_size").ai[0]; - int kh = op->params.at("kernel_size").ai[1]; - int kw = op->params.at("kernel_size").ai[2]; - int sd = op->params.at("stride").ai[0]; - int sh = op->params.at("stride").ai[1]; - int sw = op->params.at("stride").ai[2]; - int pd = op->params.at("padding").ai[0]; - int ph = op->params.at("padding").ai[1]; - int pw = op->params.at("padding").ai[2]; + int kd = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; + int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; + int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[2] : 1; + int sd = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int sh = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; + int sw = op->has_param("stride") ? op->params.at("stride").ai[2] : 1; + int pd = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int ph = op->has_param("padding") ? op->params.at("padding").ai[1] : 0; + int pw = op->has_param("padding") ? op->params.at("padding").ai[2] : 0; int od = (d + 2 * pd - kd) / sd + 1; int oh = (h + 2 * ph - kh) / sh + 1; int ow = (w + 2 * pw - kw) / sw + 1; flops += n * c * d * h * w * kd * kh * kw; memops += n * c * d * h * w + n * c * od * oh * ow; } - else if (sub_type == "lp_pool1d") + else if (sub_type == "prelu" || sub_type == "leaky_relu") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->params.at("kernel_size").i; - int p = op->params.at("p").i; - if (p == 1) - { - extra_flops += 2 * n * c * l * k; - } - else if (p == 2) - { - extra_flops += 3 * n * c * l * k; - } - extra_memops += 2 * n * c * l; - } - else if (sub_type == "lp_pool2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->params.at("kernel_size").ai[0]; - int kw = op->params.at("kernel_size").ai[1]; - int p = op->params.at("p").i; - if (p == 1) - { - extra_flops += 2 * n * c * h * w * kh * kw; - } - else if (p == 2) - { - extra_flops += 3 * n * c * h * w * kh * kw; - } - extra_memops += 2 * n * c * h * w; - } - else if (sub_type == "lp_pool3d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->params.at("kernel_size").ai[0]; - int kh = op->params.at("kernel_size").ai[1]; - int kw = op->params.at("kernel_size").ai[2]; - int p = op->params.at("p").i; - if (p == 1) - { - extra_flops += 2 * n * c * d * h * w * kd * kh * kw; - } - else if (p == 2) - { - extra_flops += 3 * n * c * d * h * w * kd * kh * kw; - } - extra_memops += 2 * n * c * d * h * w; - } - else if ( - sub_type == "elu" || - sub_type == "celu" || - sub_type == "gelu" || - sub_type == "glu" || - sub_type == "hardshrink" || - sub_type == "hardsigmoid" || - sub_type == "hardswish" || - sub_type == "hardtanh" || - sub_type == "leaky_relu" || - sub_type == "prelu" || - sub_type == "relu" || - sub_type == "relu6" || - sub_type == "rrelu" || - sub_type == "mish" || - sub_type == "normalize" || - sub_type == "batch_norm" || - sub_type == 
"group_norm" || - sub_type == "instance_norm" || - sub_type == "layer_norm" - ) - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int num_elements = 1; - for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) - { - num_elements *= op->inputs[0]->shape[i]; - } - if(sub_type == "elu") - { - extra_flops += 2 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "celu") - { - extra_flops += 3 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "gelu") - { - extra_flops += 3 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "glu") - { - int l = op->inputs[0]->shape[2]; - int o = op->outputs[0]->shape[2]; - extra_flops += n * c * l * o; - extra_memops += 2 * n * c * l + n * o; - } - else if(sub_type == "hardshrink") - { - extra_flops += 2 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "hardsigmoid") - { - extra_flops += 6 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "hardswish") - { - extra_flops += 5 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "hardtanh") - { - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "leaky_relu") - { - extra_flops += 2 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "prelu") - { - extra_flops += 2 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "relu") - { - extra_flops += n * c * num_elements; - extra_memops += n * c * num_elements; - } - else if(sub_type == "relu6") - { - extra_memops += n * c * num_elements; - } - else if(sub_type == "rrelu") - { - extra_flops += n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "mish") - { - extra_flops += 2 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if(sub_type == "normalize") - { - extra_flops += 7 * n * c * num_elements + 3; - extra_memops += 2 * n * c * num_elements; - } - else if( - sub_type == "batch_norm" || - sub_type == "group_norm" || - sub_type == "instance_norm" || - sub_type == "layer_norm" - ) - { - extra_flops += 7 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } + } else if (sub_type == "conv1d") { @@ -1809,9 +1645,9 @@ void Graph::flops_memops_sum() int kh = op->inputs[1]->shape[2]; int kw = op->inputs[1]->shape[3]; int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; + int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int g = op->has_param("groups") ? op->params.at("groups").i : 1; flops += 2 * n * c * h * w * kh * kw * o / g; memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; } @@ -1826,9 +1662,9 @@ void Graph::flops_memops_sum() int kh = op->inputs[1]->shape[3]; int kw = op->inputs[1]->shape[4]; int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; + int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int g = op->has_param("groups") ? 
op->params.at("groups").i : 1; flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; } @@ -1851,9 +1687,9 @@ void Graph::flops_memops_sum() int kh = op->inputs[1]->shape[2]; int kw = op->inputs[1]->shape[3]; int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; + int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int g = op->has_param("groups") ? op->params.at("groups").i : 1; flops += 2 * n * c * h * w * kh * kw * o / g; memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; } @@ -1868,20 +1704,15 @@ void Graph::flops_memops_sum() int kh = op->inputs[1]->shape[3]; int kw = op->inputs[1]->shape[4]; int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; + int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; + int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; + int g = op->has_param("groups") ? op->params.at("groups").i : 1; flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; } else if (sub_type == "embedding") { - int n = op->inputs[0]->shape[0]; - int l = op->inputs[0]->shape[1]; - int c = op->params.at("num_embeddings").i; - int e = op->params.at("embedding_dim").i; - extra_flops += n * l * e; - extra_memops += n * l + n * e; + /*todo*/ } else if (sub_type == "linear") { @@ -1920,139 +1751,362 @@ void Graph::flops_memops_sum() else if (op->type.substr(0, 2) == "nn") { std::string sub_type = op->type.substr(3); - if ( - sub_type == "BatchNorm1d" || - sub_type == "BatchNorm2d" || - sub_type == "BatchNorm3d" - ) - { + if (sub_type == "BatchNorm1d" + || sub_type == "BatchNorm2d" + || sub_type == "BatchNorm3d" + || sub_type == "GroupNorm" + || sub_type == "LayerNorm" + || sub_type == "InstanceNorm1d" + || sub_type == "InstanceNorm2d" + || sub_type == "InstanceNorm3d") + { + std::vector shape = op->inputs[0]->shape; int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; - int num_elements = 1; - for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + int num_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + if((op->has_param("affine") && op->params.at("affine").b) + || (op->has_param("elementwise_affine") && op->params.at("elementwise_affine").b)) { - num_elements *= op->inputs[0]->shape[i]; + extra_flops += 2 * num_elements; + extra_memops += 2 * (num_elements + n * c); + } + else + { + extra_flops += num_elements; + extra_memops += num_elements; + } + } + else if (sub_type == "Conv1d" + || sub_type == "Conv2d" + || sub_type == "Conv3d" + || sub_type == "ConvTranspose1d" + || sub_type == "ConvTranspose2d" + || sub_type == "ConvTranspose3d") + { + int c = op->params.at("in_channels").i; + std::vector k = op->params.at("kernel_size").ai; + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int g = op->params["groups"].i; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + int kernel_size = std::accumulate(k.begin() + 2, k.end(), 1, std::multiplies()); + flops += output_size * c * 
kernel_size / g; + memops += input_size + output_size + std::accumulate(k.begin(), k.end(), 1, std::multiplies()) * c / g; + if(op->has_param("bias")) + { + flops += output_size; + memops += output_size; + } + } + else if (sub_type == "AvgPool1d" + || sub_type == "AvgPool2d" + || sub_type == "AvgPool3d") + { + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + flops += input_size; + memops += input_size + output_size; + } + else if (sub_type == "AdaptiveAvgPool1d" + || sub_type == "AdaptiveAvgPool2d" + || sub_type == "AdaptiveAvgPool3d") + { + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + std::vector kernel_size; + for(size_t i = 2; i < input_shape.size(); i++) + { + kernel_size.emplace_back(input_shape[i] / output_shape[i]); + } + flops += (std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies()) + 1) * output_size; + memops += input_size + output_size; + } + else if(sub_type == "PReLU" + || sub_type == "ELU" + || sub_type == "LeakyReLU" + || sub_type == "GELU") + { + std::vector shape = op->outputs[0]->shape; + int n = shape[0]; + int num_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + extra_flops += num_elements; + if(sub_type == "PReLU") + { + extra_memops += 2 * num_elements + n * op->params["num_parameters"].i; + } + else + { + extra_memops += 2 * num_elements; } - extra_flops += 7 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; - } - else if (sub_type == "Conv1d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->inputs[1]->shape[0]; - int o = op->outputs[0]->shape[2]; - flops += 2 * n * c * l * k * o; - memops += 2 * n * c * l * k + n * o; - } - else if (sub_type == "Conv2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->inputs[1]->shape[2]; - int kw = op->inputs[1]->shape[3]; - int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; - flops += 2 * n * c * h * w * kh * kw * o / g; - memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; } - else if (sub_type == "Conv3d") + else if(sub_type == "Tanh") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->inputs[1]->shape[2]; - int kh = op->inputs[1]->shape[3]; - int kw = op->inputs[1]->shape[4]; - int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; - flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; - memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; + std::vector shape = op->outputs[0]->shape; + int num_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + extra_flops += 2 * num_elements; + 
extra_memops += 2 * num_elements; } - else if (sub_type == "ConvTranspose1d") + else if (sub_type == "Linear") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->inputs[1]->shape[0]; - int o = op->outputs[0]->shape[2]; - flops += 2 * n * c * l * k * o; - memops += 2 * n * c * l * k + n * o; + std::vector input_shape = op->inputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + std::vector output_shape = op->outputs[0]->shape; + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + int in_features = op->params.at("in_features").i; + int out_features = op->params.at("out_features").i; + int bias = (op->has_param("bias") && op->params.at("bias").b) ? out_features : 0; + flops += (in_features * out_features + bias) * input_size / in_features; + memops += input_size + output_size + output_size * (bias ? 1 : 0); } - else if (sub_type == "ConvTranspose2d") + else if (sub_type == "Upsample" + || sub_type == "UpsamplingBilinear2d" + || sub_type == "UpsamplingNearest2d") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->inputs[1]->shape[2]; - int kw = op->inputs[1]->shape[3]; - int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; - flops += 2 * n * c * h * w * kh * kw * o / g; - memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; + std::vector input_shape = op->inputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + std::vector output_shape = op->outputs[0]->shape; + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + std::string mode; + if(sub_type == "Upsample") + { + mode = op->has_param("mode") ? 
op->params.at("mode").s : "nearest"; + } + else if(sub_type == "UpsamplingBilinear2d") + { + mode = "bilinear"; + } + else if(sub_type == "UpsamplingNearest2d") + { + mode = "nearest"; + } + + if(mode == "nearest") + { + extra_flops += input_size; + extra_memops += input_size + output_size; + } + else if(mode == "linear") + { + extra_flops += 5 * output_size; + extra_memops += 2 * input_size + output_size; + } + else if(mode == "bilinear") + { + extra_flops += 11 * output_size; + extra_memops += 4 * input_size + output_size; + } + else if(mode == "bicubic") + { + extra_flops += (224 + 35) * output_size; + extra_memops += 16 * input_size + output_size; + } + else if(mode == "trilinear") + { + extra_flops += (13 * 2 + 5) * input_size; + extra_memops += 8 * input_size + output_size; + } } - else if (sub_type == "PReLU") + else if(sub_type == "RNN") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int num_elements = 1; - for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + bool bi = op->has_param("bidirectional") && op->params.at("bidirectional").b; + bool bias = op->has_param("bias") && op->params.at("bias").b; + int input_size = op->params.at("input_size").i; + int hidden_size = op->params.at("hidden_size").i; + int flops1 = hidden_size * (input_size + hidden_size) + hidden_size; + if(bias) + { + flops1 += 2 * hidden_size; + } + if(bi) + { + flops1 *= 2; + } + + int num_layers = op->params.at("num_layers").i; + int flops2 = 0; + if(bi) + { + flops2 = 3 * hidden_size * hidden_size + hidden_size; + if(bias) + { + flops2 += 2 * hidden_size; + } + flops2 *= 2 * num_layers; + } + else { - num_elements *= op->inputs[0]->shape[i]; + flops2 = 2 * hidden_size * hidden_size + hidden_size; + if(bias) + { + flops2 += 2 * hidden_size; + } + flops2 *= num_layers; + } + bool batch_first = op->has_param("batch_first") && op->params.at("batch_first").b; + int batch_size = batch_first ? op->inputs[0]->shape[0] : op->inputs[0]->shape[1]; + int num_steps = batch_first ? op->inputs[0]->shape[1] : op->inputs[0]->shape[0]; + flops += (flops1 + flops2) * num_steps * batch_size; + memops += num_steps * batch_size * input_size; + memops += 2 * num_steps * batch_size * hidden_size * num_layers * (bi ? 2 : 1); + if(bias) + { + memops += 2 * hidden_size * num_layers * (bi ? 
2 : 1); } - extra_flops += 2 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; } - else if (sub_type == "ConvTranspose3d") + else if(sub_type == "LSTM") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->inputs[1]->shape[2]; - int kh = op->inputs[1]->shape[3]; - int kw = op->inputs[1]->shape[4]; - int o = op->outputs[0]->shape[2]; - int s = op->params.at("stride").ai[0]; - int p = op->params.at("padding").ai[0]; - int g = op->params.at("groups").i; - flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; - memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; + bool bi = op->has_param("bidirectional") && op->params.at("bidirectional").b; + bool bias = op->has_param("bias") && op->params.at("bias").b; + int input_size = op->params.at("input_size").i; + int hidden_size = op->params.at("hidden_size").i; + int flops1 = 4 * hidden_size * (input_size + hidden_size) + 4 * hidden_size; + if(bias) + { + flops1 += 8 * hidden_size; + } + if(bi) + { + flops1 *= 2; + } + flops1 += 4 * hidden_size; + + int num_layers = op->params.at("num_layers").i; + int flops2 = 0; + if(bi) + { + flops2 = 12 * hidden_size * hidden_size + 4 * hidden_size; + if(bias) + { + flops2 += 8 * hidden_size; + } + flops2 += 4 * hidden_size; + flops2 *= 2 * num_layers; + } + else + { + flops2 = 4 * hidden_size * hidden_size + 4 * hidden_size; + if(bias) + { + flops2 += 8 * hidden_size; + } + flops2 += 4 * hidden_size; + flops2 *= num_layers; + } + bool batch_first = op->has_param("batch_first") && op->params.at("batch_first").b; + int batch_size = batch_first ? op->inputs[0]->shape[0] : op->inputs[0]->shape[1]; + int num_steps = batch_first ? op->inputs[0]->shape[1] : op->inputs[0]->shape[0]; + flops += (flops1 + flops2) * num_steps * batch_size; + memops += num_steps * batch_size * input_size; + memops += 2 * num_steps * batch_size * hidden_size * num_layers * (bi ? 2 : 1); + if(bias) + { + memops += 8 * hidden_size * num_layers * (bi ? 2 : 1); + } } - else if (sub_type == "Embedding") + else if (sub_type == "GRU") { - int n = op->inputs[0]->shape[0]; - int l = op->inputs[0]->shape[1]; - int c = op->params.at("num_embeddings").i; - int e = op->params.at("embedding_dim").i; - extra_flops += 2 * n * l * e; - extra_memops += 2 * n * l + n * e; + bool bi = op->has_param("bidirectional") && op->params.at("bidirectional").b; + bool bias = op->has_param("bias") && op->params.at("bias").b; + int input_size = op->params.at("input_size").i; + int hidden_size = op->params.at("hidden_size").i; + int flops1 = 3 * hidden_size * (input_size + hidden_size) + 3 * hidden_size; + if(bias) + { + flops1 += 6 * hidden_size; + } + flops1 += 4 * hidden_size; + if(bi) + { + flops1 *= 2; + } + + int num_layers = op->params.at("num_layers").i; + int flops2 = 0; + if(bi) + { + flops2 = 9 * hidden_size * hidden_size + 3 * hidden_size; + if(bias) + { + flops2 += 6 * hidden_size; + } + flops2 += 4 * hidden_size; + flops2 *= 2 * num_layers; + } + else + { + flops2 = 6 * hidden_size * hidden_size + 3 * hidden_size; + if(bias) + { + flops2 += 6 * hidden_size; + } + flops2 += 4 * hidden_size; + flops2 *= num_layers; + } + bool batch_first = op->has_param("batch_first") && op->params.at("batch_first").b; + int batch_size = batch_first ? op->inputs[0]->shape[0] : op->inputs[0]->shape[1]; + int num_steps = batch_first ? 
op->inputs[0]->shape[1] : op->inputs[0]->shape[0]; + flops += (flops1 + flops2) * num_steps * batch_size; + memops += num_steps * batch_size * input_size; + memops += 2 * num_steps * batch_size * hidden_size * num_layers * (bi ? 2 : 1); + if(bias) + { + memops += 6 * hidden_size * num_layers * (bi ? 2 : 1); + } } - else if (sub_type == "GroupNorm" || sub_type == "InstanceNorm" || sub_type == "LayerNorm") + else if(sub_type == "MultiheadAttention") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int num_elements = 1; - for (size_t i = 2; i < op->inputs[0]->shape.size(); ++i) + bool batch_first = op->has_param("batch_first") && op->params.at("batch_first").b; + int batch_size = batch_first ? op->inputs[0]->shape[0] : op->inputs[0]->shape[1]; + int qlen = batch_first ? op->inputs[0]->shape[1] : op->inputs[0]->shape[0]; + int klen = batch_first ? op->inputs[1]->shape[1] : op->inputs[1]->shape[0]; + int d_model = op->params.at("embed_dim").i; + int num_heads = op->params.at("num_heads").i; + int head_dim = d_model / num_heads; + bool bias = op->params.at("bias").b; + + // Linear transformations for Q, K, V + int flops_qkv = 3 * batch_size * qlen * d_model * d_model; + if (bias) + { + flops_qkv += 3 * batch_size * qlen * d_model; + } + + // Scaled dot-product attention + int flops_attention = batch_size * num_heads * qlen * klen * head_dim; + + // Linear transformation for output + int flops_output = batch_size * qlen * d_model * d_model; + if (bias) + { + flops_output += batch_size * qlen * d_model; + } + + flops += flops_qkv + flops_attention + flops_output; + + // Memory operations for Q, K, V + int memops_qkv = 3 * batch_size * qlen * d_model; + if (bias) + { + memops_qkv += 3 * d_model; + } + + // Memory operations for attention weights + int memops_attention = batch_size * num_heads * qlen * klen; + + // Memory operations for output + int memops_output = batch_size * qlen * d_model; + if (bias) { - num_elements *= op->inputs[0]->shape[i]; + memops_output += d_model; } - extra_flops += 7 * n * c * num_elements; - extra_memops += 2 * n * c * num_elements; + // Total memory operations + memops += memops_qkv + memops_attention + memops_output; } } } From 4adf254c729edc63ef0e7deb86b3bbb3cbf079c9 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Thu, 19 Sep 2024 16:02:54 +0800 Subject: [PATCH 14/16] functional finished --- tools/pnnx/src/ir.cpp | 335 +++++++----------------------------------- 1 file changed, 53 insertions(+), 282 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 6cbf320acf41..f05ca65a3c59 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1454,297 +1454,68 @@ void Graph::flops_memops_sum() if (op->type[0] == 'F') { std::string sub_type = op->type.substr(2); - if (sub_type == "adaptive_avg_pool1d") + if (sub_type == "linear") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int o = op->params.at("output_size").ai[0]; - flops += n * c * l * o; - memops += n * c * l + n * c * o; - } - else if (sub_type == "adaptive_avg_pool2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int oh = op->params.at("output_size").ai[0]; - int ow = op->params.at("output_size").ai[1]; - flops += n * c * h * w * oh * ow; - memops += n * c * h * w + n * c * oh * ow; - } - else if (sub_type == "adaptive_avg_pool3d") - { - int n = op->inputs[0]->shape[0]; - int c 
= op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int od = op->params.at("output_size").ai[0]; - int oh = op->params.at("output_size").ai[1]; - int ow = op->params.at("output_size").ai[2]; - flops += n * c * d * h * w * od * oh * ow; - memops += n * c * d * h * w + n * c * od * oh * ow; - } - else if (sub_type == "avg_pool1d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; - int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int o = (l + 2 * p - k) / s + 1; - flops += n * c * l * k; - memops += n * c * l + n * c * o; - } - else if (sub_type == "avg_pool2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; - int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; - int sh = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int sw = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; - int ph = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int pw = op->has_param("padding") ? op->params.at("padding").ai[1] : 0; - int oh = (h + 2 * ph - kh) / sh + 1; - int ow = (w + 2 * pw - kw) / sw + 1; - flops += n * c * h * w * kh * kw; - memops += n * c * h * w + n * c * oh * ow; - } - else if (sub_type == "avg_pool3d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; - int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; - int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[2] : 1; - int sd = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int sh = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; - int sw = op->has_param("stride") ? op->params.at("stride").ai[2] : 1; - int pd = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int ph = op->has_param("padding") ? op->params.at("padding").ai[1] : 0; - int pw = op->has_param("padding") ? 
op->params.at("padding").ai[2] : 0; - int od = (d + 2 * pd - kd) / sd + 1; - int oh = (h + 2 * ph - kh) / sh + 1; - int ow = (w + 2 * pw - kw) / sw + 1; - flops += n * c * d * h * w * kd * kh * kw; - memops += n * c * d * h * w + n * c * od * oh * ow; - } - else if (sub_type == "adaptive_max_pool1d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int o = op->params.at("output_size").ai[0]; - flops += n * c * l * o; - memops += n * c * l + n * c * o; - } - else if (sub_type == "adaptive_max_pool2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int oh = op->params.at("output_size").ai[0]; - int ow = op->params.at("output_size").ai[1]; - flops += n * c * h * w * oh * ow; - memops += n * c * h * w + n * c * oh * ow; - } - else if (sub_type == "adaptive_max_pool3d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int od = op->params.at("output_size").ai[0]; - int oh = op->params.at("output_size").ai[1]; - int ow = op->params.at("output_size").ai[2]; - flops += n * c * d * h * w * od * oh * ow; - memops += n * c * d * h * w + n * c * od * oh * ow; - } - else if (sub_type == "max_pool1d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; - int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int o = (l + 2 * p - k) / s + 1; - flops += n * c * l * k; - memops += n * c * l + n * c * o; - } - else if (sub_type == "max_pool2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; - int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; - int sh = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int sw = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; - int ph = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int pw = op->has_param("padding") ? op->params.at("padding").ai[1] : 0; - int oh = (h + 2 * ph - kh) / sh + 1; - int ow = (w + 2 * pw - kw) / sw + 1; - flops += n * c * h * w * kh * kw; - memops += n * c * h * w + n * c * oh * ow; - } - else if (sub_type == "max_pool3d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[0] : 1; - int kh = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[1] : 1; - int kw = op->has_param("kernel_size") ? op->params.at("kernel_size").ai[2] : 1; - int sd = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int sh = op->has_param("stride") ? op->params.at("stride").ai[1] : 1; - int sw = op->has_param("stride") ? op->params.at("stride").ai[2] : 1; - int pd = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int ph = op->has_param("padding") ? op->params.at("padding").ai[1] : 0; - int pw = op->has_param("padding") ? 
op->params.at("padding").ai[2] : 0; - int od = (d + 2 * pd - kd) / sd + 1; - int oh = (h + 2 * ph - kh) / sh + 1; - int ow = (w + 2 * pw - kw) / sw + 1; - flops += n * c * d * h * w * kd * kh * kw; - memops += n * c * d * h * w + n * c * od * oh * ow; - } - else if (sub_type == "prelu" || sub_type == "leaky_relu") - { - - } - else if (sub_type == "conv1d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->inputs[1]->shape[0]; - int o = op->outputs[0]->shape[2]; - flops += 2 * n * c * l * k * o; - memops += 2 * n * c * l * k + n * o; - } - else if (sub_type == "conv2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->inputs[1]->shape[2]; - int kw = op->inputs[1]->shape[3]; - int o = op->outputs[0]->shape[2]; - int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int g = op->has_param("groups") ? op->params.at("groups").i : 1; - flops += 2 * n * c * h * w * kh * kw * o / g; - memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; - } - else if (sub_type == "conv3d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->inputs[1]->shape[2]; - int kh = op->inputs[1]->shape[3]; - int kw = op->inputs[1]->shape[4]; - int o = op->outputs[0]->shape[2]; - int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int g = op->has_param("groups") ? op->params.at("groups").i : 1; - flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; - memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; - } - else if (sub_type == "conv_transpose1d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - int k = op->inputs[1]->shape[0]; - int o = op->outputs[0]->shape[2]; - flops += 2 * n * c * l * k * o; - memops += 2 * n * c * l * k + n * o; - } - else if (sub_type == "conv_transpose2d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int h = op->inputs[0]->shape[2]; - int w = op->inputs[0]->shape[3]; - int kh = op->inputs[1]->shape[2]; - int kw = op->inputs[1]->shape[3]; - int o = op->outputs[0]->shape[2]; - int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int g = op->has_param("groups") ? op->params.at("groups").i : 1; - flops += 2 * n * c * h * w * kh * kw * o / g; - memops += 2 * n * c * h * w * kh * kw / g + n * o * h * w; - } - else if (sub_type == "conv_transpose3d") - { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - int h = op->inputs[0]->shape[3]; - int w = op->inputs[0]->shape[4]; - int kd = op->inputs[1]->shape[2]; - int kh = op->inputs[1]->shape[3]; - int kw = op->inputs[1]->shape[4]; - int o = op->outputs[0]->shape[2]; - int s = op->has_param("stride") ? op->params.at("stride").ai[0] : 1; - int p = op->has_param("padding") ? op->params.at("padding").ai[0] : 0; - int g = op->has_param("groups") ? 
op->params.at("groups").i : 1; - flops += 2 * n * c * d * h * w * kd * kh * kw * o / g; - memops += 2 * n * c * d * h * w * kd * kh * kw / g + n * o * d * h * w; - } - else if (sub_type == "embedding") - { - /*todo*/ + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + int out_features = op->attrs.at("data").shape[0]; + flops += input_size * out_features; + if(op->has_param("bias")) + { + flops += out_features; + } + memops += input_size + output_size; } - else if (sub_type == "linear") + else if (sub_type == "avgpool1d" + || sub_type == "avgpool2d" + || sub_type == "avgpool3d" + || sub_type == "adaptive_avgpool1d" + || sub_type == "adaptive_avgpool2d" + || sub_type == "adaptive_avgpool3d") { - int n = op->inputs[0]->shape[0]; - int i = op->inputs[0]->shape[1]; - int o = op->outputs[0]->shape[1]; - flops += 2 * n * i * o; - memops += 2 * n * i + n * o; + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + flops += input_size; + memops += input_size + output_size; } - else if (sub_type == "log_softmax") + else if (sub_type == "prelu" + || sub_type == "elu" + || sub_type == "leaky_relu" + || sub_type == "gelu" + || sub_type == "silu" + || sub_type == "softmax") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - extra_flops += 2 * n * c * l; - extra_memops += 2 * n * c * l; + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + extra_flops += input_size; + extra_memops += input_size + output_size; } - else if (sub_type == "logsigmoid") + else if (sub_type == "upsample" + || sub_type == "upsample_nearest" + || sub_type == "upsample_bilinear") { - int n = op->inputs[0]->shape[0]; - int c = op->inputs[0]->shape[1]; - int l = op->inputs[0]->shape[2]; - extra_flops += 2 * n * c * l; - extra_memops += 2 * n * c * l; + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + extra_flops += output_size; + extra_memops += input_size + output_size; } - else if (sub_type == "scaled_dot_product_attention") + else if (sub_type == "interpolate") { - int n = op->inputs[0]->shape[0]; - int l = op->inputs[0]->shape[1]; - int d = op->inputs[0]->shape[2]; - flops += 2 * n * l * l + n * l * d + n * l * l * d; - memops += 2 * n * l * d + 3 * n * l * l + n * l; + std::vector input_shape = op->inputs[0]->shape; + std::vector output_shape = op->outputs[0]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + std::vector scale_factor = op->params.at("scale_factor").ai; + extra_flops += input_size * std::accumulate(scale_factor.begin(), scale_factor.end(), 1, std::multiplies()); + extra_memops += input_size + output_size; } }
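A side note, not part of the patches themselves: the shape-only rule that patch 14 applies to F.linear can be reproduced in isolation. The sketch below uses made-up placeholder shapes and, like the handler above, counts one multiply-accumulate per weight use, plus out_features extra flops when a bias is present.

// Standalone sketch (hypothetical shapes), mirroring the F.linear counting rule.
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    std::vector<int> input_shape = {2, 8};  // placeholder batch x in_features
    std::vector<int> output_shape = {2, 4}; // placeholder batch x out_features
    int out_features = 4;                   // would come from the weight's shape[0]
    long long input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1LL, std::multiplies<long long>());
    long long output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1LL, std::multiplies<long long>());
    long long flops = input_size * out_features; // one MAC per weight use
    long long memops = input_size + output_size; // read input, write output
    printf("flops=%lld memops=%lld\n", flops, memops); // prints flops=64 memops=24
    return 0;
}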
From 296954dee81b2b2bc8c1b87906c121031d538191 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Fri, 20 Sep 2024 12:56:07 +0800 Subject: [PATCH 15/16] all finished --- tools/pnnx/src/ir.cpp | 45 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index f05ca65a3c59..e4974f6b7c87 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1450,7 +1450,6 @@ void Graph::flops_memops_sum() { for (auto op : ops) { - fprintf(stderr, "op->type: %s\n", op->type.c_str()); if (op->type[0] == 'F') { std::string sub_type = op->type.substr(2); @@ -1880,6 +1879,50 @@ void Graph::flops_memops_sum() memops += memops_qkv + memops_attention + memops_output; } } + + else if (op->type.substr(0, 5) == "torch") + { + std::string sub_type = op->type.substr(6); + if(sub_type == "matmul" + || sub_type == "mm" + || sub_type == "bmm") + { + std::vector input_shape_1 = op->inputs[0]->shape; + std::vector input_shape_2 = op->inputs[1]->shape; + int input_size_1 = std::accumulate(input_shape_1.begin(), input_shape_1.end(), 1, std::multiplies()); + int input_size_2 = std::accumulate(input_shape_2.begin(), input_shape_2.end(), 1, std::multiplies()); + std::vector output_shape = op->outputs[0]->shape; + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + flops += input_size_1 * input_shape_2.back(); + memops += input_size_1 + input_size_2 + output_size; + } + else if (sub_type == "addmm" + || sub_type == "baddbmm") + { + std::vector input_shape = op->inputs[0]->shape; + std::vector mat_shape_1 = op->inputs[1]->shape; + std::vector mat_shape_2 = op->inputs[2]->shape; + int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + int mat_size_1 = std::accumulate(mat_shape_1.begin(), mat_shape_1.end(), 1, std::multiplies()); + int mat_size_2 = std::accumulate(mat_shape_2.begin(), mat_shape_2.end(), 1, std::multiplies()); + std::vector output_shape = op->outputs[0]->shape; + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + flops += input_size + mat_size_1 * mat_shape_2.back(); + memops += input_size + mat_size_1 + mat_size_2 + output_size; + } + else if (sub_type == "mul" + || sub_type == "add") + { + std::vector input_shape_1 = op->inputs[0]->shape; + std::vector input_shape_2 = op->inputs[1]->shape; + int input_size_1 = std::accumulate(input_shape_1.begin(), input_shape_1.end(), 1, std::multiplies()); + int input_size_2 = std::accumulate(input_shape_2.begin(), input_shape_2.end(), 1, std::multiplies()); + std::vector output_shape = op->outputs[0]->shape; + int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + flops += output_size; + memops += input_size_1 + input_size_2 + output_size; + } + } } }
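A side note, not part of the patches themselves: the torch.matmul/mm/bmm rule in patch 15 counts one multiply-accumulate per output contribution (b*m*k*n for batched (b,m,k) x (b,k,n) operands) rather than counting the multiply and the add separately. A standalone sketch of that rule, with placeholder shapes:

// Standalone sketch (hypothetical shapes), mirroring the torch.matmul rule.
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    std::vector<int> shape_a = {4, 32, 64};   // placeholder b x m x k
    std::vector<int> shape_b = {4, 64, 16};   // placeholder b x k x n
    std::vector<int> shape_out = {4, 32, 16}; // b x m x n
    long long size_a = std::accumulate(shape_a.begin(), shape_a.end(), 1LL, std::multiplies<long long>());
    long long size_b = std::accumulate(shape_b.begin(), shape_b.end(), 1LL, std::multiplies<long long>());
    long long size_out = std::accumulate(shape_out.begin(), shape_out.end(), 1LL, std::multiplies<long long>());
    long long flops = size_a * shape_b.back();     // b*m*k multiplied by n: one MAC each
    long long memops = size_a + size_b + size_out; // read both inputs, write output
    printf("flops=%lld memops=%lld\n", flops, memops); // prints flops=131072 memops=14336
    return 0;
}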
From 5ff6210585d6d8d48a7050245c979501b2829b49 Mon Sep 17 00:00:00 2001 From: SZUwishion <2559916473@qq.com> Date: Tue, 15 Oct 2024 23:09:30 +0800 Subject: [PATCH 16/16] code format fix --- tools/pnnx/src/ir.cpp | 132 ++++++++++++++++++++-------------------- tools/pnnx/src/main.cpp | 2 +- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index e4974f6b7c87..c81944c12052 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1461,18 +1461,18 @@ void Graph::flops_memops_sum() int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); int out_features = op->attrs.at("data").shape[0]; flops += input_size * out_features; - if(op->has_param("bias")) + if (op->has_param("bias")) { flops += out_features; } memops += input_size + output_size; } else if (sub_type == "avgpool1d" - || sub_type == "avgpool2d" - || sub_type == "avgpool3d" - || sub_type == "adaptive_avgpool1d" - || sub_type == "adaptive_avgpool2d" - || sub_type == "adaptive_avgpool3d") + || sub_type == "avgpool2d" + || sub_type == "avgpool3d" + || sub_type == "adaptive_avgpool1d" + || sub_type == "adaptive_avgpool2d" + || sub_type == "adaptive_avgpool3d") { std::vector input_shape = op->inputs[0]->shape; std::vector output_shape = op->outputs[0]->shape; @@ -1482,11 +1482,11 @@ void Graph::flops_memops_sum() memops += input_size + output_size; } else if (sub_type == "prelu" - || sub_type == "elu" - || sub_type == "leaky_relu" - || sub_type == "gelu" - || sub_type == "silu" - || sub_type == "softmax") + || sub_type == "elu" + || sub_type == "leaky_relu" + || sub_type == "gelu" + || sub_type == "silu" + || sub_type == "softmax") { std::vector input_shape = op->inputs[0]->shape; std::vector output_shape = op->outputs[0]->shape; @@ -1496,8 +1496,8 @@ void Graph::flops_memops_sum() extra_memops += input_size + output_size; } else if (sub_type == "upsample" - || sub_type == "upsample_nearest" - || sub_type == "upsample_bilinear" + || sub_type == "upsample_nearest" + || sub_type == "upsample_bilinear") { std::vector input_shape = op->inputs[0]->shape; std::vector output_shape = op->outputs[0]->shape; @@ -1534,7 +1534,7 @@ void Graph::flops_memops_sum() int n = op->inputs[0]->shape[0]; int c = op->inputs[0]->shape[1]; int num_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - if((op->has_param("affine") && op->params.at("affine").b) + if ((op->has_param("affine") && op->params.at("affine").b) || (op->has_param("elementwise_affine") && op->params.at("elementwise_affine").b)) { extra_flops += 2 * num_elements; @@ -1547,11 +1547,11 @@ void Graph::flops_memops_sum() } } else if (sub_type == "Conv1d" - || sub_type == "Conv2d" - || sub_type == "Conv3d" - || sub_type == "ConvTranspose1d" - || sub_type == "ConvTranspose2d" - || sub_type == "ConvTranspose3d") + || sub_type == "Conv2d" + || sub_type == "Conv3d" + || sub_type == "ConvTranspose1d" + || sub_type == "ConvTranspose2d" + || sub_type == "ConvTranspose3d") { int c = op->params.at("in_channels").i; std::vector k = op->params.at("kernel_size").ai; @@ -1561,17 +1561,17 @@ void Graph::flops_memops_sum() int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); int kernel_size = std::accumulate(k.begin() + 2, k.end(), 1, std::multiplies()); - flops += output_size * c * kernel_size / g; + flops += output_size * c * kernel_size / g; memops += input_size + output_size + std::accumulate(k.begin(), k.end(), 1, std::multiplies()) * c / g; - if(op->has_param("bias")) + if (op->has_param("bias")) { flops += output_size; memops += output_size; } } else if (sub_type == "AvgPool1d" - || sub_type == "AvgPool2d" - || sub_type == "AvgPool3d") + || sub_type == "AvgPool2d" + || sub_type == "AvgPool3d") { std::vector input_shape = op->inputs[0]->shape; std::vector output_shape = 
op->outputs[0]->shape; @@ -1581,31 +1581,31 @@ void Graph::flops_memops_sum() memops += input_size + output_size; } else if (sub_type == "AdaptiveAvgPool1d" - || sub_type == "AdaptiveAvgPool2d" - || sub_type == "AdaptiveAvgPool3d") + || sub_type == "AdaptiveAvgPool2d" + || sub_type == "AdaptiveAvgPool3d") { std::vector input_shape = op->inputs[0]->shape; std::vector output_shape = op->outputs[0]->shape; int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); std::vector kernel_size; - for(size_t i = 2; i < input_shape.size(); i++) + for (size_t i = 2; i < input_shape.size(); i++) { kernel_size.emplace_back(input_shape[i] / output_shape[i]); } flops += (std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies()) + 1) * output_size; memops += input_size + output_size; } - else if(sub_type == "PReLU" - || sub_type == "ELU" - || sub_type == "LeakyReLU" - || sub_type == "GELU" + else if (sub_type == "PReLU" + || sub_type == "ELU" + || sub_type == "LeakyReLU" + || sub_type == "GELU") { std::vector shape = op->outputs[0]->shape; int n = shape[0]; int num_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); extra_flops += num_elements; - if(sub_type == "PReLU") + if (sub_type == "PReLU") { extra_memops += 2 * num_elements + n * op->params["num_parameters"].i; } @@ -1614,7 +1614,7 @@ void Graph::flops_memops_sum() extra_memops += 2 * num_elements; } } - else if(sub_type == "Tanh") + else if (sub_type == "Tanh") { std::vector shape = op->outputs[0]->shape; int num_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); @@ -1634,75 +1634,75 @@ void Graph::flops_memops_sum() memops += input_size + output_size + output_size * (bias ? 1 : 0); } else if (sub_type == "Upsample" - || sub_type == "UpsamplingBilinear2d" - || sub_type == "UpsamplingNearest2d") + || sub_type == "UpsamplingBilinear2d" + || sub_type == "UpsamplingNearest2d") { std::vector input_shape = op->inputs[0]->shape; int input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); std::vector output_shape = op->outputs[0]->shape; int output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); std::string mode; - if(sub_type == "Upsample") + if (sub_type == "Upsample") { mode = op->has_param("mode") ? 
op->params.at("mode").s : "nearest"; } - else if(sub_type == "UpsamplingBilinear2d") + else if (sub_type == "UpsamplingBilinear2d") { mode = "bilinear"; } - else if(sub_type == "UpsamplingNearest2d") + else if (sub_type == "UpsamplingNearest2d") { mode = "nearest"; } - if(mode == "nearest") + if (mode == "nearest") { extra_flops += input_size; extra_memops += input_size + output_size; } - else if(mode == "linear") + else if (mode == "linear") { extra_flops += 5 * output_size; extra_memops += 2 * input_size + output_size; } - else if(mode == "bilinear") + else if (mode == "bilinear") { extra_flops += 11 * output_size; extra_memops += 4 * input_size + output_size; } - else if(mode == "bicubic") + else if (mode == "bicubic") { extra_flops += (224 + 35) * output_size; extra_memops += 16 * input_size + output_size; } - else if(mode == "trilinear") + else if (mode == "trilinear") { extra_flops += (13 * 2 + 5) * input_size; extra_memops += 8 * input_size + output_size; } } - else if(sub_type == "RNN") + else if (sub_type == "RNN") { bool bi = op->has_param("bidirectional") && op->params.at("bidirectional").b; bool bias = op->has_param("bias") && op->params.at("bias").b; int input_size = op->params.at("input_size").i; int hidden_size = op->params.at("hidden_size").i; int flops1 = hidden_size * (input_size + hidden_size) + hidden_size; - if(bias) + if (bias) { flops1 += 2 * hidden_size; } - if(bi) + if (bi) { flops1 *= 2; } int num_layers = op->params.at("num_layers").i; int flops2 = 0; - if(bi) + if (bi) { flops2 = 3 * hidden_size * hidden_size + hidden_size; - if(bias) + if (bias) { flops2 += 2 * hidden_size; } @@ -1711,7 +1711,7 @@ void Graph::flops_memops_sum() else { flops2 = 2 * hidden_size * hidden_size + hidden_size; - if(bias) + if (bias) { flops2 += 2 * hidden_size; } @@ -1723,23 +1723,23 @@ void Graph::flops_memops_sum() flops += (flops1 + flops2) * num_steps * batch_size; memops += num_steps * batch_size * input_size; memops += 2 * num_steps * batch_size * hidden_size * num_layers * (bi ? 2 : 1); - if(bias) + if (bias) { memops += 2 * hidden_size * num_layers * (bi ? 2 : 1); } } - else if(sub_type == "LSTM") + else if (sub_type == "LSTM") { bool bi = op->has_param("bidirectional") && op->params.at("bidirectional").b; bool bias = op->has_param("bias") && op->params.at("bias").b; int input_size = op->params.at("input_size").i; int hidden_size = op->params.at("hidden_size").i; int flops1 = 4 * hidden_size * (input_size + hidden_size) + 4 * hidden_size; - if(bias) + if (bias) { flops1 += 8 * hidden_size; } - if(bi) + if (bi) { flops1 *= 2; } @@ -1747,10 +1747,10 @@ void Graph::flops_memops_sum() int num_layers = op->params.at("num_layers").i; int flops2 = 0; - if(bi) + if (bi) { flops2 = 12 * hidden_size * hidden_size + 4 * hidden_size; - if(bias) + if (bias) { flops2 += 8 * hidden_size; } @@ -1760,7 +1760,7 @@ void Graph::flops_memops_sum() else { flops2 = 4 * hidden_size * hidden_size + 4 * hidden_size; - if(bias) + if (bias) { flops2 += 8 * hidden_size; } @@ -1773,7 +1773,7 @@ void Graph::flops_memops_sum() flops += (flops1 + flops2) * num_steps * batch_size; memops += num_steps * batch_size * input_size; memops += 2 * num_steps * batch_size * hidden_size * num_layers * (bi ? 2 : 1); - if(bias) + if (bias) { memops += 8 * hidden_size * num_layers * (bi ? 
2 : 1); } @@ -1785,22 +1785,22 @@ void Graph::flops_memops_sum() int input_size = op->params.at("input_size").i; int hidden_size = op->params.at("hidden_size").i; int flops1 = 3 * hidden_size * (input_size + hidden_size) + 3 * hidden_size; - if(bias) + if (bias) { flops1 += 6 * hidden_size; } flops1 += 4 * hidden_size; - if(bi) + if (bi) { flops1 *= 2; } int num_layers = op->params.at("num_layers").i; int flops2 = 0; - if(bi) + if (bi) { flops2 = 9 * hidden_size * hidden_size + 3 * hidden_size; - if(bias) + if (bias) { flops2 += 6 * hidden_size; } @@ -1810,7 +1810,7 @@ void Graph::flops_memops_sum() else { flops2 = 6 * hidden_size * hidden_size + 3 * hidden_size; - if(bias) + if (bias) { flops2 += 6 * hidden_size; } @@ -1823,12 +1823,12 @@ void Graph::flops_memops_sum() flops += (flops1 + flops2) * num_steps * batch_size; memops += num_steps * batch_size * input_size; memops += 2 * num_steps * batch_size * hidden_size * num_layers * (bi ? 2 : 1); - if(bias) + if (bias) { memops += 6 * hidden_size * num_layers * (bi ? 2 : 1); } } - else if(sub_type == "MultiheadAttention") + else if (sub_type == "MultiheadAttention") { bool batch_first = op->has_param("batch_first") && op->params.at("batch_first").b; int batch_size = batch_first ? op->inputs[0]->shape[0] : op->inputs[0]->shape[1]; @@ -1883,7 +1883,7 @@ void Graph::flops_memops_sum() else if (op->type.substr(0, 5) == "torch") { std::string sub_type = op->type.substr(6); - if(sub_type == "matmul" + if (sub_type == "matmul" || sub_type == "mm" || sub_type == "bmm") { @@ -1897,7 +1897,7 @@ void Graph::flops_memops_sum() memops += input_size_1 + input_size_2 + output_size; } else if (sub_type == "addmm" - || sub_type == "baddbmm") + || sub_type == "baddbmm") { std::vector input_shape = op->inputs[0]->shape; std::vector mat_shape_1 = op->inputs[1]->shape; @@ -1911,7 +1911,7 @@ void Graph::flops_memops_sum() memops += input_size + mat_size_1 + mat_size_2 + output_size; } else if (sub_type == "mul" - || sub_type == "add") + || sub_type == "add") { std::vector input_shape_1 = op->inputs[0]->shape; std::vector input_shape_2 = op->inputs[1]->shape; diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp index 949680faab82..a50ca679fbc6 100644 --- a/tools/pnnx/src/main.cpp +++ b/tools/pnnx/src/main.cpp @@ -362,7 +362,7 @@ int main(int argc, char** argv) pnnx_graph.save(pnnxparampath, pnnxbinpath); pnnx_graph.python(pnnxpypath, pnnxbinpath); - + pnnx_graph.flops_memops_sum(); fprintf(stderr, "float ops = %.3fM\n", double(pnnx_graph.flops) / 1e6); fprintf(stderr, "mem ops = %.3fM\n", double(pnnx_graph.memops) / 1e6);
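A side note, not part of the patches themselves: once the series is applied, the counters can also be exercised outside of main.cpp by a small standalone driver. This is a minimal sketch assuming the existing pnnx::Graph::load(parampath, binpath) API and the flops/memops members introduced above; the model paths are placeholders.

// Minimal driver sketch; paths are placeholders.
#include <cstdio>
#include "ir.h"

int main()
{
    pnnx::Graph graph;
    if (graph.load("model.pnnx.param", "model.pnnx.bin") != 0)
    {
        fprintf(stderr, "load failed\n");
        return -1;
    }

    // Sum flops/memops across all operators, then report in millions,
    // matching the format used by the pnnx tool itself.
    graph.flops_memops_sum();
    fprintf(stderr, "float ops = %.3fM\n", double(graph.flops) / 1e6);
    fprintf(stderr, "mem ops = %.3fM\n", double(graph.memops) / 1e6);
    return 0;
}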