From 810f734215cff4a974460affd8368ae484987b48 Mon Sep 17 00:00:00 2001 From: Alexandre Eichenberger Date: Mon, 28 Aug 2023 03:51:13 -0400 Subject: [PATCH] Opt reporting: add signatures to stats (#2464) * Opt reporting: add signatures to stats Signed-off-by: Alexandre Eichenberger --------- Signed-off-by: Alexandre Eichenberger Co-authored-by: Tung D. Le --- include/onnx-mlir/Runtime/OMTensor.h | 11 +- .../ONNXToKrnl/Math/Elementwise.cpp | 4 +- .../ONNXToKrnl/ONNXToKrnlCommon.cpp | 58 +-- .../ONNXToKrnl/Tensor/PrintSignature.cpp | 27 +- src/Dialect/ONNX/ONNXOps/OpHelper.cpp | 54 +++ src/Dialect/ONNX/ONNXOps/OpHelper.hpp | 6 + src/Dialect/ONNX/ONNXOps/ShapeHelper.hpp | 1 + src/Runtime/OMInstrument.inc | 38 +- src/Runtime/OMTensor.inc | 347 ++++++++++-------- .../ONNX/InstrumentONNXSignaturePass.cpp | 27 +- utils/make-report.py | 133 +++++-- 11 files changed, 427 insertions(+), 279 deletions(-) diff --git a/include/onnx-mlir/Runtime/OMTensor.h b/include/onnx-mlir/Runtime/OMTensor.h index 712a5d8ddf..f25e25c523 100644 --- a/include/onnx-mlir/Runtime/OMTensor.h +++ b/include/onnx-mlir/Runtime/OMTensor.h @@ -195,7 +195,8 @@ OM_EXTERNAL_VISIBILITY const int64_t *omTensorGetShape(const OMTensor *tensor); * * Set the data shape array of the OMTensor to the values in the input array. */ -OM_EXTERNAL_VISIBILITY void omTensorSetShape(OMTensor *tensor, const int64_t *shape); +OM_EXTERNAL_VISIBILITY void omTensorSetShape( + OMTensor *tensor, const int64_t *shape); /** * \brief OMTensor data strides getter @@ -209,7 +210,8 @@ OM_EXTERNAL_VISIBILITY void omTensorSetShape(OMTensor *tensor, const int64_t *sh * @param tensor pointer to the OMTensor * @return pointer to the data strides array. */ -OM_EXTERNAL_VISIBILITY const int64_t *omTensorGetStrides(const OMTensor *tensor); +OM_EXTERNAL_VISIBILITY const int64_t *omTensorGetStrides( + const OMTensor *tensor); /** * \brief OMTensor data strides setter @@ -320,7 +322,10 @@ OM_EXTERNAL_VISIBILITY void omTensorSetOwning(OMTensor *tensor, int64_t owning); /** * Print an OMTensor to stdout. * - * @param msg, pointer to descriptive string + * @param msg, pointer to descriptive string. It accepts one of 3 formats: + * '%t' for printing the tensor's type, '%s' for printing an extensive signature + * printout, and '%d' for printing the full data values of the tensor. + * Additionally it recognize '%e' as the end of the message string. 
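 * As an illustration only (not part of this patch), a call such as
 *   omTensorPrint("tensor type %t, values: %d%e", tensor);
 * prints the compact type of 'tensor' followed by all of its data values and
 * stops at '%e'; characters that are not part of a '%' directive are printed
 * verbatim.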
* @param tensor, pointer to the OMTensor to print */ OM_EXTERNAL_VISIBILITY void omTensorPrint( diff --git a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp index ffe68c60a8..fa656219ce 100644 --- a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp @@ -2094,7 +2094,7 @@ struct ONNXElementwiseBinaryOpLowering shapeHelper, create, op, outputMemRefType, collapsedInnermostLoops, estimatedSimdLoopTripCount); if (uVL > 0) { - if (collapsedInnermostLoops == outputRank) + if (collapsedInnermostLoops == (int64_t)outputRank) onnxToKrnlSimdReport(op, /*successful*/ true, uVL, estimatedSimdLoopTripCount, "binary fully flattened"); else @@ -2257,7 +2257,7 @@ struct ONNXElementwiseVariadicOpLowering shapeHelper, create, op, outputMemRefType, collapsedInnermostLoops, estimatedSimdLoopTripCount); if (uVL > 0) { - if (collapsedInnermostLoops == outputRank) + if (collapsedInnermostLoops == (int64_t)outputRank) onnxToKrnlSimdReport(op, /*successful*/ true, uVL, estimatedSimdLoopTripCount, "variadic fully flattened"); else diff --git a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp index 01e636786c..a57df06990 100644 --- a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp +++ b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp @@ -16,7 +16,6 @@ #include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/IR/BuiltinTypeInterfaces.h" -#include "llvm/Support/Path.h" #include "src/Accelerators/Accelerator.hpp" #include "src/Dialect/Krnl/DialectBuilder.hpp" @@ -664,66 +663,13 @@ bool hasNonIdentityLayout(ValueRange operands) { // Support functions for reporting. //===----------------------------------------------------------------------===// -// We may try to relate the node names generated by the instrumentation -// with the node names printed by opt-report. Thus it is key to keep the -// code that generates these node name in sync. -// -// The code are found here: -// 1) `matchAndRewrite` from `src/Conversion/KrnlToLLVM/KrnlInstrument.cpp` -// 2) `getNodeNameLikeInKrnlInstrument` from -// `src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp` - -static std::string getNodeNameLikeInKrnlInstrument(Operation *op) { - StringAttr nodeName; - // Try with op onnx_node_name attribute. - nodeName = op->getAttrOfType("onnx_node_name"); - if (nodeName) { - LLVM_DEBUG(llvm::dbgs() << "op has node name\n"); - return nodeName.str(); - } - // Try with op location. - Location loc = op->getLoc(); - if (auto nameLoc = loc.dyn_cast()) { - LLVM_DEBUG(llvm::dbgs() << "op has node name\n"); - return nameLoc.getName().str(); - } - if (auto fusedLoc = loc.dyn_cast()) { - // Combine each location name and set it as nodeName. 
- LLVM_DEBUG(llvm::dbgs() << "fuse loc has name\n"); - std::string name; - for (Location locIt : fusedLoc.getLocations()) { - if (auto nameLocIt = locIt.dyn_cast()) - name += nameLocIt.getName().str() + "-"; - else if (auto fileLineColLoc = locIt.dyn_cast()) { - std::string filename = - llvm::sys::path::filename(fileLineColLoc.getFilename().str()).str(); - name += filename + ":" + std::to_string(fileLineColLoc.getLine()) + "-"; - } - } - if (name.empty()) - name = "NOTSET"; - else - name.pop_back(); // remove last "-" - return name; - } - if (auto fileLineColLoc = loc.dyn_cast()) { - LLVM_DEBUG(llvm::dbgs() << "file line col has name\n"); - std::string filename = - llvm::sys::path::filename(fileLineColLoc.getFilename().str()).str(); - std::string name = - filename + ":" + std::to_string(fileLineColLoc.getLine()); - return name; - } - return "NOTSET"; -} - void impl::onnxToKrnlParallelReport(Operation *op, bool successful, int64_t loopLevel, int64_t parallelLoopTripCount, const std::string &comment) { assert(OnnxToKrnlLoweringConfiguration::reportOnParallel && "must report"); assert(comment.find(',') == std::string::npos && "no comma in comments"); StringAttr opName = op->getName().getIdentifier(); - std::string nodeNameStr = getNodeNameLikeInKrnlInstrument(op); + std::string nodeNameStr = getNodeNameInPresenceOfOpt(op); // Print report on this op. printf("==PAR-REPORT==, %s%s, %s, %s, %lld, %lld\n", opName.data(), (successful ? "-parallel" : ""), nodeNameStr.c_str(), comment.c_str(), @@ -736,7 +682,7 @@ void impl::onnxToKrnlSimdReport(Operation *op, bool successful, assert(OnnxToKrnlLoweringConfiguration::reportOnSimd && "must report"); assert(comment.find(',') == std::string::npos && "no comma in comments"); StringAttr opName = op->getName().getIdentifier(); - std::string nodeNameStr = getNodeNameLikeInKrnlInstrument(op); + std::string nodeNameStr = getNodeNameInPresenceOfOpt(op); // Handling message. std::string message = OnnxToKrnlLoweringConfiguration::defaultSimdComment; if (message.empty()) diff --git a/src/Conversion/ONNXToKrnl/Tensor/PrintSignature.cpp b/src/Conversion/ONNXToKrnl/Tensor/PrintSignature.cpp index c7c813b47f..0ad00c12df 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/PrintSignature.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/PrintSignature.cpp @@ -33,15 +33,30 @@ struct ONNXPrintSignatureLowering Location loc = ONNXLoc(op); MultiDialectBuilder create(rewriter, loc); + // First message. std::string opName(printSignatureOp.getOpName().data()); - create.krnl.printf(opName); - std::string msg = "%t "; + std::string msg = + "%i==SIG-REPORT==, " + opName + ", sig"; // meaningless secondary key. + // Discover the values to print, setting aside the last one. + llvm::SmallVector printVal; for (Value oper : adaptor.getInput()) if (!oper.getType().isa()) - create.krnl.printTensor(msg, oper); - Value noneValue; - rewriter.replaceOpWithNewOp(op, "\n", noneValue); - // For debug, no need to report on SIMD. + printVal.emplace_back(oper); + int64_t printNum = printVal.size(); + Value lastVal = printVal.pop_back_val(); + // Print all but the last one. + for (Value oper : printVal) { + create.krnl.printTensor(msg + ", %t%e", oper); + msg = "%i"; + } + // Print the last one with replace with new op. 
+ if (printNum > 0) { + rewriter.replaceOpWithNewOp( + op, msg + ", %t\n%e", lastVal); + } else { + Value noneVal = nullptr; + rewriter.replaceOpWithNewOp(op, msg + "\n%e", noneVal); + } return success(); } }; diff --git a/src/Dialect/ONNX/ONNXOps/OpHelper.cpp b/src/Dialect/ONNX/ONNXOps/OpHelper.cpp index 0ccb098f5b..809a238b48 100644 --- a/src/Dialect/ONNX/ONNXOps/OpHelper.cpp +++ b/src/Dialect/ONNX/ONNXOps/OpHelper.cpp @@ -14,6 +14,7 @@ #include "mlir/IR/TypeUtilities.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Path.h" #include "src/Dialect/Mlir/IndexExpr.hpp" #include "src/Dialect/ONNX/ONNXLayoutHelper.hpp" @@ -676,4 +677,57 @@ int64_t mlirTypeToOnnxType(Type elemType) { return onnxType; } +//===----------------------------------------------------------------------===// +// Support for location. +//===----------------------------------------------------------------------===// + +// We may try to relate the node names generated by the instrumentation +// with the node names printed by opt-report. Thus it is key to keep the +// code that generates these node name in sync. +// +// The code are found here: +// 1) `matchAndRewrite` from `src/Conversion/KrnlToLLVM/KrnlInstrument.cpp` +// 2) `getNodeNameInPresenceOfOpt` from +// `src/Dialect/ONNX/ONNXOps/OpHelper.cpp` + +std::string getNodeNameInPresenceOfOpt(Operation *op) { + StringAttr nodeName; + // Try with op onnx_node_name attribute. + nodeName = op->getAttrOfType("onnx_node_name"); + if (nodeName) { + return nodeName.str(); + } + // Try with op location. + Location loc = op->getLoc(); + if (auto nameLoc = loc.dyn_cast()) { + return nameLoc.getName().str(); + } + if (auto fusedLoc = loc.dyn_cast()) { + // Combine each location name and set it as nodeName. + std::string name; + for (Location locIt : fusedLoc.getLocations()) { + if (auto nameLocIt = locIt.dyn_cast()) + name += nameLocIt.getName().str() + "-"; + else if (auto fileLineColLoc = locIt.dyn_cast()) { + std::string filename = + llvm::sys::path::filename(fileLineColLoc.getFilename().str()).str(); + name += filename + ":" + std::to_string(fileLineColLoc.getLine()) + "-"; + } + } + if (name.empty()) + name = "NOTSET"; + else + name.pop_back(); // remove last "-" + return name; + } + if (auto fileLineColLoc = loc.dyn_cast()) { + std::string filename = + llvm::sys::path::filename(fileLineColLoc.getFilename().str()).str(); + std::string name = + filename + ":" + std::to_string(fileLineColLoc.getLine()); + return name; + } + return "NOTSET"; +} + } // namespace onnx_mlir diff --git a/src/Dialect/ONNX/ONNXOps/OpHelper.hpp b/src/Dialect/ONNX/ONNXOps/OpHelper.hpp index 06a3479981..2b8aa7d360 100644 --- a/src/Dialect/ONNX/ONNXOps/OpHelper.hpp +++ b/src/Dialect/ONNX/ONNXOps/OpHelper.hpp @@ -261,4 +261,10 @@ bool areDimsFromConcat(mlir::Value val); /// Get all dimensions that are stored by the value. void getDims(mlir::Value val, llvm::SmallVectorImpl &dims); +//===----------------------------------------------------------------------===// +// Support for location. 
+//===----------------------------------------------------------------------===// + +std::string getNodeNameInPresenceOfOpt(mlir::Operation *op); + } // namespace onnx_mlir diff --git a/src/Dialect/ONNX/ONNXOps/ShapeHelper.hpp b/src/Dialect/ONNX/ONNXOps/ShapeHelper.hpp index b642f2602b..beb50392ef 100644 --- a/src/Dialect/ONNX/ONNXOps/ShapeHelper.hpp +++ b/src/Dialect/ONNX/ONNXOps/ShapeHelper.hpp @@ -895,4 +895,5 @@ void SaveOnnxAttrInOp(mlir::Operation *op, OP_TYPE specificOp = llvm::cast(op); setAttr(specificOp, newAttr); } + } // namespace onnx_mlir diff --git a/src/Runtime/OMInstrument.inc b/src/Runtime/OMInstrument.inc index d61cebd2bc..92cae0c656 100644 --- a/src/Runtime/OMInstrument.inc +++ b/src/Runtime/OMInstrument.inc @@ -80,27 +80,27 @@ static FILE *fout = 0; #endif #ifdef _WIN32 -void TimeInit() { +static void TimeInit() { QueryPerformanceFrequency(&perfFrequency); QueryPerformanceCounter(&globalTime); initTime = globalTime; } #else -void TimeInit() { +static void TimeInit() { gettimeofday(&globalTimeVal, NULL); initTimeVal = globalTimeVal; } #endif #ifdef _WIN32 -inline void WinTimerSub(LARGE_INTEGER newTime, LARGE_INTEGER prevTime, +static inline void WinTimerSub(LARGE_INTEGER newTime, LARGE_INTEGER prevTime, LONGLONG *resultSeconds, LONGLONG *resultMicroseconds) { LONGLONG elapsed = newTime.QuadPart - prevTime.QuadPart; *resultSeconds = elapsed / perfFrequency.QuadPart; *resultMicroseconds = ((elapsed * 1000000) / perfFrequency.QuadPart) % 1000000; } -void ReportTime() { +static void ReportTime() { LARGE_INTEGER newTime; LONGLONG resultSeconds, resultMicroseconds; QueryPerformanceCounter(&newTime); @@ -111,7 +111,7 @@ void ReportTime() { globalTime = newTime; } #else -void ReportTime() { +static void ReportTime() { struct timeval newTimeValue, result; gettimeofday(&newTimeValue, NULL); timersub(&newTimeValue, &globalTimeVal, &result); @@ -125,7 +125,7 @@ void ReportTime() { #endif #ifdef _WIN32 -void ReportMemory() { +static void ReportMemory() { PROCESS_MEMORY_COUNTERS_EX pmc; GetProcessMemoryInfo( GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS *)&pmc, sizeof(pmc)); @@ -133,7 +133,7 @@ void ReportMemory() { fprintf(fout, "%zu\n", vMemSizeKB); } #else -void ReportMemory() { +static void ReportMemory() { char memCommand[200]; char memOutput[200]; FILE *memPipe; @@ -159,6 +159,21 @@ void ReportMemory() { } #endif +FILE *getInstrumentFile() { + if (fout) + return fout; + fout = stdout; + if (getenv("ONNX_MLIR_INSTRUMENT_FILE")) { + char *fileName = getenv("ONNX_MLIR_INSTRUMENT_FILE"); + FILE *newFileHandle = fopen(fileName, "w"); + if (newFileHandle) { + fout = newFileHandle; + } + } + assert(fout); + return fout; +} + void OMInstrumentInit() { // Read environment variables. if (getenv("ONNX_MLIR_NO_INSTRUMENT_TIME")) { @@ -172,14 +187,7 @@ void OMInstrumentInit() { instrumentReportDisabled = true; } // Handle redirection to file if requested. - fout = stdout; - if (getenv("ONNX_MLIR_INSTRUMENT_FILE")) { - char *fileName = getenv("ONNX_MLIR_INSTRUMENT_FILE"); - FILE *newFileHandle = fopen(fileName, "w"); - if (newFileHandle) { - fout = newFileHandle; - } - } + getInstrumentFile(); // Init as appropriate. 
if (!instrumentReportDisabled) { diff --git a/src/Runtime/OMTensor.inc b/src/Runtime/OMTensor.inc index c5b15bca69..19eb594b60 100644 --- a/src/Runtime/OMTensor.inc +++ b/src/Runtime/OMTensor.inc @@ -34,9 +34,9 @@ #include #endif +#include #include #include -#include #include "onnx-mlir/Runtime/OMTensor.h" @@ -49,7 +49,7 @@ #if defined(__APPLE__) // __errno_location() is called from krnl::emitErrNo() to set errno in // generated llvm IR, but it somehow doesn't come out of the box on MacOS. -int * __attribute__((__weak__)) __errno_location(void) { return &errno; } +int *__attribute__((__weak__)) __errno_location(void) { return &errno; } #endif // On some platforms LLVM generates f16 conversion code that calls some @@ -141,62 +141,6 @@ static inline int64_t getNumElems(const int64_t *shape, int64_t rank) { return numElem; } -/* Helper function to get the ONNX data type C name string */ -static inline const char *getDataTypeName(OM_DATA_TYPE dataType) { - return OM_DATA_TYPE_NAME[dataType]; -} - -/* Helper function to print an element of a tensor */ -static inline void printElement( - void *dataPtr, int64_t elemOffset, OM_DATA_TYPE dataType) { - assert(dataPtr && "Expecting a valid data pointer"); - assert(elemOffset >= 0 && "Expecting a valid element offset"); - - switch (dataType) { - case ONNX_TYPE_BOOL: - printf("%d", ((bool *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_UINT8: - printf("%hhu", ((uint8_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_INT8: - printf("%hhd", ((int8_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_UINT16: - printf("%hu", ((uint16_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_INT16: - printf("%hd", ((int16_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_UINT32: - printf("%u", ((uint32_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_INT32: - printf("%d", ((int32_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_UINT64: - printf("%llu", (unsigned long long)((uint64_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_INT64: - printf("%lld", (long long)((int64_t *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_FLOAT16: - printf("%f", om_f16_to_f32(((uint16_t *)dataPtr)[elemOffset])); - break; - case ONNX_TYPE_FLOAT: - printf("%f", ((float *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_DOUBLE: - printf("%f", ((double *)dataPtr)[elemOffset]); - break; - case ONNX_TYPE_STRING: - printf("%s", ((const char **)dataPtr)[elemOffset]); - break; - default: - assert(false && "unexpected data type"); - } -} - /* Helper function to compute the inner product of 2 arrays (of equal size) */ static inline int64_t inner_product( const int64_t v[], const int64_t u[], int64_t len, int64_t init) { @@ -345,7 +289,9 @@ void omTensorSetDataPtr( } /* OMTensor data shape getter */ -const int64_t *omTensorGetShape(const OMTensor *tensor) { return tensor->_shape; } +const int64_t *omTensorGetShape(const OMTensor *tensor) { + return tensor->_shape; +} /* OMTensor data shape setter */ void omTensorSetShape(OMTensor *tensor, const int64_t *shape) { @@ -354,7 +300,9 @@ void omTensorSetShape(OMTensor *tensor, const int64_t *shape) { } /* OMTensor data strides getter */ -const int64_t *omTensorGetStrides(const OMTensor *tensor) { return tensor->_strides; } +const int64_t *omTensorGetStrides(const OMTensor *tensor) { + return tensor->_strides; +} /* OMTensor data strides setter */ void omTensorSetStrides(OMTensor *tensor, const int64_t *strides) { @@ -432,131 +380,232 @@ void *omTensorGetAllocatedPtr(const OMTensor *tensor) { return tensor->_allocatedPtr; } 
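Before the rewritten omTensorPrint below, here is a minimal caller-side sketch of the format protocol it implements. This is illustrative only: it assumes the public omTensorCreate/omTensorDestroy constructors declared alongside omTensorPrint in OMTensor.h, and the 2x3 tensor and the exact type-name spelling are made up.

    #include <stdint.h>
    #include "onnx-mlir/Runtime/OMTensor.h"

    int main() {
      float data[6] = {1, 2, 3, 4, 5, 6};
      int64_t shape[2] = {2, 3};
      OMTensor *t = omTensorCreate(data, shape, 2, ONNX_TYPE_FLOAT);
      omTensorPrint("type: %t\n%e", t);     /* compact type, e.g. 2x3xfloat */
      omTensorPrint("signature:\n%s%e", t); /* rank, dataType, numElems, shape,
                                               strides, and type */
      omTensorPrint("data: %d%e", t);       /* nested printout of every element */
      omTensorDestroy(t);
      return 0;
    }

The '%e' terminator exists because, as the implementation notes below, the message strings produced by the lowering are not reliably terminated yet; plain characters in msg are echoed as-is, and '%i' (used by the generated signature code) redirects the whole message to the instrumentation stream.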
-void omTensorPrint(const char *msg, const OMTensor *tensor) { - assert(tensor && "attempt to print a null OMTensor"); +//===----------------------------------------------------------------------===// +/* omTensorPrint helper */ + +/* Helper function to get the ONNX data type C name string */ +static inline const char *getDataTypeName(OM_DATA_TYPE dataType) { + return OM_DATA_TYPE_NAME[dataType]; +} +/* Helper function to print type as 8x16x1xfloat */ +static void printType(FILE *fout, const OMTensor *tensor) { + const OM_DATA_TYPE dataType = omTensorGetDataType(tensor); + const int64_t rank = omTensorGetRank(tensor); + const int64_t *shape = omTensorGetShape(tensor); + for (int64_t i = 0; i < rank; i++) + fprintf(fout, "%lldx", (long long)shape[i]); + fprintf(fout, "%s", getDataTypeName(dataType)); +} + +/* Helper function to print a detailed signature of a vector */ +static void printSignature(FILE *fout, const OMTensor *tensor) { const OM_DATA_TYPE dataType = omTensorGetDataType(tensor); const int64_t rank = omTensorGetRank(tensor); const int64_t *shape = omTensorGetShape(tensor); const int64_t *strides = omTensorGetStrides(tensor); - void *dataPtr = omTensorGetDataPtr(tensor); - // Process format if any (default is signature and data). - bool signatureInfo = false; - bool typeInfo = false; - bool dataInfo = false; - bool hadFormat = false; - while (strlen(msg) >= 2 && msg[0] == '%') { - if (msg[1] == 's') // S for signature. - hadFormat = signatureInfo = true; - else if (msg[1] == 't') // T for type only - hadFormat = typeInfo = true; - else if (msg[1] == 'd') - hadFormat = dataInfo = true; - else - printf("warning: unknown format `%c`\n", msg[1]); - // Skip the format when printing message. - msg += 2; - } - if (!hadFormat) - signatureInfo = dataInfo = true; - - // Print message. - if (msg) - printf("%s", msg); - - if (signatureInfo) { - // Print detailed info. - printf("\trank = %lld\n", (long long)rank); - printf("\tdataType = %s\n", getDataTypeName(dataType)); - printf("\tnumElems = %lld\n", (long long)omTensorGetNumElems(tensor)); - printf("\tshape: "); - for (int64_t i = 0; i < rank; i++) - printf("[%lld]", (long long)shape[i]); - printf("\n"); - printf("\tstrides: "); - for (int64_t i = 0; i < rank; i++) - printf("[%lld]", (long long)strides[i]); - printf("\n"); - } - if (typeInfo) { - // Print compact type like in MLIR, e.g. 32x16xf32. 
- if (signatureInfo) - printf("\ttype: "); - for (int64_t i = 0; i < rank; i++) - printf("%lldx", (long long)shape[i]); - printf("%s", getDataTypeName(dataType)); - if (signatureInfo) - printf("\n"); + fprintf(fout, "\trank = %lld\n", (long long)rank); + fprintf(fout, "\tdataType = %s\n", getDataTypeName(dataType)); + fprintf(fout, "\tnumElems = %lld\n", (long long)omTensorGetNumElems(tensor)); + fprintf(fout, "\tshape: "); + for (int64_t i = 0; i < rank; i++) + fprintf(fout, "[%lld]", (long long)shape[i]); + fprintf(fout, "\n"); + fprintf(fout, "\tstrides: "); + for (int64_t i = 0; i < rank; i++) + fprintf(fout, "[%lld]", (long long)strides[i]); + fprintf(fout, "\n"); + + fprintf(fout, "\ttype: "); + printType(fout, tensor); + fprintf(fout, "\n"); +} + +/* Helper function to print an element of a tensor */ +static void printElement( + FILE *fout, void *dataPtr, int64_t elemOffset, OM_DATA_TYPE dataType) { + assert(dataPtr && "Expecting a valid data pointer"); + assert(elemOffset >= 0 && "Expecting a valid element offset"); + + switch (dataType) { + case ONNX_TYPE_BOOL: + fprintf(fout, "%d", ((bool *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_UINT8: + fprintf(fout, "%hhu", ((uint8_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_INT8: + fprintf(fout, "%hhd", ((int8_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_UINT16: + fprintf(fout, "%hu", ((uint16_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_INT16: + fprintf(fout, "%hd", ((int16_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_UINT32: + fprintf(fout, "%u", ((uint32_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_INT32: + fprintf(fout, "%d", ((int32_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_UINT64: + fprintf( + fout, "%llu", (unsigned long long)((uint64_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_INT64: + fprintf(fout, "%lld", (long long)((int64_t *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_FLOAT16: + fprintf(fout, "%f", om_f16_to_f32(((uint16_t *)dataPtr)[elemOffset])); + break; + case ONNX_TYPE_FLOAT: + fprintf(fout, "%f", ((float *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_DOUBLE: + fprintf(fout, "%f", ((double *)dataPtr)[elemOffset]); + break; + case ONNX_TYPE_STRING: + fprintf(fout, "%s", ((const char **)dataPtr)[elemOffset]); + break; + default: + assert(false && "unexpected data type"); } +} + +/* Helper function to print the data of a tensor type */ +static void printData(FILE *fout, const OMTensor *tensor) { +/* Helper macros to print data for 1-4D tensors */ #define LOOP_1(INDEX, IV, UB) \ - printf("["); \ + fprintf(fout, "["); \ for (int64_t IV = 0; IV < UB; ++IV) { \ if (IV) \ - printf(", "); \ + fprintf(fout, ", "); \ indexes[INDEX] = IV; \ int64_t elemOffset = computeElemOffset(tensor->_strides, indexes, rank); \ - printElement(dataPtr, elemOffset, dataType); \ + printElement(fout, dataPtr, elemOffset, dataType); \ } \ - printf("]"); + fprintf(fout, "]"); #define LOOP_2(INDEX, IV, UB, ...) \ - printf("["); \ + fprintf(fout, "["); \ for (int64_t IV = 0; IV < UB; ++IV) { \ if (IV) \ - printf(", "); \ + fprintf(fout, ", "); \ indexes[INDEX] = IV; \ LOOP_1(INDEX + 1, __VA_ARGS__) \ } \ - printf("]"); + fprintf(fout, "]"); #define LOOP_3(INDEX, IV, UB, ...) \ - printf("["); \ + fprintf(fout, "["); \ for (int64_t IV = 0; IV < UB; ++IV) { \ if (IV) \ - printf(", "); \ + fprintf(fout, ", "); \ indexes[INDEX] = IV; \ LOOP_2(INDEX + 1, __VA_ARGS__) \ } \ - printf("]"); + fprintf(fout, "]"); #define LOOP_4(INDEX, IV, UB, ...) 
\ - printf("["); \ + fprintf(fout, "["); \ for (int64_t IV = 0; IV < UB; ++IV) { \ if (IV) \ - printf(", "); \ + fprintf(fout, ", "); \ indexes[INDEX] = IV; \ LOOP_3(INDEX + 1, __VA_ARGS__) \ } \ - printf("]"); - - if (dataInfo) { - printf("\tdata: ("); - switch (rank) { - case 0: - printElement(dataPtr, 0, dataType); - break; - case 1: { - int64_t indexes[1]; - LOOP_1(0, i, shape[0]) - } break; - case 2: { - int64_t indexes[2]; - LOOP_2(0, i, shape[0], j, shape[1]) - } break; - case 3: { - int64_t indexes[3]; - LOOP_3(0, i, shape[0], j, shape[1], k, shape[2]) - } break; - case 4: { - int64_t indexes[4]; - LOOP_4(0, i, shape[0], j, shape[1], k, shape[2], l, shape[3]) - } break; - default: - assert(false && "not implemented"); + fprintf(fout, "]"); + + const OM_DATA_TYPE dataType = omTensorGetDataType(tensor); + const int64_t rank = omTensorGetRank(tensor); + const int64_t *shape = omTensorGetShape(tensor); + void *dataPtr = omTensorGetDataPtr(tensor); + + fprintf(fout, "\tdata: ("); + switch (rank) { + case 0: + printElement(fout, dataPtr, 0, dataType); + break; + case 1: { + int64_t indexes[1]; + LOOP_1(0, i, shape[0]) + } break; + case 2: { + int64_t indexes[2]; + LOOP_2(0, i, shape[0], j, shape[1]) + } break; + case 3: { + int64_t indexes[3]; + LOOP_3(0, i, shape[0], j, shape[1], k, shape[2]) + } break; + case 4: { + int64_t indexes[4]; + LOOP_4(0, i, shape[0], j, shape[1], k, shape[2], l, shape[3]) + } break; + default: + assert(false && "not implemented"); + } + fprintf(fout, ")\n"); +} + +/* Defined in OMInstrument.inc, return stdout or the FILE handle to where + * instrumentation should be printed. */ +FILE *getInstrumentFile(); + +/* Print a string and possibly some info about a tensor. With `%d`, we print the +tensor data; with `s`, we print an extended signature; with `t`, we print the +type of the tensor (e.g. 32 x 16 x float) + +There are two other unpublished modifiers if the message starts with `%i`, this +signal an instrumentation message that may be redirected to the file name given +by ONNX_MLIR_INSTRUMENT_FILE. Additionally, there is a `%e` modifier that +signals the end of the message string. This is unfortunately needed because of +current issues with the termination of strings. +*/ +void omTensorPrint(const char *msg, const OMTensor *tensor) { + /* Process the message string. */ + int64_t len = strlen(msg); + FILE *fout = stdout; + /* Check if this is an instrumentation print, in which case we use the + * instrument file instead of stdout. */ + if (len >= 2 && msg[0] == '%' && msg[1] == 'i') { + fout = getInstrumentFile(); + /* Skip format. */ + msg += 2; + len -= 2; + } + while (len > 0) { + if (msg[0] == '%') { + if (len < 2) { + fprintf(fout, "omTensorPrint: orphan `%%`, skip.\n"); + return; + } + if (msg[1] == 'd') { /* Letter `d` for data. */ + assert(tensor && "attempt to print a null OMTensor"); + printData(fout, tensor); + } else if (msg[1] == 's') { /* Letter `s` for signature. */ + assert(tensor && "attempt to print a null OMTensor"); + printSignature(fout, tensor); + } else if (msg[1] == 't') { /* Letter `t` for type only. */ + assert(tensor && "attempt to print a null OMTensor"); + printType(fout, tensor); + } else if (msg[1] == 'e') { /* Letter `e` for end. */ + return; + } else { + fprintf(fout, "omTensorPrint: unknown format `%c`, skip.\n", msg[1]); + } + /* Skip the format when printing message. */ + msg += 2; + len -= 2; + continue; } - printf(")\n"); + /* No format, just print letter. 
*/ + fprintf(fout, "%c", msg[0]); + msg++; + len--; } } diff --git a/src/Transform/ONNX/InstrumentONNXSignaturePass.cpp b/src/Transform/ONNX/InstrumentONNXSignaturePass.cpp index 82c4f21a7e..94b4bc8449 100644 --- a/src/Transform/ONNX/InstrumentONNXSignaturePass.cpp +++ b/src/Transform/ONNX/InstrumentONNXSignaturePass.cpp @@ -28,6 +28,7 @@ #include "src/Dialect/Krnl/DialectBuilder.hpp" #include "src/Dialect/Krnl/KrnlOps.hpp" #include "src/Dialect/ONNX/ONNXOps.hpp" +#include "src/Dialect/ONNX/ONNXOps/OpHelper.hpp" #include "src/Interface/ShapeInferenceOpInterface.hpp" #include "src/Pass/Passes.hpp" @@ -70,20 +71,18 @@ class InstrumentONNXSignaturePass if (!isa(op)) { Location loc = op->getLoc(); OpBuilder builder(op); - ValueRange operands = op->getOperands(); - std::string opName( - op->getName().getStringRef().data() + 5); // Skip "onnx.". - StringAttr opNameAttr = builder.getStringAttr(opName); - if (isa(op)) { - // Constant has the type in the output, so use it. - operands = op->getResults(); - // Since we use the result of the constant operation, we must insert - // the print operation after the constant operation. - builder.setInsertionPointAfter(op); - builder.create(loc, opNameAttr, operands); - } else if (operands.size() > 0) { - builder.create(loc, opNameAttr, operands); - } + std::string opName = op->getName().getStringRef().str(); + std::string nodeName = onnx_mlir::getNodeNameInPresenceOfOpt(op); + std::string fullName = opName + ", " + nodeName; + StringAttr fullNameAttr = builder.getStringAttr(fullName); + // Enqueue all input operands, and then the results. + llvm::SmallVector operAndRes(op->getOperands()); + for (Value res : op->getResults()) + operAndRes.emplace_back(res); + // Since we may use the result of an operation, we must insert the + // print operation after the operation. + builder.setInsertionPointAfter(op); + builder.create(loc, fullNameAttr, operAndRes); } } }); diff --git a/utils/make-report.py b/utils/make-report.py index e5908a61a1..cf12ae86a7 100755 --- a/utils/make-report.py +++ b/utils/make-report.py @@ -6,14 +6,26 @@ # # Copyright 2023 The IBM Research Authors. # -################################################################################ +############################################################################# # -# This file scan -opt-report=* and process it +# This file scan --opt-report=*, --profile-ir, --instrument-onnx-signature +# and process it. # -# For patterns, see src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp, +# For patterns, see src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp, # impl::onnxToKrnlParallelReport(...) and impl::onnxToKrnlSimdReport(...) # -################################################################################ +# Format for stats: +# '==''==,' ',' ',' +# {',' } +# +# where may have a '-simd' or '-par' suffix to indicate +# the success of that pass. 
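# For instance (an illustrative line; the op, node name, and shapes are made
# up), a signature stat captured at runtime reads:
#   ==SIG-REPORT==, onnx.Relu, relu_1, sig, 1x64x112x112xfloat, 1x64x112x112xfloat
# that is: op name, node name, the fixed secondary key 'sig', then one type per
# operand and result. SIMD and PAR stats start with the same op-name and
# node-name fields, followed by their pass-specific message and values.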
+# +# Format of perf: +# '==PERF==,' ',' ',' ',' +# ',' +# +############################################################################# import sys import getopt @@ -23,8 +35,8 @@ def print_usage(msg = ""): if msg: print("Error:", msg, "\n") - print("make-report.py -[svh] [-c ] [-r ] [-l ]") - print(" [--sort ] [-u ] [-p ]") + print("make-report.py -[vh] [-c ] [-r ] [-l ]") + print(" [-s ] [--sort ] [--supported] [-u ] [-p ]") print("") print("Usage: Report statistics on compiler and runtime characteristics of onnx ops.") print("") @@ -43,20 +55,29 @@ def print_usage(msg = ""): print(" Use `-l 3` to correlate the node name printed here with compiler output.") print("") print("Parameters:") - print(" -c/--compile : File name containing the compile time statistics.") - print(" -r/--runtime : File name containing the runtime statistics.") - print(" -l/--level : Print statistics:") - print(" 0: Just count successful/unsuccessful ops.") + print(" -c/--compile : File name containing the compile-time statistics") + print(" or runtime signature statistics.") + print(" -r/--runtime : File name containing the runtime time statistics.") + print(" -a/--stats : Print specific statistics:") + print(" simd: Print simd optimization stats.") + print(" Default if a compile time file is given.") + print(" par: Print parallel optimization stats.") + print(" sig: Print signatures of op.") + print(" perf: Print runtime execution time of ops.") + print(" Default if no compile time file is given.") + print(" -l/--level : Print detailed level of statistics:") + print(" 0: Just count successful/unsuccessful ops (default).") print(" 1: Also count reasons for success/failure.") print(" 2: Also list metrics.") print(" 3: Also list node name.") print(" -f/--focus : Focus only on ops that match the regexp pattern.") - print(" -s/supported: Focus only on ops that are supported. Namely, the report") + print(" -supported: Focus only on ops that are supported. Namely, the report") print(" will skip ops for which compile-time statistics list") print(" the 'unsupported' keyword in its printout.") print(" For SIMD/parallel statistics, this include all ops that") print(" have currently no support for it.") - print(" -u/--unit : Time in second ('s'), millisecond ('ms') or microsecond ('us).") + print(" -u/--unit : Time in second ('s', default), millisecond ('ms') or") + print(" microsecond ('us).") print(" --sort : Sort output by op 'name', occurrence 'num' or `time`.") print(" -v/--verbose: Run in verbose mode (see error and warnings).") print(" -h/--help: Print usage.") @@ -92,10 +113,11 @@ def common_report_str(stat_name): return r'^==' + stat_name + r'-REPORT==,\s*([0-9a-zA-Z\.\-]+)\s*,\s*([^,]*),\s*(.*)' # ==SIMD-REPORT==, ..., , , -simd_stat_message = "message, SIMD vector length (in elements), SIMD loop trip count (-1 is runtime)" +simd_legend = "message, SIMD vector length (in elements), SIMD loop trip count (-1 is runtime)" +sig_legend = "comma separated list of shapes with inputs followed by results" # ==PERF-REPORT==, ..., "before" | "after", time since last call, absolute time -perf_stat_message = "(after|before), time for op(s), time since start(s)" +perf_legend = "(after|before), time for op(s), time since start(s)" ################################################################################ # # Support. 
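A usage sketch for the statistics switch documented above (the log file names are hypothetical): 'make-report.py -c compile.log -s simd -l 3' summarizes SIMD successes and failures per op from a compile-time report, 'make-report.py -c sig_run.log -s sig' summarizes runtime signature lines, and 'make-report.py -r run.log -u ms --sort time' reports per-op execution times, perf being the default when no compile-time file is given.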
@@ -307,6 +329,14 @@ def get_percent(n, d): return 0.0 return n * 100 / d +def get_sorting_key(count, name, time): + global sorting_preference + if sorting_preference == "num": + key = - count + if sorting_preference == "name": + return name + return - time + def make_report(stat_message): global op_count_dict, op_detail_count_dict global op_time_dict, op_detail_time_dict, tot_time @@ -326,33 +356,40 @@ def make_report(stat_message): count_time_str += ", {:.7f}".format(time * time_unit) count_time_str += ", {:.1f}%".format(get_percent(time, tot_time)) output = " " + op + ", " + count_time_str - if sorting_preference == "name": - key = op - elif sorting_preference == "num": - key = - count - else: - key = - time if report_level: det_dict = op_detail_count_dict[op] det_time_dict = {} + sorted_det_output = {} if op in op_detail_time_dict: det_time_dict = op_detail_time_dict[op] - for det_key in sorted(det_dict): + for det_key in det_dict: det_count = det_dict[det_key] if det_count == count: count_time_str = "*" else: count_time_str = str(det_count) + det_time = 0 if det_key in det_time_dict: det_time = np.sum(det_time_dict[det_key]) count_time_str += ", {:.7f}".format(det_time * time_unit / det_count) count_time_str += ", {:.7f}".format(det_time * time_unit) count_time_str += ", {:.1f}%".format(get_percent(det_time, time)) - output += "\n " + count_time_str + ": " + det_key - if key in sorted_output: - sorted_output[key] = sorted_output[key] + "\n" + output + + det_output = "\n " + count_time_str + ": " + det_key + det_output_key = get_sorting_key(det_count, det_key, det_time) + if det_output_key in sorted_det_output: + sorted_det_output[det_output_key] += det_output + else: + sorted_det_output[det_output_key] = det_output + for key in sorted(sorted_det_output): + output += sorted_det_output[key] + + # add output to sorted_output + output_key = get_sorting_key(count, op, time) + if output_key in sorted_output: + sorted_output[output_key] += "\n" + output else: - sorted_output[key] = output + sorted_output[output_key] = output # Print legend and stats. num_desc = "num" @@ -409,11 +446,13 @@ def main(argv): compile_file_name = "" runtime_file_name = "" + make_stats = "" + make_legend = "" try: opts, args = getopt.getopt( - argv, "c:f:hl:r:su:v", - ["compile=", "focus=", "help", "level=", "runtime=", "supported", - "sort=", "unit=", "verbose"]) + argv, "c:f:hl:r:s:u:v", + ["compile=", "focus=", "help", "level=", "runtime=", "stats=" + "sort=", "supported", "unit=", "verbose"]) except getopt.GetoptError: print_usage("Failure to parse inputs") for opt, arg in opts: @@ -431,7 +470,22 @@ def main(argv): print_usage("detail levels are 0, 1, 2, or 3") elif opt in ('-r', "--runtime"): runtime_file_name = arg - elif opt in ('-s', "--supported"): + elif opt in ("-s", "--stats"): + if re.match(r'\s*par\s*', arg): + make_stats = "PAR" + make_legend = "none" + elif re.match(r'\s*perf\s*', arg): + make_stats = "PERF" + make_legend = perf_legend + elif re.match(r'\s*sig?\s*', arg): + make_stats = "SIG" + make_legend = sig_legend + elif re.match(r'\s*simd\s*', arg): + make_stats = "SIMD" + make_legend = simd_legend + else: + print_usage("statistics options are 'par', 'signature', or 'simd'") + elif opt in ("--supported"): supported_only = True elif opt in ("--sort"): if re.match(r'\s*name\s*', arg): @@ -454,6 +508,17 @@ def main(argv): elif opt in ('-v', "--verbose"): verbose = True + # Default stats. + if not make_stats: + if compile_file_name: + # Default for compile analysis. 
+ make_stats = "SIMD" + make_legend = simd_legend + else: + # Default for perf only (no compile) + make_stats = "PERF" + make_legend = perf_legend + print("Analyse", make_stats) # Default sorting preference. if not sorting_preference: if runtime_file_name: @@ -462,15 +527,15 @@ def main(argv): sorting_preference = "name" if compile_file_name and runtime_file_name: parse_file_for_perf(runtime_file_name, "PERF") - parse_file_for_stat(compile_file_name, "SIMD") - make_report(simd_stat_message) + parse_file_for_stat(compile_file_name, make_stats) + make_report(make_legend) elif compile_file_name: - parse_file_for_stat(compile_file_name, "SIMD") - make_report(simd_stat_message) + parse_file_for_stat(compile_file_name, make_stats) + make_report(make_legend) elif runtime_file_name: parse_file_for_perf(runtime_file_name, "PERF") parse_file_for_stat(runtime_file_name, "PERF") - make_report(perf_stat_message) + make_report(make_legend) else: print_usage("Command requires an input file name (compile/runtime or both).\n")
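End to end, the pieces in this patch fit together as follows: the signature instrumentation pass tags each ONNXPrintSignatureOp with an 'op name, node name' label, the lowering turns it into an omTensorPrint call whose message starts with '%i==SIG-REPORT==', the runtime routes '%i' messages to the file named by ONNX_MLIR_INSTRUMENT_FILE (stdout by default), and make-report.py aggregates the resulting lines with '-s sig'. The sketch below imitates that protocol by hand; it is illustrative only (onnx.Relu, relu_1, the 2x2 tensor, and signatures.log are made up), assumes the public omTensorCreate/omTensorDestroy API, and uses POSIX setenv.

    #include <stdint.h>
    #include <stdlib.h>
    #include "onnx-mlir/Runtime/OMTensor.h"

    int main() {
      /* Set before the first instrumented print: getInstrumentFile() reads the
       * variable once and caches the resulting FILE handle. */
      setenv("ONNX_MLIR_INSTRUMENT_FILE", "signatures.log", /*overwrite=*/1);

      float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      int64_t shape[2] = {2, 2};
      OMTensor *t = omTensorCreate(data, shape, 2, ONNX_TYPE_FLOAT);

      /* Same message shape as the one built by ONNXPrintSignatureLowering:
       * '%i' selects the instrumentation stream, '%t' prints the type of t,
       * '%e' terminates the message. */
      omTensorPrint("%i==SIG-REPORT==, onnx.Relu, relu_1, sig, %t\n%e", t);

      omTensorDestroy(t);
      return 0;
    }

Running 'make-report.py -c signatures.log -s sig' over the produced file then groups these lines per op.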