Skip to content

Commit

Permalink
Fix compiler error when onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS is enabled (#20889)
Browse files Browse the repository at this point in the history

### Description
The recent [PR for int4
support](#20362) breaks
builds with the onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS option enabled.

This PR adds utility functions for debug printing of int4 tensor
statistics and data.



### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
  • Loading branch information
adrianlizarraga authored Jun 1, 2024
1 parent 50ee1b0 commit 5ec7ac8
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 1 deletion.
36 changes: 35 additions & 1 deletion include/onnxruntime/core/framework/int4.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,21 @@ struct Int4x2Base {
return (num_int4_elems + 1) / 2;
}

/// <summary>
/// Copy a source buffer of 4-bit elements (packed) into a destination buffer of 8-bit elements (unpacked).
/// </summary>
/// <param name="dst">Destination buffer to store unpacked 8-bit elements</param>
/// <param name="src">Source buffer with 4-bit elements</param>
/// <returns>True on success</returns>
static bool Unpack(gsl::span<UnpackedType> dst, gsl::span<const Int4x2Base<Signed>> src) {
if (CalcNumInt4Pairs(dst.size()) != src.size()) {
return false;
}

if (src.empty()) {
return true;
}

for (size_t i = 0; i < dst.size(); i++) {
size_t r = i >> 1; // i / 2;
size_t c = i & 0x1; // i % 2;
Expand All @@ -98,11 +108,21 @@ struct Int4x2Base {
return true;
}

/// <summary>
/// Copy a source buffer of 8-bit elements (unpacked) into a destination buffer of 4-bit elements (packed).
/// </summary>
/// <param name="dst">Destination buffer to store packed 4-bit elements</param>
/// <param name="src">Source buffer with 8-bit elements</param>
/// <returns>True on success</returns>
static bool Pack(gsl::span<Int4x2Base<Signed>> dst, gsl::span<const UnpackedType> src) {
if (src.empty() || (CalcNumInt4Pairs(src.size()) != dst.size())) {
if (CalcNumInt4Pairs(src.size()) != dst.size()) {
return false;
}

if (src.empty()) {
return true;
}

size_t src_i = 0;
size_t dst_i = 0;

Expand All @@ -116,6 +136,20 @@ struct Int4x2Base {

return true;
}

/// <summary>
/// Returns hierarchical indices for a packed int4 element from the given element index.
///
/// Usage:
/// Int4x2* data = ...;
/// auto indices = GetTensorElemIndices(3); // 4th int4 element
/// int8_t elem = data[indices.first].GetElem(indices.second);
/// </summary>
/// <param name="index">Index of 4-bit element</param>
/// <returns>Pair of indices: .first is the index of the packed pair that stores the
/// element (index / 2), .second is the element's position within that pair
/// (index % 2, i.e. 0 or 1) suitable for passing to GetElem</returns>
static inline std::pair<size_t, size_t> GetTensorElemIndices(size_t index) {
  return {index >> 1, index & 0x1};
}
};

using Int4x2 = Int4x2Base<true>;
Expand Down
27 changes: 27 additions & 0 deletions onnxruntime/core/framework/print_tensor_statistics_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,33 @@ void PrintCommonStats(const T* data, size_t count) {
PrintValue(max);
}

// Computes and prints "Min=<v>,Max=<v>" statistics for a tensor of packed 4-bit
// elements. INT4_TYPE packs two 4-bit values per element, so logical element i is
// read from (data[i / 2], nibble i % 2) via INT4_TYPE::GetTensorElemIndices.
// Implemented as a macro so a single body can specialize PrintCommonStats for both
// the signed and unsigned int4 types below.
// NOTE(review): data[0] is read unconditionally, so count >= 1 is assumed — same
// precondition as the generic PrintCommonStats template; confirm callers never pass 0.
// NOTE(review): count appears to be the number of logical 4-bit elements, not the
// number of packed INT4_TYPE elements — verify against the call site.
#define DEF_PRINT_COMMON_STATS_INT4(INT4_TYPE)                                   \
  template <>                                                                    \
  inline void PrintCommonStats<INT4_TYPE>(const INT4_TYPE* data, size_t count) { \
    using UnpackedType = typename INT4_TYPE::UnpackedType;                       \
    UnpackedType min = data[0].GetElem(0);                                       \
    UnpackedType max = min;                                                      \
    for (size_t i = 1; i < count; i++) {                                         \
      auto indices = INT4_TYPE::GetTensorElemIndices(i);                         \
      auto value = data[indices.first].GetElem(indices.second);                  \
      if (value > max) {                                                         \
        max = value;                                                             \
      }                                                                          \
      if (value < min) {                                                         \
        min = value;                                                             \
      }                                                                          \
    }                                                                            \
                                                                                 \
    std::cout << "Min=";                                                         \
    PrintValue(min);                                                             \
                                                                                 \
    std::cout << ",Max=";                                                        \
    PrintValue(max);                                                             \
  }

// Specializations for the signed and unsigned packed int4 tensor types.
DEF_PRINT_COMMON_STATS_INT4(Int4x2)
DEF_PRINT_COMMON_STATS_INT4(UInt4x2)

template <typename T>
void PrintHalfStats(const T* data, size_t count) {
float min = data[0].ToFloat();
Expand Down
93 changes: 93 additions & 0 deletions onnxruntime/core/framework/print_tensor_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,29 @@ void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t
std::cout << std::endl;
}

// INT4 - Print snippet of 2D tensor with shape (dim0, dim1).
// Mirrors the generic PrintCpuTensorSnippet<T> above, but the buffer holds packed
// 4-bit pairs: logical element (i, j) at flat index i * dim1 + j is located with
// INT4_TYPE::GetTensorElemIndices and unpacked with GetElem before printing.
// SKIP_NON_EDGE_ITEMS / SKIP_NON_EDGE_ITEMS_LAST_DIM (defined earlier in this
// header) restrict output to the edge items of each dimension.
// NOTE(review): dim0/dim1 count logical 4-bit elements; `tensor` must hold
// (dim0 * dim1 + 1) / 2 packed elements — confirm at the call site.
#define DEF_PRINT_CPU_TENSOR_SNIPPET_2D_INT4(INT4_TYPE)                                             \
  template <>                                                                                       \
  inline void PrintCpuTensorSnippet<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1, \
                                               int64_t edge_items) {                                \
    for (int64_t i = 0; i < dim0; i++) {                                                            \
      SKIP_NON_EDGE_ITEMS(dim0, i, edge_items);                                                     \
      auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1));                \
      PrintValue(tensor[indices.first].GetElem(indices.second));                                    \
      for (int64_t j = 1; j < dim1; j++) {                                                          \
        SKIP_NON_EDGE_ITEMS_LAST_DIM(dim1, j, edge_items);                                          \
        std::cout << ", ";                                                                          \
        indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 + j));               \
        PrintValue(tensor[indices.first].GetElem(indices.second));                                  \
      }                                                                                             \
      std::cout << std::endl;                                                                       \
    }                                                                                               \
    std::cout << std::endl;                                                                        \
  }

DEF_PRINT_CPU_TENSOR_SNIPPET_2D_INT4(Int4x2)
DEF_PRINT_CPU_TENSOR_SNIPPET_2D_INT4(UInt4x2)

// Print snippet of 3D tensor with shape (dim0, dim1, dim2)
template <typename T>
void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim2, int64_t edge_items) {
Expand All @@ -95,6 +118,33 @@ void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t
std::cout << std::endl;
}

// INT4 - Print snippet of 3D tensor with shape (dim0, dim1, dim2).
// Mirrors the generic 3D PrintCpuTensorSnippet<T> above for packed 4-bit data:
// logical element (i, j, k) at flat index i * dim1 * dim2 + j * dim2 + k is
// located with INT4_TYPE::GetTensorElemIndices and unpacked with GetElem.
// SKIP_NON_EDGE_ITEMS / SKIP_NON_EDGE_ITEMS_LAST_DIM restrict output to edge items.
// NOTE(review): dims count logical 4-bit elements; `tensor` must hold
// (dim0 * dim1 * dim2 + 1) / 2 packed elements — confirm at the call site.
#define DEF_PRINT_CPU_TENSOR_SNIPPET_3D_INT4(INT4_TYPE)                                                        \
  template <>                                                                                                  \
  inline void PrintCpuTensorSnippet<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1,            \
                                               int64_t dim2, int64_t edge_items) {                             \
    for (int64_t i = 0; i < dim0; i++) {                                                                       \
      SKIP_NON_EDGE_ITEMS(dim0, i, edge_items);                                                                \
      for (int64_t j = 0; j < dim1; j++) {                                                                     \
        SKIP_NON_EDGE_ITEMS(dim1, j, edge_items);                                                              \
        auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2));       \
        PrintValue(tensor[indices.first].GetElem(indices.second));                                             \
        for (int64_t k = 1; k < dim2; k++) {                                                                   \
          SKIP_NON_EDGE_ITEMS_LAST_DIM(dim2, k, edge_items);                                                   \
          std::cout << ", ";                                                                                   \
          indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2 + k));      \
          PrintValue(tensor[indices.first].GetElem(indices.second));                                           \
        }                                                                                                      \
        std::cout << std::endl;                                                                                \
      }                                                                                                        \
      std::cout << std::endl;                                                                                  \
    }                                                                                                          \
    std::cout << std::endl;                                                                                    \
  }

DEF_PRINT_CPU_TENSOR_SNIPPET_3D_INT4(Int4x2)
DEF_PRINT_CPU_TENSOR_SNIPPET_3D_INT4(UInt4x2)

// Print 2D tensor
template <typename T>
void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1) {
Expand All @@ -109,6 +159,26 @@ void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1) {
std::cout << std::endl;
}

// INT4 - Print 2D tensor (all elements, no edge-item elision).
// Mirrors the generic 2D PrintCpuTensorFull<T> above for packed 4-bit data:
// logical element (i, j) at flat index i * dim1 + j is located with
// INT4_TYPE::GetTensorElemIndices and unpacked with GetElem before printing.
// NOTE(review): dim0/dim1 count logical 4-bit elements; `tensor` must hold
// (dim0 * dim1 + 1) / 2 packed elements — confirm at the call site.
#define DEF_PRINT_CPU_TENSOR_FULL_2D_INT4(INT4_TYPE)                                                 \
  template <>                                                                                        \
  inline void PrintCpuTensorFull<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1) {   \
    for (int64_t i = 0; i < dim0; i++) {                                                             \
      auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1));                 \
      PrintValue(tensor[indices.first].GetElem(indices.second));                                     \
      for (int64_t j = 1; j < dim1; j++) {                                                           \
        std::cout << ", ";                                                                           \
        indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 + j));                \
        PrintValue(tensor[indices.first].GetElem(indices.second));                                   \
      }                                                                                              \
      std::cout << std::endl;                                                                        \
    }                                                                                                \
    std::cout << std::endl;                                                                          \
  }

DEF_PRINT_CPU_TENSOR_FULL_2D_INT4(Int4x2)
DEF_PRINT_CPU_TENSOR_FULL_2D_INT4(UInt4x2)

// Print 3D tensor
template <typename T>
void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim2) {
Expand All @@ -126,6 +196,29 @@ void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim
std::cout << std::endl;
}

// INT4 - Print 3D tensor (all elements, no edge-item elision).
// Mirrors the generic 3D PrintCpuTensorFull<T> above for packed 4-bit data:
// logical element (i, j, k) at flat index i * dim1 * dim2 + j * dim2 + k is
// located with INT4_TYPE::GetTensorElemIndices and unpacked with GetElem.
// NOTE(review): dims count logical 4-bit elements; `tensor` must hold
// (dim0 * dim1 * dim2 + 1) / 2 packed elements — confirm at the call site.
#define DEF_PRINT_CPU_TENSOR_FULL_3D_INT4(INT4_TYPE)                                                           \
  template <>                                                                                                  \
  inline void PrintCpuTensorFull<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1,               \
                                            int64_t dim2) {                                                    \
    for (int64_t i = 0; i < dim0; i++) {                                                                       \
      for (int64_t j = 0; j < dim1; j++) {                                                                     \
        auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2));       \
        PrintValue(tensor[indices.first].GetElem(indices.second));                                             \
        for (int64_t k = 1; k < dim2; k++) {                                                                   \
          std::cout << ", ";                                                                                   \
          indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2 + k));      \
          PrintValue(tensor[indices.first].GetElem(indices.second));                                           \
        }                                                                                                      \
        std::cout << std::endl;                                                                                \
      }                                                                                                        \
      std::cout << std::endl;                                                                                  \
    }                                                                                                          \
    std::cout << std::endl;                                                                                    \
  }

DEF_PRINT_CPU_TENSOR_FULL_3D_INT4(Int4x2)
DEF_PRINT_CPU_TENSOR_FULL_3D_INT4(UInt4x2)

template <typename T>
void PrintCpuTensor(const Tensor& tensor, int threshold = kDefaultSnippetThreshold, int edge_items = kDefaultSnippetEdgeItems) {
const auto& shape = tensor.Shape();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,34 @@ void VerifyTensorProtoFileData(const PathString& tensor_proto_path, gsl::span<co

ASSERT_EQ(gsl::span<const T>(actual_data), expected_data);
}

// Reads a serialized TensorProto of packed int4 data from `tensor_proto_path`,
// unpacks it, and asserts that it matches `expected_data` element-wise.
// `shape` gives the logical tensor shape; the number of logical 4-bit elements is
// the product of its dims, while the packed buffers hold ceil(num_elems / 2) pairs.
template <bool Signed>
void VerifyTensorProtoFileDataInt4(const PathString& tensor_proto_path,
                                   gsl::span<const Int4x2Base<Signed>> expected_data,
                                   gsl::span<const int64_t> shape) {
  // Number of logical 4-bit elements (two are packed per Int4x2Base element).
  size_t num_elems = 1;
  for (auto dim_val : shape) {
    num_elems *= static_cast<size_t>(dim_val);
  }

  // Open in binary mode: the file holds a serialized protobuf, and text-mode
  // newline translation (e.g., on Windows) would corrupt the parse.
  std::ifstream tensor_proto_stream{tensor_proto_path, std::ios::binary};
  ASSERT_TRUE(tensor_proto_stream.good());  // fail fast if the dump file is missing

  ONNX_NAMESPACE::TensorProto tensor_proto{};
  ASSERT_TRUE(tensor_proto.ParseFromIstream(&tensor_proto_stream));

  std::vector<Int4x2Base<Signed>> actual_data{};
  actual_data.resize(expected_data.size());
  ASSERT_STATUS_OK(utils::UnpackTensor(tensor_proto, Path{}, actual_data.data(), num_elems));

  ASSERT_EQ(actual_data.size(), expected_data.size());

  // Compare at 4-bit granularity: only the first num_elems nibbles are checked, so
  // the unused padding nibble of an odd-length tensor does not affect the result.
  for (size_t i = 0; i < num_elems; i++) {
    auto indices = Int4x2Base<Signed>::GetTensorElemIndices(i);
    auto actual_val = actual_data[indices.first].GetElem(indices.second);
    auto expected_val = expected_data[indices.first].GetElem(indices.second);
    ASSERT_EQ(actual_val, expected_val);
  }
}
} // namespace

namespace env_vars = utils::debug_node_inputs_outputs_env_vars;
Expand Down Expand Up @@ -72,5 +100,53 @@ TEST(DebugNodeInputsOutputs, BasicFileOutput) {
tester.Run();
}

// Verifies that INT4 input and output tensors are dumped to .tensorproto files
// with their packed payloads intact.
TEST(DebugNodeInputsOutputs, FileOutput_Int4) {
  TemporaryDirectory temp_dir{ORT_TSTR("debug_node_inputs_outputs_utils_test")};
  ScopedEnvironmentVariables scoped_env_vars{
      EnvVarMap{
          {env_vars::kDumpInputData, "1"},
          {env_vars::kDumpOutputData, "1"},
          {env_vars::kNameFilter, nullopt},
          {env_vars::kOpTypeFilter, nullopt},
          {env_vars::kDumpDataDestination, "files"},
          {env_vars::kAppendRankToFileName, nullopt},
          {env_vars::kOutputDir, ToUTF8String(temp_dir.Path())},
          {env_vars::kDumpingDataToFilesForAllNodesIsOk, "1"},
      }};

  // 15 logical int4 values packed two-per-element; the final nibble is padding.
  constexpr int8_t pad = 0;
  const std::vector<int64_t> x_shape({5, 3});
  const std::vector<Int4x2> x_vals = {Int4x2(1, 2), Int4x2(3, 4), Int4x2(5, 6), Int4x2(7, 8),
                                      Int4x2(9, 10), Int4x2(11, 12), Int4x2(13, 14), Int4x2(15, pad)};

  // Transpose with perm {1, 0}: y[j][i] == x[i][j].
  const std::vector<int64_t> perm = {1, 0};
  const std::vector<int64_t> y_shape({3, 5});
  const std::vector<Int4x2> y_vals = {Int4x2(1, 4), Int4x2(7, 10), Int4x2(13, 2), Int4x2(5, 8),
                                      Int4x2(11, 14), Int4x2(3, 6), Int4x2(9, 12), Int4x2(15, pad)};

  OpTester tester{"Transpose", 21, kOnnxDomain};
  tester.AddAttribute("perm", perm);
  tester.AddInput<Int4x2>("x", x_shape, x_vals);
  tester.AddOutput<Int4x2>("y", y_shape, y_vals);

  // After the run, the dumped files must round-trip the exact packed int4
  // payloads for both the input tensor and the output tensor.
  auto check_dumped_files = [&](const std::vector<OrtValue>& fetches,
                                const std::string& /*provider_type*/) {
    ASSERT_EQ(fetches.size(), 1u);
    fetches[0].Get<Tensor>();  // sanity check: the fetch holds a tensor
    VerifyTensorProtoFileDataInt4(temp_dir.Path() + ORT_TSTR("/x.tensorproto"),
                                  gsl::make_span(x_vals), gsl::make_span(x_shape));
    VerifyTensorProtoFileDataInt4(temp_dir.Path() + ORT_TSTR("/y.tensorproto"),
                                  gsl::make_span(y_vals), gsl::make_span(y_shape));
  };

  tester.SetCustomOutputVerifier(check_dumped_files);
  tester.Run();
}

} // namespace test
} // namespace onnxruntime

0 comments on commit 5ec7ac8

Please sign in to comment.