Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added mlir->LLVMIR conversion for tanh op using LUT #1718

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions aie_runtime_lib/AIE2/lut_based_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ alignas(aie::vector_decl_align) unsigned char m_inv_lut[128] = {

// Tanh look up tables: Divides into 32 segments between [-4,4], bank size:
// (32*2*2*4)*2=1k, one lut=512B
float chess_storage(% chess_alignof(v32int8)) tanh_lut_ab[128] = {
alignas(aie::vector_decl_align) float tanh_lut_ab[128] = {
0.00000000000000000000000000000000, -1.00000000000000000000000000000000,
0.00283813476562500000000000000000, -0.98828125000000000000000000000000,
0.00000000000000000000000000000000, -1.00000000000000000000000000000000,
Expand Down Expand Up @@ -295,7 +295,7 @@ float chess_storage(% chess_alignof(v32int8)) tanh_lut_ab[128] = {
0.00000000000000000000000000000000, 1.00000000000000000000000000000000,
};

float chess_storage(% chess_alignof(v32int8)) tanh_lut_cd[128] = {
alignas(aie::vector_decl_align) float tanh_lut_cd[128] = {
0.00000000000000000000000000000000, -1.00000000000000000000000000000000,
0.00283813476562500000000000000000, -0.98828125000000000000000000000000,
0.00000000000000000000000000000000, -1.00000000000000000000000000000000,
Expand Down
4 changes: 2 additions & 2 deletions aie_runtime_lib/AIE2/lut_based_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ __attribute__((always_inline)) bfloat16 getInvBf16(float x) {
return *inv_x;
}

extern float tanh_lut_ab[];
extern float tanh_lut_cd[];
alignas(aie::vector_decl_align) extern float tanh_lut_ab[128];
alignas(aie::vector_decl_align) extern float tanh_lut_cd[128];
Comment on lines +84 to +85
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any benefit to having the alignment and size of these arrays declared here?


__attribute__((always_inline)) v16bfloat16 getTanhBf16(v16bfloat16 vInput) {
aie::vector<bfloat16, 16> input = vInput;
Expand Down
55 changes: 43 additions & 12 deletions lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,22 @@ static bool matchExpOpForLUT(math::ExpOp::Adaptor adaptor) {
return true;
}

// Returns true when `tanhOp` qualifies for the look-up-table based tanh
// lowering. The operand must be a vector whose element type is a 16-bit
// floating-point type and for which getVectorLaneSize() reports 16.
static bool matchTanhOpForLUT(math::TanhOp tanhOp) {
  auto vecTy = dyn_cast<VectorType>(tanhOp.getOperand().getType());
  if (!vecTy)
    return false;

  Type elemTy = vecTy.getElementType();
  if (!isa<FloatType>(elemTy))
    return false;

  // The bit width is only queried once the element type is known to be a
  // float, so getIntOrFloatBitWidth() is always called on a valid type.
  const bool hasSixteenLanes = getVectorLaneSize(vecTy) == 16;
  const bool isSixteenBit = elemTy.getIntOrFloatBitWidth() == 16;
  return isSixteenBit && hasSixteenLanes;
}
//===----------------------------------------------------------------------===//
// Rewrite patterns
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -2064,24 +2080,38 @@ struct ComputeInvOpByLUTPattern : OpConversionPattern<arith::DivFOp> {
}
};

// Convert math.tanh to a function call to compute tanh(x) by look up tables
struct ComputeTanhOpByLUTPattern : OpConversionPattern<math::TanhOp> {
struct ComputeTanhOpByLUTLLVMPattern : OpConversionPattern<math::TanhOp> {
using OpConversionPattern::OpConversionPattern;

LogicalResult
matchAndRewrite(math::TanhOp tanhOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto srcType = dyn_cast<VectorType>(tanhOp.getOperand().getType());
if (!srcType)
if (!matchTanhOpForLUT(tanhOp))
return failure();

Type scalarType = srcType.getElementType();
if (!isa<FloatType>(scalarType))
return failure();
StringRef funcName = "getTanhBf16";
auto moduleOp = tanhOp->getParentOfType<mlir::ModuleOp>();
VectorType v16bf16Ty = mlir::VectorType::get({16}, rewriter.getBF16Type());

unsigned laneSize = getVectorLaneSize(srcType);
unsigned elWidth = scalarType.getIntOrFloatBitWidth();
if (elWidth != 16 || laneSize != 16)
func::FuncOp fn_op =
getOrInsertFuncDecl(rewriter, moduleOp, funcName, TypeRange{v16bf16Ty},
TypeRange{v16bf16Ty});

rewriter.setInsertionPoint(tanhOp);
SmallVector<Value> tanhOperands = {adaptor.getOperand()};
rewriter.replaceOpWithNewOp<func::CallOp>(tanhOp, fn_op, tanhOperands);

return success();
}
};
// Convert math.tanh to a function call to compute tanh(x) by look up tables
struct ComputeTanhOpByLUTPattern : OpConversionPattern<math::TanhOp> {
using OpConversionPattern::OpConversionPattern;

LogicalResult
matchAndRewrite(math::TanhOp tanhOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
if (!matchTanhOpForLUT(tanhOp))
return failure();

StringRef includeName = "lut_based_ops.h";
Expand Down Expand Up @@ -3095,19 +3125,20 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
>(patterns.getContext(), 128, 1024, 256, 1024);
patterns.add<
ComputeExpOpByLUTPattern,
ComputeTanhOpByLUTPattern,
LowerVectorAddFOpToAIEVecAddElemOp,
LowerVectorSubFOpToAIEVecSubElemOp,
LowerVectorAddIOpToAIEVecAddElemOp,
LowerVectorSubIOpToAIEVecSubElemOp
>(patterns.getContext());
} else if (backend == TargetBackend::LLVMIR){
patterns.add<
ComputeExpOpByLUTLLVMPattern
ComputeExpOpByLUTLLVMPattern,
ComputeTanhOpByLUTLLVMPattern
>(patterns.getContext());
}
patterns.add<
ComputeInvOpByLUTPattern,
ComputeTanhOpByLUTPattern,
ComputeSqrtOpPattern,
ComputeRsqrtOpPattern,
ComputeErfOpPattern,
Expand Down
13 changes: 13 additions & 0 deletions test/Conversion/VectorToAIEVec/test_tanh_lut.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aie2 target-backend=llvmir" | FileCheck %s

// Verifies that, for the aie2 target with the llvmir backend, a math.tanh on
// vector<16xbf16> is rewritten into a call to getTanhBf16, and that a private
// declaration of getTanhBf16 is emitted in the module.

// CHECK-LABEL: func private @getTanhBf16(vector<16xbf16>) -> vector<16xbf16>
// CHECK-LABEL: func @test_tanh_lut
// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<16xbf16>
module{
func.func @test_tanh_lut(%a: vector<16xbf16>) -> vector<16xbf16> {
// CHECK: %[[CALL:.*]] = call @getTanhBf16(%[[A]]) : (vector<16xbf16>) -> vector<16xbf16>
%0 = math.tanh %a : vector<16xbf16>
// CHECK: return %[[CALL]] : vector<16xbf16>
return %0 : vector<16xbf16>
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// REQUIRES: valid_xchess_license
// REQUIRES: peano, peano_and_chess
// RUN: mkdir -p %t/data; cd %t
// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir
// RUN: aie-translate --mlir-to-llvmir llvmir.mlir -o dut_part.ll
// RUN: %PEANO_INSTALL_DIR/bin/clang -S -emit-llvm %clang_aie2_lib_args -I%aie_runtime_lib%/AIE2/ -c %S/dut_simple.cc -o lut_based_ops.ll
// RUN: %PEANO_INSTALL_DIR/bin/clang -S -emit-llvm %clang_aie2_lib_args -c %aie_runtime_lib%/AIE2/lut_based_ops.cpp -o lut_constants.ll
// RUN: llvm-link -S lut_based_ops.ll dut_part.ll -o dut_functions.ll
// RUN: llvm-link -S lut_constants.ll dut_functions.ll -o dut.ll
// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -DTO_LLVM -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.o
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

// Device under test for the LLVM IR flow: element-wise tanh over 1024 bf16
// values. Each element of %arg0 is loaded, passed through math.tanh, and
// stored at the same index of %arg1.
module {
func.func @dut(%arg0: memref<1024xbf16>{llvm.noalias}, %arg1: memref<1024xbf16>{llvm.noalias}) {
// Both buffers are declared to be aligned to 32.
memref.assume_alignment %arg0, 32 : memref<1024xbf16>
memref.assume_alignment %arg1, 32 : memref<1024xbf16>
// Scalar loop; the RUN line vectorizes it with a virtual vector size of 16.
affine.for %arg3 = 0 to 1024 {
%0 = affine.load %arg0[%arg3] : memref<1024xbf16>
%1 = math.tanh %0 : bf16
affine.store %1, %arg1[%arg3] : memref<1024xbf16>
}
return
}
}
23 changes: 23 additions & 0 deletions test/unit_tests/aievec_tests/bf16_tanh_lut/bf16_tanh_lut.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// REQUIRES: valid_xchess_license
// RUN: mkdir -p %t/data; cd %t
// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aie2" -lower-affine | aie-translate -aie2=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. -c %aie_runtime_lib%/AIE2/lut_based_ops.cpp -o lut_based_ops.o
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. -c dut.cc -o dut.o
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc work/dut.o work/lut_based_ops.o
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

// Device under test for the C++ emission flow: element-wise tanh over 1024
// bf16 values. Each element of %arg0 is loaded, passed through math.tanh, and
// stored at the same index of %arg1.
module {
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
// Scalar loop; the RUN line vectorizes it with a virtual vector size of 16.
affine.for %arg3 = 0 to 1024 {
%0 = affine.load %arg0[%arg3] : memref<1024xbf16>
%1 = math.tanh %0 : bf16
affine.store %1, %arg1[%arg3] : memref<1024xbf16>
}
return
}
}
3 changes: 3 additions & 0 deletions test/unit_tests/aievec_tests/bf16_tanh_lut/defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once
// Element counts of the test bench's input and output buffers.
// `constexpr` already makes these objects const, so no extra qualifier is
// needed.
constexpr unsigned IN0_SIZE = 1024;
constexpr unsigned OUT0_SIZE = 1024;
17 changes: 17 additions & 0 deletions test/unit_tests/aievec_tests/bf16_tanh_lut/dut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "lut_based_ops.h"
// Applies getTanhBf16 to 1024 bf16 elements read from `v1`, 16 elements at a
// time, and writes each 16-element result to the same offset in `v2`.
void dut(bfloat16 * restrict v1, bfloat16 * restrict v2) {
  const size_t totalElems = 1024;
  const size_t chunkElems = 16;
  // 1024 / 16 = 64 iterations, which is the value given to chess_loop_range.
  for (size_t offset = 0; offset < totalElems; offset += chunkElems)
  chess_prepare_for_pipelining
  chess_loop_range(64, 64)
  {
    v16bfloat16 chunk = *(v16bfloat16 *)(v1 + offset);
    *(v16bfloat16 *)(v2 + offset) = getTanhBf16(chunk);
  }
}


Comment on lines +16 to +17
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: too many blank lines.

1 change: 1 addition & 0 deletions test/unit_tests/aievec_tests/bf16_tanh_lut/dut_simple.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#include "lut_based_ops.h"
64 changes: 64 additions & 0 deletions test/unit_tests/aievec_tests/bf16_tanh_lut/testbench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Device under test. When TO_LLVM is defined (the LLVM IR test flow passes
// -DTO_LLVM), the declaration gets C linkage; presumably this matches the
// unmangled `dut` symbol produced from the MLIR function — TODO confirm.
#ifdef TO_LLVM
extern "C" {
#endif
void dut(bfloat16 *restrict in0, bfloat16 *restrict out0);
#ifdef TO_LLVM
}
#endif

// Scalar reference implementation, defined at the bottom of this file.
void dut_ref(bfloat16 *in0, bfloat16 *out0);

// Input buffer, DUT output buffer, and reference output buffer, each aligned
// to 32 bytes.
alignas(32) bfloat16 g_in0[IN0_SIZE];
alignas(32) bfloat16 g_out0[OUT0_SIZE];
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];

// Test bench entry point: fills the input buffer, runs the DUT between two
// cycle-count samples, runs the scalar reference, and compares both outputs.
// Prints "TEST PASSED" or "TEST FAILED" and returns 0 on success, 1 on
// failure.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed; presumably makes random_bfloat16's output reproducible —
// confirm in ../common/testbench.h.
srand(10);
// NOTE(review): the meaning of random_bfloat16's arguments (-4, 1, 3) is
// defined in ../common/testbench.h and is not visible here.
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_bfloat16(-4, 1, 3); });

writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");

// Memory fences bracket the timed region; only the dut() call sits between
// the two cycle-count samples.
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

dut_ref(g_in0, g_out0Ref);
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
// NOTE(review): the trailing arguments (0, 1e-2, 1e-2) look like an offset
// and tolerances, but their exact meaning is defined by checkData — confirm.
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2);

// The accompanying .mlir tests match on the "TEST PASSED" text in the
// simulator output.
if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference model for the DUT: writes tanh(in0[i]) to out0[i] for each
// of the OUT0_SIZE elements.
void dut_ref(bfloat16 *in0, bfloat16 *out0) {
  // tanh is not available in libcxx-lite, so it is derived from expf through
  // the identity tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
  for (unsigned idx = 0; idx < OUT0_SIZE; ++idx) {
    float x = in0[idx];
    float expTwoX = expf(2 * x);
    float result = (expTwoX - 1) / (expTwoX + 1);
    out0[idx] = (bfloat16)result;
  }
}
Loading