From 8181be57d0127b4e239681b95f74bca8335484e1 Mon Sep 17 00:00:00 2001 From: Jesse Michel Date: Sun, 19 May 2024 17:00:57 -0400 Subject: [PATCH] update with the file we use for benchmarking --- .../Integration/Sparse/eigen_analysis.cpp | 28 +++---- enzyme/test/Integration/Sparse/matrix.h | 52 ++++++++++-- .../Sparse/ringspring3Drestlengthone.cpp | 84 +++++++++++++++++-- 3 files changed, 136 insertions(+), 28 deletions(-) diff --git a/enzyme/test/Integration/Sparse/eigen_analysis.cpp b/enzyme/test/Integration/Sparse/eigen_analysis.cpp index 434ebcf48ddd..366607d89b04 100644 --- a/enzyme/test/Integration/Sparse/eigen_analysis.cpp +++ b/enzyme/test/Integration/Sparse/eigen_analysis.cpp @@ -101,7 +101,7 @@ static T eigenstuffM(const T *__restrict__ pos0, size_t n, const int *__restrict // Calculate total energy for all faces in 3D template __attribute__((always_inline)) -static T eigenstuffL(const T *__restrict__ x, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ pos0) { +static T eigenstuffL(const T *__restrict__ pos0, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ x) { T sum = 0; __builtin_assume(num_faces != 0); for (size_t idx=0; idx __attribute__((always_inline)) static void gradient_ip(const T *__restrict__ pos0, const size_t num_faces, const int* faces, const T *__restrict__ x, T *__restrict__ out) { - __enzyme_autodiff((void *)eigenstuffM, + __enzyme_autodiff((void *)eigenstuffL, enzyme_const, pos0, enzyme_const, num_faces, enzyme_const, faces, @@ -206,10 +206,10 @@ int main(int argc, char** argv) { // Call eigenstuffM_simple struct timeval start, end; - gettimeofday(&start, NULL); - const float resultM = eigenstuffM(pos0, num_faces, faces, x); - gettimeofday(&end, NULL); - printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end)); + // gettimeofday(&start, NULL); + // const float resultM = eigenstuffM(pos0, num_faces, faces, x); + // gettimeofday(&end, NULL); + // printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end)); // Call eigenstuffL_simple gettimeofday(&start, NULL); @@ -222,10 +222,10 @@ int main(int argc, char** argv) { dx[i] = 0; gradient_ip(pos0, num_faces, faces, x, dx); - if (x_pts < 30) { - for (size_t i=0; i> &triplets) { +static void inner_storeflt(size_t row, size_t col, float val, std::vector> &triplets) { #ifdef BENCHMARK if (val == 0.0) return; #else @@ -26,9 +26,9 @@ static void inner_storeflt(int64_t row, int64_t col, float val, std::vector> &triplets) { +static void inner_storedbl(size_t row, size_t col, double val, std::vector> &triplets) { #ifdef BENCHMARK - if (val == 0.0) return; + if (val > -1e-10 && val < 1e-10) return; #else #warning "Compiling for debug/verfication, performance may be slowed" #endif @@ -37,7 +37,7 @@ static void inner_storedbl(int64_t row, int64_t col, double val, std::vector __attribute__((always_inline)) -static void sparse_store(T val, int64_t idx, size_t i, std::vector> &triplets) { +static void sparse_store(T val, size_t idx, size_t i, std::vector> &triplets) { if (val == 0.0) return; idx /= sizeof(T); if constexpr (sizeof(T) == 4) @@ -48,19 +48,57 @@ static void sparse_store(T val, int64_t idx, size_t i, std::vector> &t template __attribute__((always_inline)) -static T sparse_load(int64_t idx, size_t i, std::vector> &triplets) { +static T sparse_load(size_t idx, size_t i, std::vector> &triplets) { + return 0.0; +} + + +__attribute__((enzyme_sparse_accumulate)) +static void inner_storeflt_modn(size_t row, size_t col, float val, size_t N, std::vector> &triplets) { +#ifdef BENCHMARK + if (val == 0.0) return; +#else +#warning "Compiling for debug/verfication, performance may be slowed" +#endif + triplets.emplace_back(row, col % N, val); +} + +__attribute__((enzyme_sparse_accumulate)) +static void inner_storedbl_modn(size_t row, size_t col, double val, size_t N, std::vector> &triplets) { +#ifdef BENCHMARK + if (val > -1e-10 && val < 1e-10) return; +#else +#warning "Compiling for debug/verfication, performance may be slowed" +#endif + triplets.emplace_back(row, col % N, val); +} + +template +__attribute__((always_inline)) +static void sparse_store_modn(T val, size_t idx, size_t i, size_t N, std::vector> &triplets) { + if (val == 0.0) return; + idx /= sizeof(T); + if constexpr (sizeof(T) == 4) + inner_storeflt_modn(i, idx, val, N, triplets); + else + inner_storedbl_modn(i, idx, val, N, triplets); +} + +template +__attribute__((always_inline)) +static T sparse_load_modn(int64_t idx, size_t i, size_t N, std::vector> &triplets) { return 0.0; } template __attribute__((always_inline)) -static void ident_store(T, int64_t idx, size_t i) { +static void ident_store(T, size_t idx, size_t i) { assert(0 && "should never load"); } template __attribute__((always_inline)) -static T ident_load(int64_t idx, size_t i) { +static T ident_load(size_t idx, size_t i) { idx /= sizeof(T); return (T)(idx == i);// ? 1.0 : 0.0; } diff --git a/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp b/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp index cae8bdad5708..1c2cd1090962 100644 --- a/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp +++ b/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp @@ -7,6 +7,9 @@ // TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O2 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi // TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O3 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi +// /usr/local/Cellar/llvm@15/15.0.7/bin/clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O3 /Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp -fpass-plugin=/Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/build15D/Enzyme/ClangEnzyme-15.dylib -Xclang -load -Xclang /Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/build15D/Enzyme/ClangEnzyme-15.dylib -mllvm -enzyme-auto-sparsity=1 -o ringspring3Drestlengthone.out + + #include #include #include @@ -21,7 +24,7 @@ __attribute__((always_inline)) static double f(size_t N, T* __restrict__ pos) { double e = 0.; __builtin_assume(N != 0); - for (size_t j = 0; j < N/3; j ++) { + for (size_t j = 0; j < N; j ++) { size_t i = 3 * j; T vx = pos[i]; T vy = pos[i + 1]; @@ -49,9 +52,10 @@ static void never_store(T val, int64_t idx, T* input, size_t N) { assert(0 && "this is a read only input, why are you storing here..."); } +template __attribute__((always_inline)) -static double mod_load(int64_t idx, double* input, size_t N) { - idx /= sizeof(double); +static T mod_load(int64_t idx, T* input, size_t N) { + idx /= sizeof(T); return input[idx % N]; } @@ -59,12 +63,12 @@ template __attribute__((noinline)) std::vector> hess_f(size_t N, T* input) { std::vector> triplets; - // input = __enzyme_todense((void*)mod_load, (void*)never_store, input, N); + input = __enzyme_post_sparse_todense((void*)mod_load, (void*)never_store, input, N); __builtin_assume(N > 0); for (size_t i=0; i((void*)ident_load, (void*)ident_store, i); - T* d_dinput = __enzyme_todense((void*)sparse_load, (void*)sparse_store, i, &triplets); + T* d_dinput = __enzyme_todense((void*)sparse_load_modn, (void*)sparse_store_modn, i, N, &triplets); __enzyme_fwddiff((void*)grad_f, enzyme_const, N, @@ -85,6 +89,62 @@ std::vector hess_f2(size_t N, double* input) { } */ +template +__attribute__((noinline)) +std::vector> handrolled_hess_f(size_t n, T* data) { + std::vector> triplets; + int k = 3 * n; + for (int i = 0; i < n; ++i) { + int row = 3 * i; + double x = data[row % k]; + double y = data[(row + 1) % k]; + double z = data[(row + 2) % k]; + double a = data[(row + 3) % k]; + double b = data[(row + 4) % k]; + double c = data[(row + 5) % k]; + + double f = pow(x - a, 2) + pow(y - b, 2) + pow(z - c, 2); + // double f = (x - a) * (x - a) + (y - b) * (y - b) + (z - c) * (z - c); + double g = 2 * (1 - 1 / sqrt(f)); + + // Diagonal terms + #pragma clang loop unroll(full) + for (int j = 0; j < 6; ++j) { + T val = g; +#ifdef BENCHMARK + if (val > -1e-10 && val < 1e-10) continue; +#endif + triplets.emplace_back((row + j) % k, (row + j) % k, val); + } + + #pragma clang loop unroll(full) + for (int j = 0; j < 6; ++j) { + T val = -g; +#ifdef BENCHMARK + if (val > -1e-10 && val < 1e-10) continue; +#endif + triplets.emplace_back((row + ((j + 3) % 6)) % k, (row + j) % k, val); + } + + // Cross terms + std::vector half_grad_f = {x - a, y - b, z - c, a - x, b - y, c - z}; + // double half_grad_f[] = {x - a, y - b, z - c, a - x, b - y, c - z}; + #pragma clang loop unroll(full) + for (int j = 0; j < 6; ++j) { + #pragma clang loop unroll(full) + for (int l = 0; l < 6; ++l) { + T val = 2 * half_grad_f[j] * half_grad_f[l] * pow(f, -1.5); + // T val = 2 * half_grad_f[j] * half_grad_f[l] / ( f * sqrt(f) ); +#ifdef BENCHMARK + if (val > -1e-10 && val < 1e-10) continue; +#endif + triplets.emplace_back((row + j) % k, (row + l) % k, val); + } + } + } + return triplets; +} + // int argc, char** argv int main(int argc, char** argv) { size_t N = 30; @@ -93,7 +153,7 @@ int main(int argc, char** argv) { N = atoi(argv[1]); } - double *x = (double*)malloc(sizeof(double) * 3 * N); + double *x = (double*)malloc(sizeof(double) * (3 * N + 3)); for (int i = 0; i < N; ++i) { double angle = 2 * M_PI * i / N; x[3 * i] = cos(angle) ;//+ normal(generator); @@ -101,8 +161,10 @@ int main(int argc, char** argv) { x[3 * i + 2] = 0;//normal(generator); } - + for (int i=0; i<10; i++) + { struct timeval start, end; + gettimeofday(&start, NULL); auto res = hess_f(N, x); @@ -118,6 +180,14 @@ int main(int argc, char** argv) { printf("%ld, %ld = %f\n", tup.row, tup.col, tup.val); } + gettimeofday(&start, NULL); + auto hand_res = handrolled_hess_f(N, x); + gettimeofday(&end, NULL); + + printf("Handrolled Number of elements %ld\n", hand_res.size()); + printf(" Runtime %0.6f\n", tdiff(&start, &end)); + } + return 0; }