From 8a8109b11a468458776da3e6243b5083a14e5406 Mon Sep 17 00:00:00 2001 From: Martun Karapetyan Date: Fri, 2 Aug 2024 17:01:28 +0400 Subject: [PATCH] Use a macros to unroll montgomery multiplication inner loop. --- .../modular/modular_functions_fixed.hpp | 155 +++++++----------- .../bench_test_modular_adaptor_fixed.cpp | 4 - 2 files changed, 55 insertions(+), 104 deletions(-) diff --git a/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp b/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp index df37ebe85..a6e64e590 100644 --- a/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp +++ b/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp @@ -292,6 +292,7 @@ namespace boost { , m_barrett_mu(o.get_mu()) , m_montgomery_r2(o.get_r2()) , m_montgomery_p_dash(o.get_p_dash()) + , m_no_carry_montgomery_mul_allowed(is_applicable_for_no_carry_montgomery_mul()) { } @@ -464,8 +465,7 @@ namespace boost { // bit in the number. I.E. if modulus is 255 bits, then we have 1 additional "unused" bit in the number. // 2. Some other bit in modulus is 0. // 3. The number has < 12 limbs. - return m_mod.internal_limb_count < 12 && (Bits % sizeof(internal_limb_type) != 0) && - !eval_eq(m_mod_compliment, Backend(internal_limb_type(1u))); + return m_mod.internal_limb_count < 12 && (Bits % sizeof(internal_limb_type) != 0) && !eval_eq(m_mod_compliment, Backend(internal_limb_type(1u))); } template @@ -527,67 +527,38 @@ namespace boost { // The lower loop is unrolled. We want to do this for every 3, because normally mod_size == 4. std::size_t j = 1; + + #define MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(X) \ + /* "(A,t[X]) := t[X] + a[X]*b[i] + A" */\ + tmp = a_limbs[X]; \ + tmp *= b_limbs[i]; \ + tmp += result_limbs[X]; \ + tmp += A; \ + modular_functions_fixed::dbl_limb_to_limbs( tmp, A, result_limbs[X] ); \ + \ + /* "(C,t[X-1]) := t[X] + m*q[X] + C" */ \ + tmp = m; \ + tmp *= m_mod_limbs[X]; \ + tmp += result_limbs[X]; \ + tmp += C; \ + modular_functions_fixed::dbl_limb_to_limbs( tmp, C, result_limbs[X-1] ); + + for (; j + 5 <= N; j += 5) { + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j + 1); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j + 2); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j + 3); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j + 4); + } + for (; j + 3 <= N; j += 3) { - // For j - // "(A,t[j]) := t[j] + a[j]*b[i] + A" - tmp = a_limbs[j]; - tmp *= b_limbs[i]; - tmp += result_limbs[j]; - tmp += A; - modular_functions_fixed::dbl_limb_to_limbs( tmp, A, result_limbs[j] ); - - // "(C,t[j-1]) := t[j] + m*q[j] + C" - tmp = m; - tmp *= m_mod_limbs[j]; - tmp += result_limbs[j]; - tmp += C; - modular_functions_fixed::dbl_limb_to_limbs( tmp, C, result_limbs[j-1] ); - - // For j + 1 - // "(A,t[j]) := t[j] + a[j]*b[i] + A" - tmp = a_limbs[j + 1]; - tmp *= b_limbs[i]; - tmp += result_limbs[j + 1]; - tmp += A; - modular_functions_fixed::dbl_limb_to_limbs( tmp, A, result_limbs[j + 1] ); - - // "(C,t[j-1]) := t[j] + m*q[j] + C" - tmp = m; - tmp *= m_mod_limbs[j + 1]; - tmp += result_limbs[j + 1]; - tmp += C; - modular_functions_fixed::dbl_limb_to_limbs( tmp, C, result_limbs[j] ); - - // For j + 2 - // "(A,t[j]) := t[j] + a[j]*b[i] + A" - tmp = a_limbs[j + 2]; - tmp *= b_limbs[i]; - tmp += result_limbs[j + 2]; - tmp += A; - modular_functions_fixed::dbl_limb_to_limbs( tmp, A, result_limbs[j + 2] ); - - // "(C,t[j-1]) := t[j] + m*q[j] + C" - tmp = m; - tmp *= m_mod_limbs[j + 2]; - tmp += result_limbs[j + 2]; - tmp += C; - modular_functions_fixed::dbl_limb_to_limbs( tmp, C, result_limbs[j + 1] ); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j + 1); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j + 2); } for ( ; j < N; ++j ) { - // "(A,t[j]) := t[j] + a[j]*b[i] + A" - tmp = a_limbs[j]; - tmp *= b_limbs[i]; - tmp += result_limbs[j]; - tmp += A; - modular_functions_fixed::dbl_limb_to_limbs( tmp, A, result_limbs[j] ); - - // "(C,t[j-1]) := t[j] + m*q[j] + C" - tmp = m; - tmp *= m_mod_limbs[j]; - tmp += result_limbs[j]; - tmp += C; - modular_functions_fixed::dbl_limb_to_limbs( tmp, C, result_limbs[j-1] ); + MONTGOMERY_MUL_NO_CARRY_LOOP_BODY(j); } // "t[N-1] = C + A" @@ -604,6 +575,7 @@ namespace boost { template BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_CIOS_impl(Backend1 &result, const Backend1 &y, std::integral_constant const&) const { + BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod)); Backend A(internal_limb_type(0u)); @@ -648,51 +620,34 @@ namespace boost { // The lower loop is unrolled. We want to do this for every 3, because normally mod_size == 4. internal_double_limb_type t = 0, t2 = 0; + + #define MONTGOMERY_MUL_CIOS_LOOP_BODY(X) \ + t = static_cast(y_limbs[X]) * \ + static_cast(x_i) + \ + A_limbs[X] + k; \ + t2 = static_cast(mod_limbs[X]) * \ + static_cast(u_i) + \ + static_cast(t) + k2; \ + A_limbs[X - 1] = static_cast(t2); \ + k = static_cast(t >> std::numeric_limits::digits); \ + k2 = static_cast(t2 >> std::numeric_limits::digits); + + for (; j + 5 <= mod_size; j += 5) { + MONTGOMERY_MUL_CIOS_LOOP_BODY(j); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j + 1); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j + 2); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j + 3); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j + 4); + } + for (; j + 3 <= mod_size; j += 3) { - // For j - t = static_cast(y_limbs[j]) * - static_cast(x_i) + - A_limbs[j] + k; - t2 = static_cast(mod_limbs[j]) * - static_cast(u_i) + - static_cast(t) + k2; - A_limbs[j - 1] = static_cast(t2); - k = static_cast(t >> std::numeric_limits::digits); - k2 = static_cast(t2 >> std::numeric_limits::digits); - - // For j + 1 - t = static_cast(y_limbs[j + 1]) * - static_cast(x_i) + - A_limbs[j + 1] + k; - t2 = static_cast(mod_limbs[j + 1]) * - static_cast(u_i) + - static_cast(t) + k2; - A_limbs[j + 1 - 1] = static_cast(t2); - k = static_cast(t >> std::numeric_limits::digits); - k2 = static_cast(t2 >> std::numeric_limits::digits); - - // For j + 2 - t = static_cast(y_limbs[j + 2]) * - static_cast(x_i) + - A_limbs[j + 2] + k; - t2 = static_cast(mod_limbs[j + 2]) * - static_cast(u_i) + - static_cast(t) + k2; - A_limbs[j + 2 - 1] = static_cast(t2); - k = static_cast(t >> std::numeric_limits::digits); - k2 = static_cast(t2 >> std::numeric_limits::digits); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j + 1); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j + 2); } for (; j < mod_size; ++j) { - t = static_cast(y_limbs[j]) * - static_cast(x_i) + - A_limbs[j] + k; - t2 = static_cast(mod_limbs[j]) * - static_cast(u_i) + - static_cast(t) + k2; - A_limbs[j - 1] = static_cast(t2); - k = static_cast(t >> std::numeric_limits::digits); - k2 = static_cast(t2 >> std::numeric_limits::digits); + MONTGOMERY_MUL_CIOS_LOOP_BODY(j); } internal_double_limb_type tmp = diff --git a/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp b/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp index 02ba41975..8fa801015 100644 --- a/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp +++ b/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp @@ -75,7 +75,6 @@ BOOST_AUTO_TEST_CASE(modular_adaptor_montgomery_mult_perf_test) { << std::dec << elapsed.count() / SAMPLES << " ns" << std::endl; } -// Averge subtraction time is 37 ns. BOOST_AUTO_TEST_CASE(modular_adaptor_backend_sub_perf_test) { using namespace boost::multiprecision::default_ops; @@ -107,7 +106,6 @@ BOOST_AUTO_TEST_CASE(modular_adaptor_backend_sub_perf_test) { << std::dec << elapsed.count() / SAMPLES << " ns" << std::endl; } -// Averge addition time is 37 ns. BOOST_AUTO_TEST_CASE(modular_adaptor_backend_add_perf_test) { using namespace boost::multiprecision::default_ops; @@ -139,7 +137,6 @@ BOOST_AUTO_TEST_CASE(modular_adaptor_backend_add_perf_test) { << std::dec << elapsed.count() / SAMPLES << " ns" << std::endl; } -// Averge multiplication time is 130 ns. BOOST_AUTO_TEST_CASE(modular_adaptor_backend_mult_perf_test) { using Backend = cpp_int_modular_backend<256>; using standart_number = boost::multiprecision::number; @@ -170,7 +167,6 @@ BOOST_AUTO_TEST_CASE(modular_adaptor_backend_mult_perf_test) { std::cout << x_modular << std::endl; } -// Averge multiplication time is 130 ns. BOOST_AUTO_TEST_CASE(modular_adaptor_number_mult_perf_test) { using Backend = cpp_int_modular_backend<256>; using standart_number = boost::multiprecision::number;