From 306dedcf5e3867d27ce5851d6ad04373d51f8abc Mon Sep 17 00:00:00 2001 From: Jorg Adam Sowa Date: Tue, 17 Sep 2024 22:16:26 +0200 Subject: [PATCH] ext/bcmath: bcpow() performance improvement (#15790) * Added function for squaring to improve performance of power calculation * Aligned backslashes * Removed unnecessary comments * Extracted common part of multiplication and square functions * Added comment to bc_fast_square * Improved wording of bc_mul_finish_from_vector * Reused new function name * Replaced macro with function --- ext/bcmath/libbcmath/src/bcmath.h | 2 + ext/bcmath/libbcmath/src/raise.c | 16 ++-- ext/bcmath/libbcmath/src/recmul.c | 151 ++++++++++++++++++++++++------ 3 files changed, 134 insertions(+), 35 deletions(-) diff --git a/ext/bcmath/libbcmath/src/bcmath.h b/ext/bcmath/libbcmath/src/bcmath.h index daf4642e6a323..4b8ca12326417 100644 --- a/ext/bcmath/libbcmath/src/bcmath.h +++ b/ext/bcmath/libbcmath/src/bcmath.h @@ -150,6 +150,8 @@ bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale); *(result) = mul_ex; \ } while (0) +bc_num bc_square(bc_num n1, size_t scale); + bool bc_divide(bc_num n1, bc_num n2, bc_num *quot, size_t scale); bool bc_modulo(bc_num num1, bc_num num2, bc_num *resul, size_t scale); diff --git a/ext/bcmath/libbcmath/src/raise.c b/ext/bcmath/libbcmath/src/raise.c index a503e77a1a4a4..162f84a830538 100644 --- a/ext/bcmath/libbcmath/src/raise.c +++ b/ext/bcmath/libbcmath/src/raise.c @@ -34,13 +34,16 @@ #include #include +void bc_square_ex(bc_num n1, bc_num *result, size_t scale_min) { + bc_num square_ex = bc_square(n1, scale_min); + bc_free_num(result); + *(result) = square_ex; +} /* Raise NUM1 to the NUM2 power. The result is placed in RESULT. Maximum exponent is LONG_MAX. If a NUM2 is not an integer, only the integer part is used. */ - -void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale) -{ +void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale) { bc_num temp, power; size_t rscale; size_t pwrscale; @@ -69,7 +72,7 @@ void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale) pwrscale = num1->n_scale; while ((exponent & 1) == 0) { pwrscale = 2 * pwrscale; - bc_multiply_ex(power, power, &power, pwrscale); + bc_square_ex(power, &power, pwrscale); exponent = exponent >> 1; } temp = bc_copy_num(power); @@ -79,7 +82,7 @@ void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale) /* Do the calculation. */ while (exponent > 0) { pwrscale = 2 * pwrscale; - bc_multiply_ex(power, power, &power, pwrscale); + bc_square_ex(power, &power, pwrscale); if ((exponent & 1) == 1) { calcscale = pwrscale + calcscale; bc_multiply_ex(temp, power, &temp, calcscale); @@ -100,8 +103,7 @@ void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale) } /* This is used internally by BCMath */ -void bc_raise_bc_exponent(bc_num base, bc_num expo, bc_num *result, size_t scale) -{ +void bc_raise_bc_exponent(bc_num base, bc_num expo, bc_num *result, size_t scale) { /* Exponent must not have fractional part */ assert(expo->n_scale == 0); diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c index 3341396236e67..b92a1045ac3b5 100644 --- a/ext/bcmath/libbcmath/src/recmul.c +++ b/ext/bcmath/libbcmath/src/recmul.c @@ -72,6 +72,64 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, } } +/* + * Equivalent of bc_fast_mul for small numbers to perform computations + * without using array. + */ +static inline void bc_fast_square(bc_num n1, size_t n1len, bc_num *prod) +{ + const char *n1end = n1->n_value + n1len - 1; + + BC_VECTOR n1_vector = bc_partial_convert_to_vector(n1end, n1len); + BC_VECTOR prod_vector = n1_vector * n1_vector; + + size_t prodlen = n1len + n1len; + *prod = bc_new_num_nonzeroed(prodlen, 0); + char *pptr = (*prod)->n_value; + char *pend = pptr + prodlen - 1; + + while (pend >= pptr) { + *pend-- = prod_vector % BASE; + prod_vector /= BASE; + } +} + +/* Common part of functions bc_standard_mul and bc_standard_square + * that takes a vector and converts it to a bc_num */ +static inline void bc_mul_finish_from_vector(BC_VECTOR *prod_vector, size_t prod_arr_size, size_t prodlen, bc_num *prod) { + /* + * Move a value exceeding 4/8 digits by carrying to the next digit. + * However, the last digit does nothing. + */ + bc_mul_carry_calc(prod_vector, prod_arr_size); + + /* Convert to bc_num */ + *prod = bc_new_num_nonzeroed(prodlen, 0); + char *pptr = (*prod)->n_value; + char *pend = pptr + prodlen - 1; + size_t i = 0; + while (i < prod_arr_size - 1) { +#if BC_VECTOR_SIZE == 4 + bc_write_bcd_representation(prod_vector[i], pend - 3); + pend -= 4; +#else + bc_write_bcd_representation(prod_vector[i] / 10000, pend - 7); + bc_write_bcd_representation(prod_vector[i] % 10000, pend - 3); + pend -= 8; +#endif + i++; + } + + /* + * The last digit may carry over. + * Also need to fill it to the end with zeros, so loop until the end of the string. + */ + while (pend >= pptr) { + *pend-- = prod_vector[i] % BASE; + prod_vector[i] /= BASE; + } +} + /* * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_VECTOR. * The array is generated starting with the smaller digits. @@ -128,42 +186,57 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc } } - /* - * Move a value exceeding 4/8 digits by carrying to the next digit. - * However, the last digit does nothing. - */ - bc_mul_carry_calc(prod_vector, prod_arr_size); + bc_mul_finish_from_vector(prod_vector, prod_arr_size, prodlen, prod); - /* Convert to bc_num */ - *prod = bc_new_num_nonzeroed(prodlen, 0); - char *pptr = (*prod)->n_value; - char *pend = pptr + prodlen - 1; - i = 0; - while (i < prod_arr_size - 1) { -#if BC_VECTOR_SIZE == 4 - bc_write_bcd_representation(prod_vector[i], pend - 3); - pend -= 4; -#else - bc_write_bcd_representation(prod_vector[i] / 10000, pend - 7); - bc_write_bcd_representation(prod_vector[i] % 10000, pend - 3); - pend -= 8; -#endif - i++; + efree(buf); +} + +/** This is bc_standard_mul implementation for square */ +static void bc_standard_square(bc_num n1, size_t n1len, bc_num *prod) +{ + size_t i; + const char *n1end = n1->n_value + n1len - 1; + size_t prodlen = n1len + n1len; + + size_t n1_arr_size = (n1len + BC_VECTOR_SIZE - 1) / BC_VECTOR_SIZE; + size_t prod_arr_size = (prodlen + BC_VECTOR_SIZE - 1) / BC_VECTOR_SIZE; + + BC_VECTOR *buf = safe_emalloc(n1_arr_size + n1_arr_size + prod_arr_size, sizeof(BC_VECTOR), 0); + + BC_VECTOR *n1_vector = buf; + BC_VECTOR *prod_vector = n1_vector + n1_arr_size + n1_arr_size; + + for (i = 0; i < prod_arr_size; i++) { + prod_vector[i] = 0; } - /* - * The last digit may carry over. - * Also need to fill it to the end with zeros, so loop until the end of the string. - */ - while (pend >= pptr) { - *pend-- = prod_vector[i] % BASE; - prod_vector[i] /= BASE; + /* Convert to BC_VECTOR[] */ + bc_convert_to_vector(n1_vector, n1end, n1len); + + /* Multiplication and addition */ + size_t count = 0; + for (i = 0; i < n1_arr_size; i++) { + /* + * This calculation adds the result multiple times to the array entries. + * When multiplying large numbers of digits, there is a possibility of + * overflow, so digit adjustment is performed beforehand. + */ + if (UNEXPECTED(count >= BC_VECTOR_NO_OVERFLOW_ADD_COUNT)) { + bc_mul_carry_calc(prod_vector, prod_arr_size); + count = 0; + } + count++; + for (size_t j = 0; j < n1_arr_size; j++) { + prod_vector[i + j] += n1_vector[i] * n1_vector[j]; + } } + bc_mul_finish_from_vector(prod_vector, prod_arr_size, prodlen, prod); + efree(buf); } -/* The multiply routine. N2 times N1 is put int PROD with the scale of +/* The multiply routine. N2 times N1 is put int PROD with the scale of the result being MIN(N2 scale+N1 scale, MAX (SCALE, N2 scale, N1 scale)). */ @@ -194,3 +267,25 @@ bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale) } return prod; } + +bc_num bc_square(bc_num n1, size_t scale) +{ + bc_num prod; + + size_t len1 = n1->n_len + n1->n_scale; + size_t full_scale = n1->n_scale + n1->n_scale; + size_t prod_scale = MIN(full_scale, MAX(scale, n1->n_scale)); + + if (len1 <= BC_VECTOR_SIZE) { + bc_fast_square(n1, len1, &prod); + } else { + bc_standard_square(n1, len1, &prod); + } + + prod->n_sign = PLUS; + prod->n_len -= full_scale; + prod->n_scale = prod_scale; + _bc_rm_leading_zeros(prod); + + return prod; +}