From 306dedcf5e3867d27ce5851d6ad04373d51f8abc Mon Sep 17 00:00:00 2001
From: Jorg Adam Sowa <jorg.sowa@gmail.com>
Date: Tue, 17 Sep 2024 22:16:26 +0200
Subject: [PATCH] ext/bcmath: bcpow() performance improvement (#15790)

* Added function for squaring to improve performance of power calculation

* Aligned backslashes

* Removed unnecessary comments

* Extracted common part of multiplication and square functions

* Added comment to bc_fast_square

* Improved wording of bc_mul_finish_from_vector

* Reused new function name

* Replaced macro with function
---
 ext/bcmath/libbcmath/src/bcmath.h |   2 +
 ext/bcmath/libbcmath/src/raise.c  |  16 ++--
 ext/bcmath/libbcmath/src/recmul.c | 151 ++++++++++++++++++++++++------
 3 files changed, 134 insertions(+), 35 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/bcmath.h b/ext/bcmath/libbcmath/src/bcmath.h
index daf4642e6a323..4b8ca12326417 100644
--- a/ext/bcmath/libbcmath/src/bcmath.h
+++ b/ext/bcmath/libbcmath/src/bcmath.h
@@ -150,6 +150,8 @@ bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale);
 	*(result) = mul_ex;                             \
 } while (0)
 
+bc_num bc_square(bc_num n1, size_t scale);
+
 bool bc_divide(bc_num n1, bc_num n2, bc_num *quot, size_t scale);
 
 bool bc_modulo(bc_num num1, bc_num num2, bc_num *resul, size_t scale);
diff --git a/ext/bcmath/libbcmath/src/raise.c b/ext/bcmath/libbcmath/src/raise.c
index a503e77a1a4a4..162f84a830538 100644
--- a/ext/bcmath/libbcmath/src/raise.c
+++ b/ext/bcmath/libbcmath/src/raise.c
@@ -34,13 +34,16 @@
 #include <stdbool.h>
 #include <stddef.h>
 
+void bc_square_ex(bc_num n1, bc_num *result, size_t scale_min) {
+	bc_num square_ex = bc_square(n1, scale_min);
+	bc_free_num(result);
+	*(result) = square_ex;
+}
 
 /* Raise NUM1 to the NUM2 power.  The result is placed in RESULT.
    Maximum exponent is LONG_MAX.  If a NUM2 is not an integer,
    only the integer part is used.  */
-
-void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale)
-{
+void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale) {
 	bc_num temp, power;
 	size_t rscale;
 	size_t pwrscale;
@@ -69,7 +72,7 @@ void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale)
 	pwrscale = num1->n_scale;
 	while ((exponent & 1) == 0) {
 		pwrscale = 2 * pwrscale;
-		bc_multiply_ex(power, power, &power, pwrscale);
+		bc_square_ex(power, &power, pwrscale);
 		exponent = exponent >> 1;
 	}
 	temp = bc_copy_num(power);
@@ -79,7 +82,7 @@ void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale)
 	/* Do the calculation. */
 	while (exponent > 0) {
 		pwrscale = 2 * pwrscale;
-		bc_multiply_ex(power, power, &power, pwrscale);
+		bc_square_ex(power, &power, pwrscale);
 		if ((exponent & 1) == 1) {
 			calcscale = pwrscale + calcscale;
 			bc_multiply_ex(temp, power, &temp, calcscale);
@@ -100,8 +103,7 @@ void bc_raise(bc_num num1, long exponent, bc_num *result, size_t scale)
 }
 
 /* This is used internally by BCMath */
-void bc_raise_bc_exponent(bc_num base, bc_num expo, bc_num *result, size_t scale)
-{
+void bc_raise_bc_exponent(bc_num base, bc_num expo, bc_num *result, size_t scale) {
 	/* Exponent must not have fractional part */
 	assert(expo->n_scale == 0);
 
diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 3341396236e67..b92a1045ac3b5 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -72,6 +72,64 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len,
 	}
 }
 
+/*
+ * Equivalent of bc_fast_mul for small numbers to perform computations
+ * without using array.
+ */
+static inline void bc_fast_square(bc_num n1, size_t n1len, bc_num *prod)
+{
+	const char *n1end = n1->n_value + n1len - 1;
+
+	BC_VECTOR n1_vector = bc_partial_convert_to_vector(n1end, n1len);
+	BC_VECTOR prod_vector = n1_vector * n1_vector;
+
+	size_t prodlen = n1len + n1len;
+	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	char *pptr = (*prod)->n_value;
+	char *pend = pptr + prodlen - 1;
+
+	while (pend >= pptr) {
+		*pend-- = prod_vector % BASE;
+		prod_vector /= BASE;
+	}
+}
+
+/* Common part of functions bc_standard_mul and bc_standard_square
+ * that takes a vector and converts it to a bc_num 	*/
+static inline void bc_mul_finish_from_vector(BC_VECTOR *prod_vector, size_t prod_arr_size, size_t prodlen, bc_num *prod) {
+	/*
+	 * Move a value exceeding 4/8 digits by carrying to the next digit.
+	 * However, the last digit does nothing.
+	 */
+	bc_mul_carry_calc(prod_vector, prod_arr_size);
+
+	/* Convert to bc_num */
+	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	char *pptr = (*prod)->n_value;
+	char *pend = pptr + prodlen - 1;
+	size_t i = 0;
+	while (i < prod_arr_size - 1) {
+#if BC_VECTOR_SIZE == 4
+		bc_write_bcd_representation(prod_vector[i], pend - 3);
+		pend -= 4;
+#else
+		bc_write_bcd_representation(prod_vector[i] / 10000, pend - 7);
+		bc_write_bcd_representation(prod_vector[i] % 10000, pend - 3);
+		pend -= 8;
+#endif
+		i++;
+	}
+
+	/*
+	 * The last digit may carry over.
+	 * Also need to fill it to the end with zeros, so loop until the end of the string.
+	 */
+	while (pend >= pptr) {
+		*pend-- = prod_vector[i] % BASE;
+		prod_vector[i] /= BASE;
+	}
+}
+
 /*
  * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_VECTOR.
  * The array is generated starting with the smaller digits.
@@ -128,42 +186,57 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc
 		}
 	}
 
-	/*
-	 * Move a value exceeding 4/8 digits by carrying to the next digit.
-	 * However, the last digit does nothing.
-	 */
-	bc_mul_carry_calc(prod_vector, prod_arr_size);
+	bc_mul_finish_from_vector(prod_vector, prod_arr_size, prodlen, prod);
 
-	/* Convert to bc_num */
-	*prod = bc_new_num_nonzeroed(prodlen, 0);
-	char *pptr = (*prod)->n_value;
-	char *pend = pptr + prodlen - 1;
-	i = 0;
-	while (i < prod_arr_size - 1) {
-#if BC_VECTOR_SIZE == 4
-		bc_write_bcd_representation(prod_vector[i], pend - 3);
-		pend -= 4;
-#else
-		bc_write_bcd_representation(prod_vector[i] / 10000, pend - 7);
-		bc_write_bcd_representation(prod_vector[i] % 10000, pend - 3);
-		pend -= 8;
-#endif
-		i++;
+	efree(buf);
+}
+
+/** This is bc_standard_mul implementation for square */
+static void bc_standard_square(bc_num n1, size_t n1len, bc_num *prod)
+{
+	size_t i;
+	const char *n1end = n1->n_value + n1len - 1;
+	size_t prodlen = n1len + n1len;
+
+	size_t n1_arr_size = (n1len + BC_VECTOR_SIZE - 1) / BC_VECTOR_SIZE;
+	size_t prod_arr_size = (prodlen + BC_VECTOR_SIZE - 1) / BC_VECTOR_SIZE;
+
+	BC_VECTOR *buf = safe_emalloc(n1_arr_size + n1_arr_size + prod_arr_size, sizeof(BC_VECTOR), 0);
+
+	BC_VECTOR *n1_vector = buf;
+	BC_VECTOR *prod_vector = n1_vector + n1_arr_size + n1_arr_size;
+
+	for (i = 0; i < prod_arr_size; i++) {
+		prod_vector[i] = 0;
 	}
 
-	/*
-	 * The last digit may carry over.
-	 * Also need to fill it to the end with zeros, so loop until the end of the string.
-	 */
-	while (pend >= pptr) {
-		*pend-- = prod_vector[i] % BASE;
-		prod_vector[i] /= BASE;
+	/* Convert to BC_VECTOR[] */
+	bc_convert_to_vector(n1_vector, n1end, n1len);
+
+	/* Multiplication and addition */
+	size_t count = 0;
+	for (i = 0; i < n1_arr_size; i++) {
+		/*
+		 * This calculation adds the result multiple times to the array entries.
+		 * When multiplying large numbers of digits, there is a possibility of
+		 * overflow, so digit adjustment is performed beforehand.
+		 */
+		if (UNEXPECTED(count >= BC_VECTOR_NO_OVERFLOW_ADD_COUNT)) {
+			bc_mul_carry_calc(prod_vector, prod_arr_size);
+			count = 0;
+		}
+		count++;
+		for (size_t j = 0; j < n1_arr_size; j++) {
+			prod_vector[i + j] += n1_vector[i] * n1_vector[j];
+		}
 	}
 
+	bc_mul_finish_from_vector(prod_vector, prod_arr_size, prodlen, prod);
+
 	efree(buf);
 }
 
-/* The multiply routine.  N2 times N1 is put int PROD with the scale of
+/* The multiply routine. N2 times N1 is put int PROD with the scale of
    the result being MIN(N2 scale+N1 scale, MAX (SCALE, N2 scale, N1 scale)).
    */
 
@@ -194,3 +267,25 @@ bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale)
 	}
 	return prod;
 }
+
+bc_num bc_square(bc_num n1, size_t scale)
+{
+	bc_num prod;
+
+	size_t len1 = n1->n_len + n1->n_scale;
+	size_t full_scale = n1->n_scale + n1->n_scale;
+	size_t prod_scale = MIN(full_scale, MAX(scale, n1->n_scale));
+
+	if (len1 <= BC_VECTOR_SIZE) {
+		bc_fast_square(n1, len1, &prod);
+	} else {
+		bc_standard_square(n1, len1, &prod);
+	}
+
+	prod->n_sign = PLUS;
+	prod->n_len -= full_scale;
+	prod->n_scale = prod_scale;
+	_bc_rm_leading_zeros(prod);
+
+	return prod;
+}