From 8a57a7a2fa0590f7011de43f225f1342ff924412 Mon Sep 17 00:00:00 2001
From: Mohammad Azim Khan
Date: Fri, 8 Nov 2024 18:33:13 +0000
Subject: [PATCH] Complex number operations

---
 BUILD                                |   1 +
 CMakeLists.txt                       |   1 +
 g3doc/quick_reference.md             |  26 ++
 hwy/ops/arm_sve-inl.h                | 124 ++++++++++
 hwy/ops/generic_ops-inl.h            |  65 +++++
 hwy/tests/complex_arithmetic_test.cc | 354 +++++++++++++++++++++++++++
 6 files changed, 571 insertions(+)
 create mode 100644 hwy/tests/complex_arithmetic_test.cc

diff --git a/BUILD b/BUILD
index 114eef8a02..b759afa966 100644
--- a/BUILD
+++ b/BUILD
@@ -495,6 +495,7 @@ HWY_TESTS = [
     ("hwy/tests/", "combine_test"),
     ("hwy/tests/", "compare_test"),
+    ("hwy/tests/", "complex_arithmetic_test"),
     ("hwy/tests/", "compress_test"),
     ("hwy/tests/", "concat_test"),
     ("hwy/tests/", "convert_test"),
     ("hwy/tests/", "count_test"),
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cf044cbc9..f490ef6886 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -727,6 +727,7 @@ set(HWY_TEST_FILES
   hwy/tests/cast_test.cc
   hwy/tests/combine_test.cc
   hwy/tests/compare_test.cc
+  hwy/tests/complex_arithmetic_test.cc
   hwy/tests/compress_test.cc
   hwy/tests/concat_test.cc
   hwy/tests/convert_test.cc
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index 8220e9b718..b98973802e 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -886,6 +886,32 @@ not a concern, these are equivalent to, and potentially more efficient than,
 V **MaskedSatSubOr**(V no, M m, V a, V b): returns `a[i] + b[i]` saturated to
 the minimum/maximum representable value, or `no[i]` if `m[i]` is false.
 
+#### Complex number operations
+
+Complex types are represented as complex value pairs of real and imaginary
+components, with the real components in even-indexed lanes and the imaginary
+components in odd-indexed lanes.
+
+All multiplies in this section are performing complex multiplication,
+i.e. `(a + ib)(c + id)`.
+
+Take `j` to be the even values of `i`.
+
+* V **CplxConj**(V v): returns the complex conjugate of the vector,
+  this negates the imaginary lanes. This is equivalent to `OddEven(Neg(v), v)`.
+* V **MulCplx**(V a, V b): returns `(a[j] + i.a[j + 1])(b[j] + i.b[j + 1])`
+* V **MulCplxConj**(V a, V b): returns `(a[j] + i.a[j + 1])(b[j] - i.b[j + 1])`
+* V **MulCplxAdd**(V a, V b, V c): returns
+  `(a[j] + i.a[j + 1])(b[j] + i.b[j + 1]) + (c[j] + i.c[j + 1])`
+* V **MulCplxConjAdd**(V a, V b, V c): returns
+  `(a[j] + i.a[j + 1])(b[j] - i.b[j + 1]) + (c[j] + i.c[j + 1])`
+* V **MaskedMulCplxConjAddOrZero**(M mask, V a, V b, V c): returns
+  `(a[j] + i.a[j + 1])(b[j] - i.b[j + 1]) + (c[j] + i.c[j + 1])` or `0` if
+  `mask[i]` is false.
+* V **MaskedMulCplxConjOrZero**(M mask, V a, V b): returns
+  `(a[j] + i.a[j + 1])(b[j] - i.b[j + 1])` or `0` if `mask[i]` is false.
+* V **MaskedMulCplxOr**(M mask, V a, V b, V c): returns
+  `(a[j] + i.a[j + 1])(b[j] + i.b[j + 1])` or `c[i]` if `mask[i]` is false.
 
 #### Shifts
 
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 2dde1479de..c38263d185 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -6013,6 +6013,130 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
   return svdot_u64(sum, a, b);
 }
 
+// ------------------------------ MulCplx* / MaskedMulCplx*
+
+// Per-target flag to prevent generic_ops-inl.h from defining MulCplx*.
+#ifdef HWY_NATIVE_CPLX
+#undef HWY_NATIVE_CPLX
+#else
+#define HWY_NATIVE_CPLX
+#endif
+
+// Negates the imaginary (odd) lanes; valid for signed-integer and float lanes.
+template <class V, HWY_IF_SIGNED_D(DFromV<V>)>
+HWY_API V CplxConj(V a) {
+  return OddEven(Neg(a), a);
+}
+
+namespace detail {
+// Defines NAME##ROT (all-true predicate) and NAME##Z##ROT (zeroing-masked)
+// wrappers around the SVE complex multiply-accumulate intrinsic OP with the
+// given rotation ROT (0/90/180/270 degrees).
+#define HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, ROT)      \
+  HWY_API HWY_SVE_V(BASE, BITS)                                          \
+      NAME##ROT(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b,        \
+                HWY_SVE_V(BASE, BITS) c) {                               \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b, c, ROT); \
+  }                                                                      \
+  HWY_API HWY_SVE_V(BASE, BITS)                                          \
+      NAME##Z##ROT(svbool_t m, HWY_SVE_V(BASE, BITS) a,                  \
+                   HWY_SVE_V(BASE, BITS) b, HWY_SVE_V(BASE, BITS) c) {   \
+    return sv##OP##_##CHAR##BITS##_z(m, a, b, c, ROT);                   \
+  }
+
+#define HWY_SVE_CPLX_FMA(BASE, CHAR, BITS, HALF, NAME, OP)  \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 0) \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 90) \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 180) \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 270)
+
+// Only SVE2 has complex multiply add for integer types
+// and these do not include masked variants
+HWY_SVE_FOREACH_F(HWY_SVE_CPLX_FMA, CplxMulAdd, cmla)
+#undef HWY_SVE_CPLX_FMA
+#undef HWY_SVE_CPLX_FMA_ROT
+}  // namespace detail
+
+// Native float paths: two cmla steps (rotations 0 + 270) compute
+// conj(b) * a + c with zeroing predication.
+template <class M, class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MaskedMulCplxConjAddOrZero(M mask, V a, V b, V c) {
+  return detail::CplxMulAddZ270(mask, detail::CplxMulAddZ0(mask, c, b, a), b,
+                                a);
+}
+
+template <class M, class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MaskedMulCplxConjOrZero(M mask, V a, V b) {
+  return MaskedMulCplxConjAddOrZero(mask, a, b, Zero(DFromV<V>()));
+}
+
+// Rotations 0 + 90 accumulate the full complex product a*b onto c.
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulCplxAdd(V a, V b, V c) {
+  return detail::CplxMulAdd90(detail::CplxMulAdd0(c, a, b), a, b);
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulCplx(V a, V b) {
+  return MulCplxAdd(a, b, Zero(DFromV<V>()));
+}
+
+template <class M, class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MaskedMulCplxOr(M mask, V a, V b, V c) {
+  return IfThenElse(mask, MulCplx(a, b), c);
+}
+
+// Rotations 0 + 270 accumulate a * conj(b) onto c.
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulCplxConjAdd(V a, V b, V c) {
+  return detail::CplxMulAdd270(detail::CplxMulAdd0(c, b, a), b, a);
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulCplxConj(V a, V b) {
+  return MulCplxConjAdd(a, b, Zero(DFromV<V>()));
+}
+
+// TODO SVE2 does have intrinsics for integers but not masked variants
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulCplx(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(Add(Mul(u, y), Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulCplxConj(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(Sub(Mul(v, x), Mul(u, y)), Add(Mul(u, x), Mul(v, y)));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulCplxAdd(V a, V b, V c) {
+  return Add(MulCplx(a, b), c);
+}
+
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulCplxConjAdd(V a, V b, V c) {
+  return Add(MulCplxConj(a, b), c);
+}
+
+template <class M, class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MaskedMulCplxConjAddOrZero(M mask, V a, V b, V c) {
+  return IfThenElseZero(mask, MulCplxConjAdd(a, b, c));
+}
+
+template <class M, class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MaskedMulCplxConjOrZero(M mask, V a, V b) {
+  return IfThenElseZero(mask, MulCplxConj(a, b));
+}
+
+template <class M, class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MaskedMulCplxOr(M mask, V a, V b, V c) {
+  return IfThenElse(mask, MulCplx(a, b), c);
+}
+
 // ------------------------------ AESRound / CLMul
 
 // Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 99b518d99c..ea371220b7 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -4304,6 +4304,71 @@ HWY_API V MulSub(V mul, V x, V sub) {
   return Sub(Mul(mul, x), sub);
 }
 #endif  // HWY_NATIVE_INT_FMA
 
+// ------------------------------ MulCplx* / MaskedMulCplx*
+
+#if (defined(HWY_NATIVE_CPLX) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_CPLX
+#undef HWY_NATIVE_CPLX
+#else
+#define HWY_NATIVE_CPLX
+#endif
+
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+
+// Negates the imaginary (odd) lanes; valid for signed-integer and float lanes.
+template <class V, HWY_IF_SIGNED_D(DFromV<V>)>
+HWY_API V CplxConj(V a) {
+  return OddEven(Neg(a), a);
+}
+
+template <class V>
+HWY_API V MulCplx(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(Add(Mul(u, y), Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
+}
+
+template <class V>
+HWY_API V MulCplxConj(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(Sub(Mul(v, x), Mul(u, y)), Add(Mul(u, x), Mul(v, y)));
+}
+
+template <class V>
+HWY_API V MulCplxAdd(V a, V b, V c) {
+  return Add(MulCplx(a, b), c);
+}
+
+template <class V>
+HWY_API V MulCplxConjAdd(V a, V b, V c) {
+  return Add(MulCplxConj(a, b), c);
+}
+
+template <class M, class V>
+HWY_API V MaskedMulCplxConjAddOrZero(M mask, V a, V b, V c) {
+  return IfThenElseZero(mask, MulCplxConjAdd(a, b, c));
+}
+
+template <class M, class V>
+HWY_API V MaskedMulCplxConjOrZero(M mask, V a, V b) {
+  return IfThenElseZero(mask, MulCplxConj(a, b));
+}
+
+template <class M, class V>
+HWY_API V MaskedMulCplxOr(M mask, V a, V b, V c) {
+  return IfThenElse(mask, MulCplx(a, b), c);
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+#endif  // HWY_NATIVE_CPLX
+
 // ------------------------------ Integer MulSub / NegMulSub
 
 #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
diff --git a/hwy/tests/complex_arithmetic_test.cc b/hwy/tests/complex_arithmetic_test.cc
new file mode 100644
index 0000000000..086e8eae17
--- /dev/null
+++ b/hwy/tests/complex_arithmetic_test.cc
@@ -0,0 +1,354 @@
+// Copyright 2023 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/complex_arithmetic_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestCplxConj {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a - ib)
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      expected[i + 0] = ConvertScalarTo<T>(a);
+      expected[i + 1] = ConvertScalarTo<T>(-b);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), CplxConj(v1));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllCplxConj() {
+  ForSignedTypes(ForShrinkableVectors<TestCplxConj>());
+  ForFloatTypes(ForShrinkableVectors<TestCplxConj>());
+}
+
+struct TestMulCplx {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c + id)
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      expected[i + 0] = ConvertScalarTo<T>((a * c) - (b * d));
+      expected[i + 1] = ConvertScalarTo<T>((a * d) + (b * c));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulCplx(v1, v2));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMulCplx() {
+  ForAllTypes(ForShrinkableVectors<TestMulCplx>());
+}
+
+struct TestMulCplxAdd {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+    const Vec<D> v3 = Iota(d, 15);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c + id) + e + if
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      auto e = ConvertScalarTo<T>(i + 15);
+      auto f = ConvertScalarTo<T>(i + 15 + 1);
+      expected[i + 0] = ConvertScalarTo<T>((a * c) - (b * d) + e);
+      expected[i + 1] = ConvertScalarTo<T>((a * d) + (b * c) + f);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulCplxAdd(v1, v2, v3));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMulCplxAdd() {
+  ForAllTypes(ForShrinkableVectors<TestMulCplxAdd>());
+}
+
+struct TestMaskedMulCplxOr {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+    const Vec<D> v3 = Iota(d, 15);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c + id)
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      auto e = ConvertScalarTo<T>(i + 15);
+      auto f = ConvertScalarTo<T>(i + 15 + 1);
+      if ((i % 4) ==
+          0) {  // Alternate between masking the real and imaginary lanes
+        bool_lanes[i + 0] = ConvertScalarTo<T>(1);
+        expected[i + 0] = ConvertScalarTo<T>((a * c) - (b * d));
+        bool_lanes[i + 1] = ConvertScalarTo<T>(0);
+        expected[i + 1] = ConvertScalarTo<T>(f);
+      } else {
+        bool_lanes[i + 0] = ConvertScalarTo<T>(0);
+        expected[i + 0] = ConvertScalarTo<T>(e);
+        bool_lanes[i + 1] = ConvertScalarTo<T>(1);
+        expected[i + 1] = ConvertScalarTo<T>((a * d) + (b * c));
+      }
+    }
+
+    const auto mask_i = Load(d, bool_lanes.get());
+    const Mask<D> mask = Gt(mask_i, Zero(d));
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MaskedMulCplxOr(mask, v1, v2, v3));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedMulCplxOr() {
+  ForAllTypes(ForShrinkableVectors<TestMaskedMulCplxOr>());
+}
+
+struct TestMulCplxConj {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c - id)
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      expected[i + 0] = ConvertScalarTo<T>((a * c) + (b * d));
+      expected[i + 1] = ConvertScalarTo<T>((b * c) - (a * d));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulCplxConj(v1, v2));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMulCplxConj() {
+  ForAllTypes(ForShrinkableVectors<TestMulCplxConj>());
+}
+
+struct TestMulCplxConjAdd {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+    const Vec<D> v3 = Iota(d, 15);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c - id) + e + if
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      auto e = ConvertScalarTo<T>(i + 15);
+      auto f = ConvertScalarTo<T>(i + 15 + 1);
+      expected[i + 0] = ConvertScalarTo<T>((e + (c * a)) + (d * b));
+      expected[i + 1] = ConvertScalarTo<T>((f + (c * b)) - (d * a));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulCplxConjAdd(v1, v2, v3));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMulCplxConjAdd() {
+  ForAllTypes(ForShrinkableVectors<TestMulCplxConjAdd>());
+}
+
+struct TestMaskedMulCplxConjOrZero {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c - id)
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      if ((i % 4) ==
+          0) {  // Alternate between masking the real and imaginary lanes
+        bool_lanes[i + 0] = ConvertScalarTo<T>(1);
+        expected[i + 0] = ConvertScalarTo<T>((a * c) + (b * d));
+        bool_lanes[i + 1] = ConvertScalarTo<T>(0);
+        expected[i + 1] = ConvertScalarTo<T>(0);
+      } else {
+        bool_lanes[i + 0] = ConvertScalarTo<T>(0);
+        expected[i + 0] = ConvertScalarTo<T>(0);
+        bool_lanes[i + 1] = ConvertScalarTo<T>(1);
+        expected[i + 1] = ConvertScalarTo<T>((b * c) - (a * d));
+      }
+    }
+
+    const auto mask_i = Load(d, bool_lanes.get());
+    const Mask<D> mask = Gt(mask_i, Zero(d));
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MaskedMulCplxConjOrZero(mask, v1, v2));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedMulCplxConjOrZero() {
+  ForAllTypes(ForShrinkableVectors<TestMaskedMulCplxConjOrZero>());
+}
+
+struct TestMaskedMulCplxConjAddOrZero {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const Vec<D> v1 = Iota(d, 2);
+    const Vec<D> v2 = Iota(d, 10);
+    const Vec<D> v3 = Iota(d, 15);
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    for (size_t i = 0; i < N; i += 2) {
+      // expected = (a + ib)(c - id) + e + if
+      auto a = ConvertScalarTo<T>(i + 2);
+      auto b = ConvertScalarTo<T>(i + 2 + 1);
+      auto c = ConvertScalarTo<T>(i + 10);
+      auto d = ConvertScalarTo<T>(i + 10 + 1);
+      auto e = ConvertScalarTo<T>(i + 15);
+      auto f = ConvertScalarTo<T>(i + 15 + 1);
+      if ((i % 4) ==
+          2) {  // Alternate between masking the real and imaginary lanes
+        bool_lanes[i + 0] = ConvertScalarTo<T>(1);
+        expected[i + 0] = ConvertScalarTo<T>((e + (c * a)) + (d * b));
+        bool_lanes[i + 1] = ConvertScalarTo<T>(0);
+        expected[i + 1] = ConvertScalarTo<T>(0);
+      } else {
+        bool_lanes[i + 0] = ConvertScalarTo<T>(0);
+        expected[i + 0] = ConvertScalarTo<T>(0);
+        bool_lanes[i + 1] = ConvertScalarTo<T>(1);
+        expected[i + 1] = ConvertScalarTo<T>((f + (c * b)) - (d * a));
+      }
+    }
+
+    const auto mask_i = Load(d, bool_lanes.get());
+    const Mask<D> mask = Gt(mask_i, Zero(d));
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(),
+                      MaskedMulCplxConjAddOrZero(mask, v1, v2, v3));
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedMulCplxConjAddOrZero() {
+  ForAllTypes(ForShrinkableVectors<TestMaskedMulCplxConjAddOrZero>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCplxTest);
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllCplxConj);
+
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMulCplx);
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMulCplxAdd);
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMaskedMulCplxOr);
+
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMulCplxConj);
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMulCplxConjAdd);
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMaskedMulCplxConjOrZero);
+HWY_EXPORT_AND_TEST_P(HwyCplxTest, TestAllMaskedMulCplxConjAddOrZero);
+HWY_AFTER_TEST();
+}  // namespace hwy
+
+#endif