From 91edfbde63f3cfd04c90f53631c67b10049be9c9 Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 09:56:44 -0400
Subject: [PATCH 1/7] wip

---
 Cargo.toml                                    |   1 -
 arith/Cargo.toml                              |   2 -
 arith/src/extension_field.rs                  |   4 -
 arith/src/extension_field/gf2_127.rs          |   9 -
 arith/src/extension_field/gf2_127/avx.rs      | 367 --------------
 arith/src/extension_field/gf2_127/neon.rs     |   0
 arith/src/extension_field/gf2_128.rs          |  13 +-
 .../gf2_128/{avx.rs => avx512.rs}             |   0
 arith/src/extension_field/gf2_128x4/avx256.rs | 466 ------------------
 arith/src/extension_field/gf2_128x8.rs        |  15 +-
 .../gf2_128x8/{avx.rs => avx512.rs}           |   0
 arith/src/field/m31.rs                        |   7 +-
 .../field/m31/{m31_avx.rs => m31_avx512.rs}   |   0
 arith/src/field/m31/m31x16.rs                 |  14 +-
 14 files changed, 26 insertions(+), 872 deletions(-)
 delete mode 100644 arith/src/extension_field/gf2_127.rs
 delete mode 100644 arith/src/extension_field/gf2_127/avx.rs
 delete mode 100644 arith/src/extension_field/gf2_127/neon.rs
 rename arith/src/extension_field/gf2_128/{avx.rs => avx512.rs} (100%)
 delete mode 100644 arith/src/extension_field/gf2_128x4/avx256.rs
 rename arith/src/extension_field/gf2_128x8/{avx.rs => avx512.rs} (100%)
 rename arith/src/field/m31/{m31_avx.rs => m31_avx512.rs} (100%)
diff --git a/Cargo.toml b/Cargo.toml
index 4e78d72d..b052b802 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -43,7 +43,6 @@ path = "src/utils.rs"
 default = []
 # default = [ "grinding" ]
 grinding = []
-avx256 = ["arith/avx256"]
 
 [workspace]
 members = ["arith", "bi-kzg"]
diff --git a/arith/Cargo.toml b/arith/Cargo.toml
index d05aaed5..f31a0bb0 100644
--- a/arith/Cargo.toml
+++ b/arith/Cargo.toml
@@ -27,5 +27,3 @@ harness = false
 name = "ext_field"
 harness = false
 
-[features]
-avx256 = []
diff --git a/arith/src/extension_field.rs b/arith/src/extension_field.rs
index e0f3547b..36a9c30c 100644
--- a/arith/src/extension_field.rs
+++ b/arith/src/extension_field.rs
@@ -1,16 +1,12 @@
 mod fr_ext;
-// mod gf2_127;
 mod gf2_128;
 mod gf2_128x8;
 mod m31_ext;
 mod m31_ext3x16;
 use crate::{Field, FieldSerde};
 
-// pub use gf2_127::*;
 pub use gf2_128::*;
 pub use gf2_128x8::GF2_128x8;
-#[cfg(target_arch = "x86_64")]
-pub use gf2_128x8::GF2_128x8_256;
 pub use m31_ext::M31Ext3;
 pub use m31_ext3x16::M31Ext3x16;
 
diff --git a/arith/src/extension_field/gf2_127.rs b/arith/src/extension_field/gf2_127.rs
deleted file mode 100644
index 306fa594..00000000
--- a/arith/src/extension_field/gf2_127.rs
+++ /dev/null
@@ -1,9 +0,0 @@
-#[cfg(target_arch = "aarch64")]
-pub(crate) mod neon;
-#[cfg(target_arch = "aarch64")]
-pub type GF2_127 = neon::NeonGF2_127;
-
-#[cfg(target_arch = "x86_64")]
-mod avx;
-#[cfg(target_arch = "x86_64")]
-pub type GF2_127 = avx::AVX512GF2_127;
diff --git a/arith/src/extension_field/gf2_127/avx.rs b/arith/src/extension_field/gf2_127/avx.rs
deleted file mode 100644
index 75480457..00000000
--- a/arith/src/extension_field/gf2_127/avx.rs
+++ /dev/null
@@ -1,367 +0,0 @@
-use std::iter::{Product, Sum};
-use std::{
-    arch::x86_64::*,
-    mem::transmute,
-    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
-};
-
-use crate::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, GF2};
-
-#[derive(Debug, Clone, Copy)]
-pub struct AVX512GF2_127 {
-    pub v: __m128i,
-}
-
-field_common!(AVX512GF2_127);
-
-impl FieldSerde for AVX512GF2_127 {
-    const SERIALIZED_SIZE: usize = 16;
-
-    #[inline(always)]
-    fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
-        unsafe { writer.write_all(transmute::<__m128i, [u8; 16]>(self.v).as_ref())? };
-        Ok(())
-    }
-
-    #[inline(always)]
-    fn deserialize_from<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
-        let mut u = [0u8; Self::SERIALIZED_SIZE];
-        reader.read_exact(&mut u)?;
-        u[Self::SERIALIZED_SIZE - 1] &= 0x7F; // Should we do a modular operation here?
-
-        unsafe {
-            Ok(AVX512GF2_127 {
-                v: transmute::<[u8; Self::SERIALIZED_SIZE], __m128i>(u),
-            })
-        }
-    }
-
-    #[inline(always)]
-    fn try_deserialize_from_ecc_format<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
-        let mut u = [0u8; 32];
-        reader.read_exact(&mut u)?;
-        assert!(u[15] <= 0x7F); // and ignoring 16 - 31
-        Ok(unsafe {
-            AVX512GF2_127 {
-                v: transmute::<[u8; 16], __m128i>(u[..16].try_into().unwrap()),
-            }
-        })
-    }
-}
-
-// mod x^127 + x + 1
-impl Field for AVX512GF2_127 {
-    const NAME: &'static str = "Galios Field 2^127";
-
-    const SIZE: usize = 128 / 8;
-
-    const FIELD_SIZE: usize = 127; // in bits
-
-    const ZERO: Self = AVX512GF2_127 {
-        v: unsafe { std::mem::zeroed() },
-    };
-
-    const ONE: Self = AVX512GF2_127 {
-        v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) },
-    };
-
-    const INV_2: Self = AVX512GF2_127 {
-        v: unsafe { std::mem::zeroed() },
-    }; // should not be used
-
-    #[inline(always)]
-    fn zero() -> Self {
-        AVX512GF2_127 {
-            v: unsafe { std::mem::zeroed() },
-        }
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        AVX512GF2_127 {
-            v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) }, 
-        }
-    }
-
-    #[inline(always)]
-    fn random_unsafe(mut rng: impl rand::RngCore) -> Self {
-        let mut u = [0u8; 16];
-        rng.fill_bytes(&mut u);
-        u[15] &= 0x7F;
-        unsafe {
-            AVX512GF2_127 {
-                v: *(u.as_ptr() as *const __m128i),
-            }
-        }
-    }
-
-    #[inline(always)]
-    fn random_bool(mut rng: impl rand::RngCore) -> Self {
-        AVX512GF2_127 {
-            v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([rng.next_u32() % 2, 0, 0, 0]) },
-        }
-    }
-
-    #[inline(always)]
-    fn is_zero(&self) -> bool {
-        unsafe { std::mem::transmute::<__m128i, [u8; 16]>(self.v) == [0; 16] }
-    }
-
-    #[inline(always)]
-    fn exp(&self, exponent: u128) -> Self {
-        let mut e = exponent;
-        let mut res = Self::one();
-        let mut t = *self;
-        while e > 0 {
-            if e & 1 == 1 {
-                res *= t;
-            }
-            t = t * t;
-            e >>= 1;
-        }
-        res
-    }
-
-    #[inline(always)]
-    fn inv(&self) -> Option<Self> {
-        if self.is_zero() {
-            return None;
-        }
-        let p_m2 = (1u128 << 127) - 2;
-        Some(Self::exp(self, p_m2))
-    }
-
-    #[inline(always)]
-    fn square(&self) -> Self {
-        self * self
-    }
-
-    #[inline(always)]
-    fn as_u32_unchecked(&self) -> u32 {
-        unimplemented!("u32 for GF127 doesn't make sense")
-    }
-
-    #[inline(always)]
-    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
-        let mut bytes = bytes.clone();
-        bytes[15] &= 0x7F;
-
-        unsafe {
-            AVX512GF2_127 {
-                v: transmute::<[u8; 16], __m128i>(bytes[..16].try_into().unwrap()),
-            }
-        }
-    }
-}
-
-impl ExtensionField for AVX512GF2_127 {
-    const DEGREE: usize = 127;
-
-    const W: u32 = 0x87;
-
-    const X: Self = AVX512GF2_127 {
-        v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([2, 0, 0, 0]) },
-    };
-
-    type BaseField = GF2;
-
-    #[inline(always)]
-    fn mul_by_base_field(&self, base: &Self::BaseField) -> Self {
-        if base.v == 0 {
-            Self::zero()
-        } else {
-            *self
-        }
-    }
-
-    #[inline(always)]
-    fn add_by_base_field(&self, base: &Self::BaseField) -> Self {
-        let mut res = *self;
-        res.v = unsafe { _mm_xor_si128(res.v, _mm_set_epi64x(0, base.v as i64)) };
-        res
-    }
-
-    /// 
-    #[inline]
-    fn mul_by_x(&self) -> Self {
-        unsafe {
-            // Shift left by 1 bit
-            let shifted = _mm_slli_epi64(self.v, 1);
-
-            // Get the most significant bit and move it
-            let msb = _mm_srli_epi64(self.v, 63); 
-            let msb_moved = _mm_slli_si128(msb, 8); 
-
-            // Combine the shifted value with the moved msb
-            let shifted_consolidated = _mm_or_si128(shifted, msb_moved);
-
-            // Create the reduction value (0b11) and the comparison value (1)
-            let reduction = {
-                let multiplier = _mm_set_epi64x(0, 0b11);
-                let one = _mm_set_epi64x(0, 1);
-
-                // Check if the MSB was 1 and create a mask
-                let mask = _mm_cmpeq_epi64(
-                    _mm_srli_si128(_mm_srli_epi64(shifted, 63), 8), 
-                    one);
-
-                _mm_and_si128(mask, multiplier)
-            };
-
-            // Apply the reduction conditionally
-            let res = _mm_xor_si128(shifted_consolidated, reduction);
-
-            Self { v: res }
-        }
-    }
-}
-
-impl From<GF2> for AVX512GF2_127 {
-    #[inline(always)]
-    fn from(v: GF2) -> Self {
-        AVX512GF2_127 {
-            v: unsafe { _mm_set_epi64x(0, v.v as i64) },
-        }
-    }
-}
-
-const X0TO126_MASK: __m128i = unsafe { transmute::<[u8; 16], __m128i>(
-    [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F])};
-const X127_MASK: __m128i = unsafe { transmute::<[u8; 16], __m128i>(
-    [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80])};
-const X127_REMINDER: __m128i = unsafe { transmute::<[u8; 16], __m128i>(
-    [0b11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80])};
-
-
-#[inline(always)]
-unsafe fn mm_bitshift_left<const count: usize>(x: __m128i) -> __m128i
-{
-    let mut carry = _mm_bslli_si128(x, 8); 
-    carry = _mm_srli_epi64(carry, 64 - count);  
-    let x = _mm_slli_epi64(x, count);
-    _mm_or_si128(x, carry)
-}
-
-
-#[inline]
-unsafe fn gfmul(a: __m128i, b: __m128i) -> __m128i {
-    let xmm_mask = _mm_setr_epi32((0xFFffffff_u32) as i32, 0x0, 0x0, 0x0);
-
-    // a = a0|a1, b = b0|b1
-
-    let mut tmp3 = _mm_clmulepi64_si128(a, b, 0x00); // tmp3 = a0 * b0
-    let mut tmp6 = _mm_clmulepi64_si128(a, b, 0x11); // tmp6 = a1 * b1
-
-    // 78 = 0b0100_1110
-    let mut tmp4 = _mm_shuffle_epi32(a, 78); // tmp4 = a1|a0
-    let mut tmp5 = _mm_shuffle_epi32(b, 78); // tmp5 = b1|b0
-    tmp4 = _mm_xor_si128(tmp4, a); // tmp4 = (a0 + a1) | (a0 + a1)
-    tmp5 = _mm_xor_si128(tmp5, b); // tmp5 = (b0 + b1) | (b0 + b1)
-
-    tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); // tmp4 = (a0 + a1) * (b0 + b1)
-    tmp4 = _mm_xor_si128(tmp4, tmp3); // tmp4 = (a0 + a1) * (b0 + b1) - a0 * b0
-    tmp4 = _mm_xor_si128(tmp4, tmp6); // tmp4 = (a0 + a1) * (b0 + b1) - a0 * b0 - a1 * b1 = a0 * b1 + a1 * b0
-
-    // tmp4 = e1 | e0
-    tmp5 = _mm_slli_si128(tmp4, 8); // tmp5 = e0 | 00
-    tmp4 = _mm_srli_si128(tmp4, 8); // tmp4 = 00 | e1
-    tmp3 = _mm_xor_si128(tmp3, tmp5); // the lower 128 bits, deg 0 - 127
-    tmp6 = _mm_xor_si128(tmp6, tmp4); // the higher 128 bits, deg 128 - 252, the 124 least signicicant bits are non-zero
-
-    // x^0 - x^126
-    let x0to126 = _mm_and_si128(tmp3, X0TO126_MASK);
-
-    // x^127 
-   tmp4 = _mm_and_si128(tmp3, X127_MASK);
-   tmp4 = _mm_cmpeq_epi64(tmp4, X127_MASK);
-   tmp4 = _mm_srli_si128(tmp4, 15);
-   let x127 = _mm_and_si128(tmp4, X127_REMINDER);
-
-   // x^128 - x^252
-   let x128to252 = 
-    _mm_and_si128(
-        mm_bitshift_left::<2>(tmp6),
-        mm_bitshift_left::<1>(tmp6),
-    );
-
-    _mm_and_si128(_mm_and_si128(x0to126, x127), x128to252)
-
-    // let mut tmp7 = _mm_srli_epi32(tmp6, 31);
-    // let mut tmp8 = _mm_srli_epi32(tmp6, 30);
-    // let tmp9 = _mm_srli_epi32(tmp6, 25);
-
-    // tmp7 = _mm_xor_si128(tmp7, tmp8);
-    // tmp7 = _mm_xor_si128(tmp7, tmp9);
-
-    // tmp8 = _mm_shuffle_epi32(tmp7, 147);
-    // tmp7 = _mm_and_si128(xmm_mask, tmp8);
-    // tmp8 = _mm_andnot_si128(xmm_mask, tmp8);
-
-    // tmp3 = _mm_xor_si128(tmp3, tmp8);
-    // tmp6 = _mm_xor_si128(tmp6, tmp7);
-
-    // let tmp10 = _mm_slli_epi32(tmp6, 1);
-    // tmp3 = _mm_xor_si128(tmp3, tmp10);
-
-    // let tmp11 = _mm_slli_epi32(tmp6, 2);
-    // tmp3 = _mm_xor_si128(tmp3, tmp11);
-
-    // let tmp12 = _mm_slli_epi32(tmp6, 7);
-    // tmp3 = _mm_xor_si128(tmp3, tmp12);
-
-    // _mm_xor_si128(tmp3, tmp6)
-
-}
-
-impl Default for AVX512GF2_127 {
-    #[inline(always)]
-    fn default() -> Self {
-        Self::zero()
-    }
-}
-
-impl PartialEq for AVX512GF2_127 {
-    #[inline(always)]
-    fn eq(&self, other: &Self) -> bool {
-        unsafe { _mm_test_all_ones(_mm_cmpeq_epi8(self.v, other.v)) == 1 }
-    }
-}
-
-impl Neg for AVX512GF2_127 {
-    type Output = Self;
-
-    #[inline(always)]
-    fn neg(self) -> Self {
-        self
-    }
-}
-
-impl From<u32> for AVX512GF2_127 {
-    #[inline(always)]
-    fn from(v: u32) -> Self {
-        AVX512GF2_127 {
-            v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([v, 0, 0, 0]) },
-        }
-    }
-}
-
-#[inline(always)]
-fn add_internal(a: &AVX512GF2_127, b: &AVX512GF2_127) -> AVX512GF2_127 {
-    AVX512GF2_127 {
-        v: unsafe { _mm_xor_si128(a.v, b.v) },
-    }
-}
-
-#[inline(always)]
-fn sub_internal(a: &AVX512GF2_127, b: &AVX512GF2_127) -> AVX512GF2_127 {
-    AVX512GF2_127 {
-        v: unsafe { _mm_xor_si128(a.v, b.v) },
-    }
-}
-
-#[inline(always)]
-fn mul_internal(a: &AVX512GF2_127, b: &AVX512GF2_127) -> AVX512GF2_127 {
-    AVX512GF2_127 {
-        v: unsafe { gfmul(a.v, b.v) },
-    }
-}
diff --git a/arith/src/extension_field/gf2_127/neon.rs b/arith/src/extension_field/gf2_127/neon.rs
deleted file mode 100644
index e69de29b..00000000
diff --git a/arith/src/extension_field/gf2_128.rs b/arith/src/extension_field/gf2_128.rs
index e5cb17bb..8ba6fc03 100644
--- a/arith/src/extension_field/gf2_128.rs
+++ b/arith/src/extension_field/gf2_128.rs
@@ -3,7 +3,12 @@ pub(crate) mod neon;
 #[cfg(target_arch = "aarch64")]
 pub type GF2_128 = neon::NeonGF2_128;
 
-#[cfg(target_arch = "x86_64")]
-mod avx;
-#[cfg(target_arch = "x86_64")]
-pub type GF2_128 = avx::AVX512GF2_128;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+mod avx512;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub type GF2_128 = avx512::AVX512GF2_128;
+
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+mod avx256;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub type GF2_128 = avx256::AVX512GF2_128;
diff --git a/arith/src/extension_field/gf2_128/avx.rs b/arith/src/extension_field/gf2_128/avx512.rs
similarity index 100%
rename from arith/src/extension_field/gf2_128/avx.rs
rename to arith/src/extension_field/gf2_128/avx512.rs
diff --git a/arith/src/extension_field/gf2_128x4/avx256.rs b/arith/src/extension_field/gf2_128x4/avx256.rs
deleted file mode 100644
index 1783f708..00000000
--- a/arith/src/extension_field/gf2_128x4/avx256.rs
+++ /dev/null
@@ -1,466 +0,0 @@
-use crate::field_common;
-
-use crate::{Field, FieldSerde, FieldSerdeResult, SimdField, GF2_128};
-use std::fmt::Debug;
-use std::{
-    arch::x86_64::*,
-    iter::{Product, Sum},
-    mem::transmute,
-    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
-};
-
-#[derive(Clone, Copy)]
-pub struct AVX256GF2_128x4 {
-    data: [__m256i; 2];
-}
-
-field_common!(AVX256GF2_128x4);
-
-impl AVX256GF2_128x4 {
-    #[inline(always)]
-    pub(crate) fn pack_full(data: __m128i) -> [__m256i; 2] {
-        unsafe { [_mm256_broadcast_i32x4(data), _mm256_broadcast_i32x4(data)] }
-    }
-}
-
-impl FieldSerde for AVX256GF2_128x4 {
-    const SERIALIZED_SIZE: usize = 512 / 8;
-
-    #[inline(always)]
-    fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
-        unsafe {
-            let mut data = [0u8; 64];
-            _mm256_storeu_si256(data.as_mut_ptr() as *mut i32, self.data[0]);
-            _mm256_storeu_si256(data.as_mut_ptr().add(32) as *mut i32, self.data[1]);
-            writer.write_all(&data)?;
-        }
-        Ok(())
-    }
-    #[inline(always)]
-    fn deserialize_from<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
-        let mut data = [0u8; Self::SERIALIZED_SIZE];
-        reader.read_exact(&mut data)?;
-        unsafe {
-            Ok(Self {
-                data: [_mm256_loadu_si256(data.as_ptr() as *const i32), _mm256_loadu_si256(data.as_ptr().add(8) as *const i32)],
-            })
-        }
-    }
-
-    #[inline(always)]
-    fn try_deserialize_from_ecc_format<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
-        let mut buf = [0u8; 32];
-        reader.read_exact(&mut buf)?;
-        let data: __m128i = unsafe { _mm_loadu_si128(buf.as_ptr() as *const __m128i) };
-        Ok(Self {
-            data: Self::pack_full(data),
-        })
-    }
-}
-
-const PACKED_0: __m256i = unsafe { transmute([0; 4]) };
-
-const PACKED_INV_2: __m256i = unsafe {
-    transmute([
-        67_u64,
-        (1_u64) << 63,
-        67_u64,
-        (1_u64) << 63,
-    ])
-};
-
-// p(x) = x^128 + x^7 + x^2 + x + 1
-impl Field for AVX256GF2_128x4 {
-    const NAME: &'static str = "AVX256 Galios Field 2^128";
-
-    // size in bytes
-    const SIZE: usize = 512 / 8;
-
-    const ZERO: Self = Self { data: PACKED_0 };
-
-    const INV_2: Self = Self { data: PACKED_INV_2 };
-
-    const FIELD_SIZE: usize = 128;
-
-    #[inline(always)]
-    fn zero() -> Self {
-        unsafe {
-            let zero = _mm256_setzero_si256();
-            Self { data: [zero, zero] }
-        }
-    }
-
-    #[inline(always)]
-    fn is_zero(&self) -> bool {
-        unsafe {
-            let zero = _mm256_setzero_si256();
-            let cmp = _mm256_cmpeq_epi64_mask(self.data[0], zero) & _mm256_cmpeq_epi64_mask(self.data[1], zero);
-            cmp == 0xFF // All 8 64-bit integers are equal (zero)
-        }
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        unsafe {
-            let one = _mm256_set_epi64(0, 1, 0, 1);
-            Self { data: [one, one] }
-        }
-    }
-
-    #[inline(always)]
-    fn random_unsafe(mut rng: impl rand::RngCore) -> Self {
-        let data = unsafe {
-            _mm256_set_epi64(
-                rng.next_u64() as i64,
-                rng.next_u64() as i64,
-                rng.next_u64() as i64,
-                rng.next_u64() as i64,
-            )
-        };
-        Self { data }
-    }
-
-    #[inline(always)]
-    fn random_bool(mut rng: impl rand::RngCore) -> Self {
-        let data = unsafe {
-            _mm256_set_epi64(
-                0,
-                (rng.next_u64() % 2) as i64,
-                0,
-                (rng.next_u64() % 2) as i64,
-            )
-        };
-        Self { data }
-    }
-
-    #[inline(always)]
-    fn exp(&self, exponent: u128) -> Self {
-        let mut e = exponent;
-        let mut res = Self::one();
-        let mut t = *self;
-        while e != 0 {
-            let b = e & 1;
-            if b == 1 {
-                res *= t;
-            }
-            t = t * t;
-            e >>= 1;
-        }
-        res
-    }
-
-    #[inline(always)]
-    fn inv(&self) -> Option<Self> {
-        if self.is_zero() {
-            return None;
-        }
-        let p_m2 = !(0u128) - 1;
-        Some(Self::exp(self, p_m2))
-    }
-
-    #[inline(always)]
-    fn as_u32_unchecked(&self) -> u32 {
-        unimplemented!("self is a vector, cannot convert to u32")
-    }
-
-    #[inline(always)]
-    fn from_uniform_bytes(_bytes: &[u8; 32]) -> Self {
-        todo!()
-    }
-
-    #[inline(always)]
-    fn square(&self) -> Self {
-        *self * *self
-    }
-
-    #[inline(always)]
-    fn double(&self) -> Self {
-        Self::ZERO
-    }
-
-    #[inline(always)]
-    fn mul_by_2(&self) -> Self {
-        Self::ZERO
-    }
-
-    #[inline(always)]
-    fn mul_by_3(&self) -> Self {
-        *self
-    }
-
-    #[inline(always)]
-    fn mul_by_5(&self) -> Self {
-        *self
-    }
-
-    #[inline(always)]
-    fn mul_by_6(&self) -> Self {
-        Self::ZERO
-    }
-}
-/*
-credit to intel for the original implementation
-void gfmul(__m128i a, __m128i b, __m128i *res) {
-    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
-    __m128i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
-    __m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
-
-    // a = a0|a1, b = b0|b1
-
-    tmp3 = _mm_clmulepi64_si128(a, b, 0x00); // tmp3 = a0 * b0
-    tmp6 = _mm_clmulepi64_si128(a, b, 0x11); // tmp6 = a1 * b1
-
-    tmp4 = _mm_shuffle_epi32(a, 78); // tmp4 = a1|a0
-    tmp5 = _mm_shuffle_epi32(b, 78); // tmp5 = b1|b0
-    tmp4 = _mm_xor_si128(tmp4, a); // tmp4 = (a0 + a1) | (a0 + a1)
-    tmp5 = _mm_xor_si128(tmp5, b); // tmp5 = (b0 + b1) | (b0 + b1)
-
-    tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); // tmp4 = (a0 + a1) * (b0 + b1)
-    tmp4 = _mm_xor_si128(tmp4, tmp3); // tmp4 = (a0 + a1) * (b0 + b1) - a0 * b0
-    tmp4 = _mm_xor_si128(tmp4, tmp6); // tmp4 = (a0 + a1) * (b0 + b1) - a0 * b0 - a1 * b1 = a0 * b1 + a1 * b0
-
-    tmp5 = _mm_slli_si128(tmp4, 8);
-    tmp4 = _mm_srli_si128(tmp4, 8);
-    tmp3 = _mm_xor_si128(tmp3, tmp5);
-    tmp6 = _mm_xor_si128(tmp6, tmp4);
-
-    tmp7 = _mm_srli_epi32(tmp6, 31);
-    tmp8 = _mm_srli_epi32(tmp6, 30);
-    tmp9 = _mm_srli_epi32(tmp6, 25);
-
-    tmp7 = _mm_xor_si128(tmp7, tmp8);
-    tmp7 = _mm_xor_si128(tmp7, tmp9);
-
-    tmp8 = _mm_shuffle_epi32(tmp7, 147);
-    tmp7 = _mm_and_si128(XMMMASK, tmp8);
-    tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
-
-    tmp3 = _mm_xor_si128(tmp3, tmp8);
-    tmp6 = _mm_xor_si128(tmp6, tmp7);
-
-    tmp10 = _mm_slli_epi32(tmp6, 1);
-    tmp3 = _mm_xor_si128(tmp3, tmp10);
-
-    tmp11 = _mm_slli_epi32(tmp6, 2);
-    tmp3 = _mm_xor_si128(tmp3, tmp11);
-
-    tmp12 = _mm_slli_epi32(tmp6, 7);
-    tmp3 = _mm_xor_si128(tmp3, tmp12);
-
-    *res = _mm_xor_si128(tmp3, tmp6);
-}
-
-*/
-
-/*
-AVX 512 version
-void gfmul_avx512(__m512i a, __m512i b, __m512i *res) {
-    __m512i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
-    __m512i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
-    __m512i XMMMASK = _mm512_set_epi32(
-        0, 0, 0, 0xffffffff,
-        0, 0, 0, 0xffffffff,
-        0, 0, 0, 0xffffffff,
-        0, 0, 0, 0xffffffff
-    );
-
-    tmp3 = _mm512_clmulepi64_epi128(a, b, 0x00);
-    tmp6 = _mm512_clmulepi64_epi128(a, b, 0x11);
-
-    tmp4 = _mm512_shuffle_epi32(a, _MM_PERM_BADC);
-    tmp5 = _mm512_shuffle_epi32(b, _MM_PERM_BADC);
-    tmp4 = _mm512_xor_si512(tmp4, a);
-    tmp5 = _mm512_xor_si512(tmp5, b);
-
-    tmp4 = _mm512_clmulepi64_epi128(tmp4, tmp5, 0x00);
-    tmp4 = _mm512_xor_si512(tmp4, tmp3);
-    tmp4 = _mm512_xor_si512(tmp4, tmp6);
-
-    tmp5 = _mm512_bslli_epi128(tmp4, 8);
-    tmp4 = _mm512_bsrli_epi128(tmp4, 8);
-    tmp3 = _mm512_xor_si512(tmp3, tmp5);
-    tmp6 = _mm512_xor_si512(tmp6, tmp4);
-
-    tmp7 = _mm512_srli_epi32(tmp6, 31);
-    tmp8 = _mm512_srli_epi32(tmp6, 30);
-    tmp9 = _mm512_srli_epi32(tmp6, 25);
-
-    tmp7 = _mm512_xor_si512(tmp7, tmp8);
-    tmp7 = _mm512_xor_si512(tmp7, tmp9);
-
-    tmp8 = _mm512_shuffle_epi32(tmp7, _MM_PERM_ABCD);
-    tmp7 = _mm512_and_si512(XMMMASK, tmp8);
-    tmp8 = _mm512_andnot_si512(XMMMASK, tmp8);
-
-    tmp3 = _mm512_xor_si512(tmp3, tmp8);
-    tmp6 = _mm512_xor_si512(tmp6, tmp7);
-
-    tmp10 = _mm512_slli_epi32(tmp6, 1);
-    tmp3 = _mm512_xor_si512(tmp3, tmp10);
-
-    tmp11 = _mm512_slli_epi32(tmp6, 2);
-    tmp3 = _mm512_xor_si512(tmp3, tmp11);
-
-    tmp12 = _mm512_slli_epi32(tmp6, 7);
-    tmp3 = _mm512_xor_si512(tmp3, tmp12);
-
-    *res = _mm512_xor_si512(tmp3, tmp6);
-}
- */
-
-impl From<u32> for AVX256GF2_128x4 {
-    #[inline(always)]
-    fn from(v: u32) -> AVX256GF2_128x4 {
-        assert!(v < 2); // only 0 and 1 are allowed
-        let data = unsafe { [_mm256_set_epi64(0, v as i64, 0, v as i64, 0), _mm256_set_epi64(0, v as i64, 0, v as i64, 0)] };
-        AVX256GF2_128x4 { data }
-    }
-}
-
-impl Neg for AVX256GF2_128x4 {
-    type Output = AVX256GF2_128x4;
-
-    #[inline(always)]
-    fn neg(self) -> AVX256GF2_128x4 {
-        self
-    }
-}
-
-impl Debug for AVX256GF2_128x4 {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let mut data = [0u8; 64];
-        unsafe {
-            _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, self.data[0]);
-            _mm256_storeu_si256(data.as_mut_ptr().add(8) as *mut __m256i, self.data[1]);
-        }
-        f.debug_struct("AVX256GF2_128x4")
-            .field("data", &data)
-            .finish()
-    }
-}
-
-impl PartialEq for AVX256GF2_128x4 {
-    #[inline(always)]
-    fn eq(&self, other: &Self) -> bool {
-        unsafe {
-            let cmp = _mm256_cmpeq_epi64_mask(self.data[0], other.data[0]) & _mm256_cmpeq_epi64_mask(self.data[1], other.data[1]);
-            cmp == 0xFF // All 8 64-bit integers are equal
-        }
-    }
-}
-
-impl Default for AVX256GF2_128x4 {
-    #[inline(always)]
-    fn default() -> Self {
-        Self::zero()
-    }
-}
-
-impl From<GF2_128> for AVX256GF2_128x4 {
-    #[inline(always)]
-    fn from(v: GF2_128) -> AVX256GF2_128x4 {
-        unsafe {
-            let mut result = [_mm256_setzero_si256(), _mm256_setzero_si256()]; // Initialize a zeroed _m512i
-            result[0] = _mm256_inserti32x4(result[0], v.v, 0); // Insert `a` at position 0
-            result[0] = _mm256_inserti32x4(result[0], v.v, 1); // Insert `b` at position 1
-            result[1] = _mm256_inserti32x4(result[1], v.v, 2); // Insert `c` at position 2
-            result[1] = _mm256_inserti32x4(result[1], v.v, 3); // Insert `d` at position 3
-            AVX256GF2_128x4 { data: result }
-        }
-    }
-}
-
-impl SimdField for AVX256GF2_128x4 {
-    #[inline(always)]
-    fn scale(&self, challenge: &Self::Scalar) -> Self {
-        let simd_challenge = AVX256GF2_128x4::from(*challenge);
-        *self * simd_challenge
-    }
-    type Scalar = GF2_128;
-
-    #[inline(always)]
-    fn pack_size() -> usize {
-        4
-    }
-}
-
-#[inline(always)]
-fn add_internal(a: &AVX256GF2_128x4, b: &AVX256GF2_128x4) -> AVX256GF2_128x4 {
-    unsafe {
-        AVX256GF2_128x4 {
-            data: [_mm256_xor_si256(a.data[0], b.data[0]), _mm256_xor_si256(a.data[1], b.data[1])],
-        }
-    }
-}
-
-#[inline(always)]
-fn sub_internal(a: &AVX256GF2_128x4, b: &AVX256GF2_128x4) -> AVX256GF2_128x4 {
-    unsafe {
-        AVX256GF2_128x4 {
-            data: [_mm256_xor_si256(a.data[0], b.data[0]), _mm256_xor_si256(a.data[1], b.data[1])],
-        }
-    }
-}
-
-#[inline]
-fn mul_internal(a: &AVX256GF2_128x4, b: &AVX256GF2_128x4) -> AVX256GF2_128x4 {
-    unsafe {
-        let xmmmask = _mm256_set_epi32(
-            0,
-            0,
-            0,
-            0xffffffffu32 as i32,
-            0,
-            0,
-            0,
-            0xffffffffu32 as i32,
-        );
-        let mut result = [_mm256_setzero_si256(), _mm256_setzero_si256()];
-        for i in 0..2 {
-
-            let mut tmp3 = _mm256_clmulepi64_epi128(a.data[i], b.data[i], 0x00);
-            let mut tmp6 = _mm256_clmulepi64_epi128(a.data[i], b.data[i], 0x11);
-
-            let mut tmp4 = _mm256_shuffle_epi32(a.data[i], _MM_PERM_BADC);
-            let mut tmp5 = _mm256_shuffle_epi32(b.data[i], _MM_PERM_BADC);
-            tmp4 = _mm256_xor_si256(tmp4, a.data[i]);
-            tmp5 = _mm256_xor_si256(tmp5, b.data[i]);
-
-            tmp4 = _mm256_clmulepi64_epi128(tmp4, tmp5, 0x00);
-            tmp4 = _mm256_xor_si256(tmp4, tmp3);
-            tmp4 = _mm256_xor_si256(tmp4, tmp6);
-
-            tmp5 = _mm256_bslli_epi128(tmp4, 8);
-            tmp4 = _mm256_bsrli_epi128(tmp4, 8);
-            tmp3 = _mm256_xor_si256(tmp3, tmp5);
-            tmp6 = _mm256_xor_si256(tmp6, tmp4);
-
-            let tmp7 = _mm256_srli_epi32(tmp6, 31);
-            let tmp8 = _mm256_srli_epi32(tmp6, 30);
-            let tmp9 = _mm256_srli_epi32(tmp6, 25);
-
-            let mut tmp7 = _mm256_xor_si256(tmp7, tmp8);
-            tmp7 = _mm256_xor_si256(tmp7, tmp9);
-
-            let mut tmp8 = _mm256_shuffle_epi32(tmp7, _MM_PERM_CBAD);
-            tmp7 = _mm256_and_si256(xmmmask, tmp8);
-            tmp8 = _mm256_andnot_si256(xmmmask, tmp8);
-
-            tmp3 = _mm256_xor_si256(tmp3, tmp8);
-            tmp6 = _mm256_xor_si256(tmp6, tmp7);
-
-            let tmp10 = _mm256_slli_epi32(tmp6, 1);
-            tmp3 = _mm256_xor_si256(tmp3, tmp10);
-
-            let tmp11 = _mm256_slli_epi32(tmp6, 2);
-            tmp3 = _mm256_xor_si256(tmp3, tmp11);
-
-            let tmp12 = _mm256_slli_epi32(tmp6, 7);
-            tmp3 = _mm256_xor_si256(tmp3, tmp12);
-
-            result[i] = _mm256_xor_si256(tmp3, tmp6);
-
-        }
-        AVX256GF2_128x4 { data: result }
-    }
-}
diff --git a/arith/src/extension_field/gf2_128x8.rs b/arith/src/extension_field/gf2_128x8.rs
index ba393103..e250159d 100644
--- a/arith/src/extension_field/gf2_128x8.rs
+++ b/arith/src/extension_field/gf2_128x8.rs
@@ -3,13 +3,12 @@ pub(crate) mod neon;
 #[cfg(target_arch = "aarch64")]
 pub type GF2_128x8 = neon::NeonGF2_128x8;
 
-#[cfg(target_arch = "x86_64")]
-mod avx;
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+mod avx512;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub type GF2_128x8 = avx512::AVX512GF2_128x8;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
 mod avx256;
-#[cfg(target_arch = "x86_64")]
-pub type GF2_128x8_256 = avx256::AVX256GF2_128x8;
-#[cfg(all(target_arch = "x86_64", feature = "avx256"))]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
 pub type GF2_128x8 = avx256::AVX256GF2_128x8;
-#[cfg(all(target_arch = "x86_64", not(feature = "avx256")))]
-pub type GF2_128x8 = avx::AVX512GF2_128x8;
diff --git a/arith/src/extension_field/gf2_128x8/avx.rs b/arith/src/extension_field/gf2_128x8/avx512.rs
similarity index 100%
rename from arith/src/extension_field/gf2_128x8/avx.rs
rename to arith/src/extension_field/gf2_128x8/avx512.rs
diff --git a/arith/src/field/m31.rs b/arith/src/field/m31.rs
index 628f3add..6c12ff27 100644
--- a/arith/src/field/m31.rs
+++ b/arith/src/field/m31.rs
@@ -1,11 +1,12 @@
 mod m31x16;
 pub use m31x16::M31x16;
 
-#[cfg(target_arch = "x86_64")]
-pub(crate) mod m31_avx;
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
 pub(crate) mod m31_avx256;
 
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub(crate) mod m31_avx512;
+
 #[cfg(target_arch = "x86_64")]
 pub type M31x16_256 = m31_avx256::AVXM31;
 
diff --git a/arith/src/field/m31/m31_avx.rs b/arith/src/field/m31/m31_avx512.rs
similarity index 100%
rename from arith/src/field/m31/m31_avx.rs
rename to arith/src/field/m31/m31_avx512.rs
diff --git a/arith/src/field/m31/m31x16.rs b/arith/src/field/m31/m31x16.rs
index c00456d2..b111b6e7 100644
--- a/arith/src/field/m31/m31x16.rs
+++ b/arith/src/field/m31/m31x16.rs
@@ -1,14 +1,12 @@
 // A M31x16 stores 512 bits of data.
 // With AVX it stores a single __m512i element.
 // With NEON it stores four uint32x4_t elements.
-#[cfg(target_arch = "x86_64")]
-cfg_if::cfg_if! {
-    if #[cfg(feature = "avx256")] {
-        pub type M31x16 = super::m31_avx256::AVXM31;
-    } else {
-        pub type M31x16 = super::m31_avx::AVXM31;
-    }
-}
 
 #[cfg(target_arch = "aarch64")]
 pub type M31x16 = super::m31_neon::NeonM31;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub type M31x16 = super::m31_avx512::AVXM31;
+
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub type M31x16 = super::m31_avx256::AVXM31;

From e5ab6982b89304d23bdc505242e45f9508154153 Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 10:07:17 -0400
Subject: [PATCH 2/7] fix

---
 .github/workflows/ci.yml                    |  6 +--
 arith/src/extension_field/gf2_128.rs        |  4 +-
 arith/src/extension_field/gf2_128/avx256.rs | 58 ++++++++++-----------
 arith/src/extension_field/gf2_128x8.rs      |  8 +--
 readme.md                                   | 11 +++-
 5 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4dd75704..33a1f29d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,13 +67,13 @@ jobs:
             feature: avx2
             field: gf2ext128
           - os: 7950x3d
-            feature: avx512
+            feature: avx512f
             field: m31ext3
           - os: 7950x3d
-            feature: avx512
+            feature: avx512f
             field: fr
           - os: 7950x3d
-            feature: avx512
+            feature: avx512f
             field: gf2ext128
     steps:
       - uses: actions/checkout@v4
diff --git a/arith/src/extension_field/gf2_128.rs b/arith/src/extension_field/gf2_128.rs
index 8ba6fc03..334e8cd7 100644
--- a/arith/src/extension_field/gf2_128.rs
+++ b/arith/src/extension_field/gf2_128.rs
@@ -10,5 +10,5 @@ pub type GF2_128 = avx512::AVX512GF2_128;
 
 #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
 mod avx256;
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-pub type GF2_128 = avx256::AVX512GF2_128;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub type GF2_128 = avx256::AVX256GF2_128;
diff --git a/arith/src/extension_field/gf2_128/avx256.rs b/arith/src/extension_field/gf2_128/avx256.rs
index f3984f88..ba090b8b 100644
--- a/arith/src/extension_field/gf2_128/avx256.rs
+++ b/arith/src/extension_field/gf2_128/avx256.rs
@@ -8,13 +8,13 @@ use std::{
 use crate::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, GF2};
 
 #[derive(Debug, Clone, Copy)]
-pub struct AVX512GF2_128 {
+pub struct AVX256GF2_128 {
     pub v: __m128i,
 }
 
-field_common!(AVX512GF2_128);
+field_common!(AVX256GF2_128);
 
-impl FieldSerde for AVX512GF2_128 {
+impl FieldSerde for AVX256GF2_128 {
     const SERIALIZED_SIZE: usize = 16;
 
     #[inline(always)]
@@ -28,7 +28,7 @@ impl FieldSerde for AVX512GF2_128 {
         let mut u = [0u8; Self::SERIALIZED_SIZE];
         reader.read_exact(&mut u)?;
         unsafe {
-            Ok(AVX512GF2_128 {
+            Ok(AVX256GF2_128 {
                 v: transmute::<[u8; Self::SERIALIZED_SIZE], __m128i>(u),
             })
         }
@@ -39,42 +39,42 @@ impl FieldSerde for AVX512GF2_128 {
         let mut u = [0u8; 32];
         reader.read_exact(&mut u)?;
         Ok(unsafe {
-            AVX512GF2_128 {
+            AVX256GF2_128 {
                 v: transmute::<[u8; 16], __m128i>(u[..16].try_into().unwrap()),
             }
         })
     }
 }
 
-impl Field for AVX512GF2_128 {
+impl Field for AVX256GF2_128 {
     const NAME: &'static str = "Galios Field 2^128";
 
     const SIZE: usize = 128 / 8;
 
     const FIELD_SIZE: usize = 128; // in bits
 
-    const ZERO: Self = AVX512GF2_128 {
+    const ZERO: Self = AVX256GF2_128 {
         v: unsafe { std::mem::zeroed() },
     };
 
-    const ONE: Self = AVX512GF2_128 {
+    const ONE: Self = AVX256GF2_128 {
         v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) },
     };
 
-    const INV_2: Self = AVX512GF2_128 {
+    const INV_2: Self = AVX256GF2_128 {
         v: unsafe { std::mem::zeroed() },
     }; // should not be used
 
     #[inline(always)]
     fn zero() -> Self {
-        AVX512GF2_128 {
+        AVX256GF2_128 {
             v: unsafe { std::mem::zeroed() },
         }
     }
 
     #[inline(always)]
     fn one() -> Self {
-        AVX512GF2_128 {
+        AVX256GF2_128 {
             // 1 in the first bit
             v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) }, // TODO check bit order
         }
@@ -85,7 +85,7 @@ impl Field for AVX512GF2_128 {
         let mut u = [0u8; 16];
         rng.fill_bytes(&mut u);
         unsafe {
-            AVX512GF2_128 {
+            AVX256GF2_128 {
                 v: *(u.as_ptr() as *const __m128i),
             }
         }
@@ -93,7 +93,7 @@ impl Field for AVX512GF2_128 {
 
     #[inline(always)]
     fn random_bool(mut rng: impl rand::RngCore) -> Self {
-        AVX512GF2_128 {
+        AVX256GF2_128 {
             v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([rng.next_u32() % 2, 0, 0, 0]) },
         }
     }
@@ -140,19 +140,19 @@ impl Field for AVX512GF2_128 {
     #[inline(always)]
     fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
         unsafe {
-            AVX512GF2_128 {
+            AVX256GF2_128 {
                 v: transmute::<[u8; 16], __m128i>(bytes[..16].try_into().unwrap()),
             }
         }
     }
 }
 
-impl ExtensionField for AVX512GF2_128 {
+impl ExtensionField for AVX256GF2_128 {
     const DEGREE: usize = 128;
 
     const W: u32 = 0x87;
 
-    const X: Self = AVX512GF2_128 {
+    const X: Self = AVX256GF2_128 {
         v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([2, 0, 0, 0]) },
     };
 
@@ -206,10 +206,10 @@ impl ExtensionField for AVX512GF2_128 {
     }
 }
 
-impl From<GF2> for AVX512GF2_128 {
+impl From<GF2> for AVX256GF2_128 {
     #[inline(always)]
     fn from(v: GF2) -> Self {
-        AVX512GF2_128 {
+        AVX256GF2_128 {
             v: unsafe { _mm_set_epi64x(0, v.v as i64) },
         }
     }
@@ -264,21 +264,21 @@ unsafe fn gfmul(a: __m128i, b: __m128i) -> __m128i {
     _mm_xor_si128(tmp3, tmp6)
 }
 
-impl Default for AVX512GF2_128 {
+impl Default for AVX256GF2_128 {
     #[inline(always)]
     fn default() -> Self {
         Self::zero()
     }
 }
 
-impl PartialEq for AVX512GF2_128 {
+impl PartialEq for AVX256GF2_128 {
     #[inline(always)]
     fn eq(&self, other: &Self) -> bool {
         unsafe { _mm_test_all_ones(_mm_cmpeq_epi8(self.v, other.v)) == 1 }
     }
 }
 
-impl Neg for AVX512GF2_128 {
+impl Neg for AVX256GF2_128 {
     type Output = Self;
 
     #[inline(always)]
@@ -287,32 +287,32 @@ impl Neg for AVX512GF2_128 {
     }
 }
 
-impl From<u32> for AVX512GF2_128 {
+impl From<u32> for AVX256GF2_128 {
     #[inline(always)]
     fn from(v: u32) -> Self {
-        AVX512GF2_128 {
+        AVX256GF2_128 {
             v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([v, 0, 0, 0]) },
         }
     }
 }
 
 #[inline(always)]
-fn add_internal(a: &AVX512GF2_128, b: &AVX512GF2_128) -> AVX512GF2_128 {
-    AVX512GF2_128 {
+fn add_internal(a: &AVX256GF2_128, b: &AVX256GF2_128) -> AVX256GF2_128 {
+    AVX256GF2_128 {
         v: unsafe { _mm_xor_si128(a.v, b.v) },
     }
 }
 
 #[inline(always)]
-fn sub_internal(a: &AVX512GF2_128, b: &AVX512GF2_128) -> AVX512GF2_128 {
-    AVX512GF2_128 {
+fn sub_internal(a: &AVX256GF2_128, b: &AVX256GF2_128) -> AVX256GF2_128 {
+    AVX256GF2_128 {
         v: unsafe { _mm_xor_si128(a.v, b.v) },
     }
 }
 
 #[inline(always)]
-fn mul_internal(a: &AVX512GF2_128, b: &AVX512GF2_128) -> AVX512GF2_128 {
-    AVX512GF2_128 {
+fn mul_internal(a: &AVX256GF2_128, b: &AVX256GF2_128) -> AVX256GF2_128 {
+    AVX256GF2_128 {
         v: unsafe { gfmul(a.v, b.v) },
     }
 }
diff --git a/arith/src/extension_field/gf2_128x8.rs b/arith/src/extension_field/gf2_128x8.rs
index e250159d..e610ea12 100644
--- a/arith/src/extension_field/gf2_128x8.rs
+++ b/arith/src/extension_field/gf2_128x8.rs
@@ -4,11 +4,11 @@ pub(crate) mod neon;
 pub type GF2_128x8 = neon::NeonGF2_128x8;
 
 #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
-mod avx512;
+mod avx256;
 #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
-pub type GF2_128x8 = avx512::AVX512GF2_128x8;
+pub type GF2_128x8 = avx256::AVX256GF2_128x8;
 
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-mod avx256;
+mod avx512;
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-pub type GF2_128x8 = avx256::AVX256GF2_128x8;
+pub type GF2_128x8 = avx512::AVX512GF2_128x8;
diff --git a/readme.md b/readme.md
index 889fce3b..b0950da5 100644
--- a/readme.md
+++ b/readme.md
@@ -45,7 +45,16 @@ This compiler is your entry point for using our prover; the repository you have
 Please note that the witness generation process is not yet optimal, and we are actively working on improving it.
 
 ## AVX
-We use AVX512 by default, if your CPU doesn't support AVX512, or you encountered illegal instruction error, please use `--features avx256` instead.
+We use AVX2 by default. On an x86 or a mac, you can simply do
+```
+RUSTFLAGS="-C target-cpu=native" cargo test --release --workspace
+```
+For some platforms, if you do not indicate `target-cpu=native` it may simulate avx2 instructions, rather than use it directly, and this will cause performance decrease.
+
+Our code also supports `avx512`. This is not turned on by default. To use `avx512`
+```
+RUSTFLAGS="-C target-cpu=native -C target-features=+avx512f" cargo test --release --workspace
+```
 
 ## Environment Setup
 

From 936ad66f1be6dd873c6df92fb6eac9984c6a2605 Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 10:10:09 -0400
Subject: [PATCH 3/7] fix

---
 arith/src/field/m31.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arith/src/field/m31.rs b/arith/src/field/m31.rs
index 6c12ff27..c263b955 100644
--- a/arith/src/field/m31.rs
+++ b/arith/src/field/m31.rs
@@ -7,9 +7,6 @@ pub(crate) mod m31_avx256;
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
 pub(crate) mod m31_avx512;
 
-#[cfg(target_arch = "x86_64")]
-pub type M31x16_256 = m31_avx256::AVXM31;
-
 #[cfg(target_arch = "aarch64")]
 pub mod m31_neon;
 

From d008a82fbd661155dfa73e6d4e3f1c0da9e288cf Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 10:14:02 -0400
Subject: [PATCH 4/7] simplify

---
 arith/src/extension_field/gf2_128.rs          |  13 +-
 .../gf2_128/{avx256.rs => avx.rs}             |  58 ++--
 arith/src/extension_field/gf2_128/avx512.rs   | 318 ------------------
 3 files changed, 33 insertions(+), 356 deletions(-)
 rename arith/src/extension_field/gf2_128/{avx256.rs => avx.rs} (88%)
 delete mode 100644 arith/src/extension_field/gf2_128/avx512.rs

diff --git a/arith/src/extension_field/gf2_128.rs b/arith/src/extension_field/gf2_128.rs
index 334e8cd7..431673f9 100644
--- a/arith/src/extension_field/gf2_128.rs
+++ b/arith/src/extension_field/gf2_128.rs
@@ -3,12 +3,7 @@ pub(crate) mod neon;
 #[cfg(target_arch = "aarch64")]
 pub type GF2_128 = neon::NeonGF2_128;
 
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-mod avx512;
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-pub type GF2_128 = avx512::AVX512GF2_128;
-
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
-mod avx256;
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
-pub type GF2_128 = avx256::AVX256GF2_128;
+#[cfg(target_arch = "x86_64")]
+pub(crate) mod avx;
+#[cfg(target_arch = "x86_64")]
+pub type GF2_128 = avx::AVXGF2_128;
diff --git a/arith/src/extension_field/gf2_128/avx256.rs b/arith/src/extension_field/gf2_128/avx.rs
similarity index 88%
rename from arith/src/extension_field/gf2_128/avx256.rs
rename to arith/src/extension_field/gf2_128/avx.rs
index ba090b8b..d37e3675 100644
--- a/arith/src/extension_field/gf2_128/avx256.rs
+++ b/arith/src/extension_field/gf2_128/avx.rs
@@ -8,13 +8,13 @@ use std::{
 use crate::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, GF2};
 
 #[derive(Debug, Clone, Copy)]
-pub struct AVX256GF2_128 {
+pub struct AVXGF2_128 {
     pub v: __m128i,
 }
 
-field_common!(AVX256GF2_128);
+field_common!(AVXGF2_128);
 
-impl FieldSerde for AVX256GF2_128 {
+impl FieldSerde for AVXGF2_128 {
     const SERIALIZED_SIZE: usize = 16;
 
     #[inline(always)]
@@ -28,7 +28,7 @@ impl FieldSerde for AVX256GF2_128 {
         let mut u = [0u8; Self::SERIALIZED_SIZE];
         reader.read_exact(&mut u)?;
         unsafe {
-            Ok(AVX256GF2_128 {
+            Ok(AVXGF2_128 {
                 v: transmute::<[u8; Self::SERIALIZED_SIZE], __m128i>(u),
             })
         }
@@ -39,42 +39,42 @@ impl FieldSerde for AVX256GF2_128 {
         let mut u = [0u8; 32];
         reader.read_exact(&mut u)?;
         Ok(unsafe {
-            AVX256GF2_128 {
+            AVXGF2_128 {
                 v: transmute::<[u8; 16], __m128i>(u[..16].try_into().unwrap()),
             }
         })
     }
 }
 
-impl Field for AVX256GF2_128 {
+impl Field for AVXGF2_128 {
     const NAME: &'static str = "Galios Field 2^128";
 
     const SIZE: usize = 128 / 8;
 
     const FIELD_SIZE: usize = 128; // in bits
 
-    const ZERO: Self = AVX256GF2_128 {
+    const ZERO: Self = AVXGF2_128 {
         v: unsafe { std::mem::zeroed() },
     };
 
-    const ONE: Self = AVX256GF2_128 {
+    const ONE: Self = AVXGF2_128 {
         v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) },
     };
 
-    const INV_2: Self = AVX256GF2_128 {
+    const INV_2: Self = AVXGF2_128 {
         v: unsafe { std::mem::zeroed() },
     }; // should not be used
 
     #[inline(always)]
     fn zero() -> Self {
-        AVX256GF2_128 {
+        AVXGF2_128 {
             v: unsafe { std::mem::zeroed() },
         }
     }
 
     #[inline(always)]
     fn one() -> Self {
-        AVX256GF2_128 {
+        AVXGF2_128 {
             // 1 in the first bit
             v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) }, // TODO check bit order
         }
@@ -85,7 +85,7 @@ impl Field for AVX256GF2_128 {
         let mut u = [0u8; 16];
         rng.fill_bytes(&mut u);
         unsafe {
-            AVX256GF2_128 {
+            AVXGF2_128 {
                 v: *(u.as_ptr() as *const __m128i),
             }
         }
@@ -93,7 +93,7 @@ impl Field for AVX256GF2_128 {
 
     #[inline(always)]
     fn random_bool(mut rng: impl rand::RngCore) -> Self {
-        AVX256GF2_128 {
+        AVXGF2_128 {
             v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([rng.next_u32() % 2, 0, 0, 0]) },
         }
     }
@@ -140,19 +140,19 @@ impl Field for AVX256GF2_128 {
     #[inline(always)]
     fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
         unsafe {
-            AVX256GF2_128 {
+            AVXGF2_128 {
                 v: transmute::<[u8; 16], __m128i>(bytes[..16].try_into().unwrap()),
             }
         }
     }
 }
 
-impl ExtensionField for AVX256GF2_128 {
+impl ExtensionField for AVXGF2_128 {
     const DEGREE: usize = 128;
 
     const W: u32 = 0x87;
 
-    const X: Self = AVX256GF2_128 {
+    const X: Self = AVXGF2_128 {
         v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([2, 0, 0, 0]) },
     };
 
@@ -206,10 +206,10 @@ impl ExtensionField for AVX256GF2_128 {
     }
 }
 
-impl From<GF2> for AVX256GF2_128 {
+impl From<GF2> for AVXGF2_128 {
     #[inline(always)]
     fn from(v: GF2) -> Self {
-        AVX256GF2_128 {
+        AVXGF2_128 {
             v: unsafe { _mm_set_epi64x(0, v.v as i64) },
         }
     }
@@ -264,21 +264,21 @@ unsafe fn gfmul(a: __m128i, b: __m128i) -> __m128i {
     _mm_xor_si128(tmp3, tmp6)
 }
 
-impl Default for AVX256GF2_128 {
+impl Default for AVXGF2_128 {
     #[inline(always)]
     fn default() -> Self {
         Self::zero()
     }
 }
 
-impl PartialEq for AVX256GF2_128 {
+impl PartialEq for AVXGF2_128 {
     #[inline(always)]
     fn eq(&self, other: &Self) -> bool {
         unsafe { _mm_test_all_ones(_mm_cmpeq_epi8(self.v, other.v)) == 1 }
     }
 }
 
-impl Neg for AVX256GF2_128 {
+impl Neg for AVXGF2_128 {
     type Output = Self;
 
     #[inline(always)]
@@ -287,32 +287,32 @@ impl Neg for AVX256GF2_128 {
     }
 }
 
-impl From<u32> for AVX256GF2_128 {
+impl From<u32> for AVXGF2_128 {
     #[inline(always)]
     fn from(v: u32) -> Self {
-        AVX256GF2_128 {
+        AVXGF2_128 {
             v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([v, 0, 0, 0]) },
         }
     }
 }
 
 #[inline(always)]
-fn add_internal(a: &AVX256GF2_128, b: &AVX256GF2_128) -> AVX256GF2_128 {
-    AVX256GF2_128 {
+fn add_internal(a: &AVXGF2_128, b: &AVXGF2_128) -> AVXGF2_128 {
+    AVXGF2_128 {
         v: unsafe { _mm_xor_si128(a.v, b.v) },
     }
 }
 
 #[inline(always)]
-fn sub_internal(a: &AVX256GF2_128, b: &AVX256GF2_128) -> AVX256GF2_128 {
-    AVX256GF2_128 {
+fn sub_internal(a: &AVXGF2_128, b: &AVXGF2_128) -> AVXGF2_128 {
+    AVXGF2_128 {
         v: unsafe { _mm_xor_si128(a.v, b.v) },
     }
 }
 
 #[inline(always)]
-fn mul_internal(a: &AVX256GF2_128, b: &AVX256GF2_128) -> AVX256GF2_128 {
-    AVX256GF2_128 {
+fn mul_internal(a: &AVXGF2_128, b: &AVXGF2_128) -> AVXGF2_128 {
+    AVXGF2_128 {
         v: unsafe { gfmul(a.v, b.v) },
     }
 }
diff --git a/arith/src/extension_field/gf2_128/avx512.rs b/arith/src/extension_field/gf2_128/avx512.rs
deleted file mode 100644
index f3984f88..00000000
--- a/arith/src/extension_field/gf2_128/avx512.rs
+++ /dev/null
@@ -1,318 +0,0 @@
-use std::iter::{Product, Sum};
-use std::{
-    arch::x86_64::*,
-    mem::transmute,
-    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
-};
-
-use crate::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, GF2};
-
-#[derive(Debug, Clone, Copy)]
-pub struct AVX512GF2_128 {
-    pub v: __m128i,
-}
-
-field_common!(AVX512GF2_128);
-
-impl FieldSerde for AVX512GF2_128 {
-    const SERIALIZED_SIZE: usize = 16;
-
-    #[inline(always)]
-    fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
-        unsafe { writer.write_all(transmute::<__m128i, [u8; 16]>(self.v).as_ref())? };
-        Ok(())
-    }
-
-    #[inline(always)]
-    fn deserialize_from<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
-        let mut u = [0u8; Self::SERIALIZED_SIZE];
-        reader.read_exact(&mut u)?;
-        unsafe {
-            Ok(AVX512GF2_128 {
-                v: transmute::<[u8; Self::SERIALIZED_SIZE], __m128i>(u),
-            })
-        }
-    }
-
-    #[inline(always)]
-    fn try_deserialize_from_ecc_format<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
-        let mut u = [0u8; 32];
-        reader.read_exact(&mut u)?;
-        Ok(unsafe {
-            AVX512GF2_128 {
-                v: transmute::<[u8; 16], __m128i>(u[..16].try_into().unwrap()),
-            }
-        })
-    }
-}
-
-impl Field for AVX512GF2_128 {
-    const NAME: &'static str = "Galios Field 2^128";
-
-    const SIZE: usize = 128 / 8;
-
-    const FIELD_SIZE: usize = 128; // in bits
-
-    const ZERO: Self = AVX512GF2_128 {
-        v: unsafe { std::mem::zeroed() },
-    };
-
-    const ONE: Self = AVX512GF2_128 {
-        v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) },
-    };
-
-    const INV_2: Self = AVX512GF2_128 {
-        v: unsafe { std::mem::zeroed() },
-    }; // should not be used
-
-    #[inline(always)]
-    fn zero() -> Self {
-        AVX512GF2_128 {
-            v: unsafe { std::mem::zeroed() },
-        }
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        AVX512GF2_128 {
-            // 1 in the first bit
-            v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([1, 0, 0, 0]) }, // TODO check bit order
-        }
-    }
-
-    #[inline(always)]
-    fn random_unsafe(mut rng: impl rand::RngCore) -> Self {
-        let mut u = [0u8; 16];
-        rng.fill_bytes(&mut u);
-        unsafe {
-            AVX512GF2_128 {
-                v: *(u.as_ptr() as *const __m128i),
-            }
-        }
-    }
-
-    #[inline(always)]
-    fn random_bool(mut rng: impl rand::RngCore) -> Self {
-        AVX512GF2_128 {
-            v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([rng.next_u32() % 2, 0, 0, 0]) },
-        }
-    }
-
-    #[inline(always)]
-    fn is_zero(&self) -> bool {
-        unsafe { std::mem::transmute::<__m128i, [u8; 16]>(self.v) == [0; 16] }
-    }
-
-    #[inline(always)]
-    fn exp(&self, exponent: u128) -> Self {
-        let mut e = exponent;
-        let mut res = Self::one();
-        let mut t = *self;
-        while e > 0 {
-            if e & 1 == 1 {
-                res *= t;
-            }
-            t = t * t;
-            e >>= 1;
-        }
-        res
-    }
-
-    #[inline(always)]
-    fn inv(&self) -> Option<Self> {
-        if self.is_zero() {
-            return None;
-        }
-        let p_m2 = !(0u128) - 1;
-        Some(Self::exp(self, p_m2))
-    }
-
-    #[inline(always)]
-    fn square(&self) -> Self {
-        self * self
-    }
-
-    #[inline(always)]
-    fn as_u32_unchecked(&self) -> u32 {
-        unimplemented!("u32 for GF128 doesn't make sense")
-    }
-
-    #[inline(always)]
-    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
-        unsafe {
-            AVX512GF2_128 {
-                v: transmute::<[u8; 16], __m128i>(bytes[..16].try_into().unwrap()),
-            }
-        }
-    }
-}
-
-impl ExtensionField for AVX512GF2_128 {
-    const DEGREE: usize = 128;
-
-    const W: u32 = 0x87;
-
-    const X: Self = AVX512GF2_128 {
-        v: unsafe { std::mem::transmute::<[i32; 4], __m128i>([2, 0, 0, 0]) },
-    };
-
-    type BaseField = GF2;
-
-    #[inline(always)]
-    fn mul_by_base_field(&self, base: &Self::BaseField) -> Self {
-        if base.v == 0 {
-            Self::zero()
-        } else {
-            *self
-        }
-    }
-
-    #[inline(always)]
-    fn add_by_base_field(&self, base: &Self::BaseField) -> Self {
-        let mut res = *self;
-        res.v = unsafe { _mm_xor_si128(res.v, _mm_set_epi64x(0, base.v as i64)) };
-        res
-    }
-
-    #[inline]
-    fn mul_by_x(&self) -> Self {
-        unsafe {
-            // Shift left by 1 bit
-            let shifted = _mm_slli_epi64(self.v, 1);
-
-            // Get the most significant bit and move it
-            let msb = _mm_srli_epi64(self.v, 63);
-            let msb_moved = _mm_slli_si128(msb, 8);
-
-            // Combine the shifted value with the moved msb
-            let shifted_consolidated = _mm_or_si128(shifted, msb_moved);
-
-            // Create the reduction value (0x87) and the comparison value (1)
-            let reduction = {
-                let multiplier = _mm_set_epi64x(0, 0x87);
-                let one = _mm_set_epi64x(0, 1);
-
-                // Check if the MSB was 1 and create a mask
-                let mask = _mm_cmpeq_epi64(_mm_srli_si128(msb, 8), one);
-
-                _mm_and_si128(mask, multiplier)
-            };
-
-            // Apply the reduction conditionally
-            let res = _mm_xor_si128(shifted_consolidated, reduction);
-
-            Self { v: res }
-        }
-    }
-}
-
-impl From<GF2> for AVX512GF2_128 {
-    #[inline(always)]
-    fn from(v: GF2) -> Self {
-        AVX512GF2_128 {
-            v: unsafe { _mm_set_epi64x(0, v.v as i64) },
-        }
-    }
-}
-
-#[inline]
-unsafe fn gfmul(a: __m128i, b: __m128i) -> __m128i {
-    let xmm_mask = _mm_setr_epi32((0xffffffff_u32) as i32, 0x0, 0x0, 0x0);
-
-    // a = a0|a1, b = b0|b1
-
-    let mut tmp3 = _mm_clmulepi64_si128(a, b, 0x00); // tmp3 = a0 * b0
-    let mut tmp6 = _mm_clmulepi64_si128(a, b, 0x11); // tmp6 = a1 * b1
-
-    let mut tmp4 = _mm_shuffle_epi32(a, 78); // tmp4 = a1|a0
-    let mut tmp5 = _mm_shuffle_epi32(b, 78); // tmp5 = b1|b0
-    tmp4 = _mm_xor_si128(tmp4, a); // tmp4 = (a0 + a1) | (a0 + a1)
-    tmp5 = _mm_xor_si128(tmp5, b); // tmp5 = (b0 + b1) | (b0 + b1)
-
-    tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); // tmp4 = (a0 + a1) * (b0 + b1)
-    tmp4 = _mm_xor_si128(tmp4, tmp3); // tmp4 = (a0 + a1) * (b0 + b1) - a0 * b0
-    tmp4 = _mm_xor_si128(tmp4, tmp6); // tmp4 = (a0 + a1) * (b0 + b1) - a0 * b0 - a1 * b1 = a0 * b1 + a1 * b0
-
-    let tmp5_shifted_left = _mm_slli_si128(tmp4, 8);
-    tmp4 = _mm_srli_si128(tmp4, 8);
-    tmp3 = _mm_xor_si128(tmp3, tmp5_shifted_left);
-    tmp6 = _mm_xor_si128(tmp6, tmp4);
-
-    let mut tmp7 = _mm_srli_epi32(tmp6, 31);
-    let mut tmp8 = _mm_srli_epi32(tmp6, 30);
-    let tmp9 = _mm_srli_epi32(tmp6, 25);
-
-    tmp7 = _mm_xor_si128(tmp7, tmp8);
-    tmp7 = _mm_xor_si128(tmp7, tmp9);
-
-    tmp8 = _mm_shuffle_epi32(tmp7, 147);
-    tmp7 = _mm_and_si128(xmm_mask, tmp8);
-    tmp8 = _mm_andnot_si128(xmm_mask, tmp8);
-
-    tmp3 = _mm_xor_si128(tmp3, tmp8);
-    tmp6 = _mm_xor_si128(tmp6, tmp7);
-
-    let tmp10 = _mm_slli_epi32(tmp6, 1);
-    tmp3 = _mm_xor_si128(tmp3, tmp10);
-
-    let tmp11 = _mm_slli_epi32(tmp6, 2);
-    tmp3 = _mm_xor_si128(tmp3, tmp11);
-
-    let tmp12 = _mm_slli_epi32(tmp6, 7);
-    tmp3 = _mm_xor_si128(tmp3, tmp12);
-
-    _mm_xor_si128(tmp3, tmp6)
-}
-
-impl Default for AVX512GF2_128 {
-    #[inline(always)]
-    fn default() -> Self {
-        Self::zero()
-    }
-}
-
-impl PartialEq for AVX512GF2_128 {
-    #[inline(always)]
-    fn eq(&self, other: &Self) -> bool {
-        unsafe { _mm_test_all_ones(_mm_cmpeq_epi8(self.v, other.v)) == 1 }
-    }
-}
-
-impl Neg for AVX512GF2_128 {
-    type Output = Self;
-
-    #[inline(always)]
-    fn neg(self) -> Self {
-        self
-    }
-}
-
-impl From<u32> for AVX512GF2_128 {
-    #[inline(always)]
-    fn from(v: u32) -> Self {
-        AVX512GF2_128 {
-            v: unsafe { std::mem::transmute::<[u32; 4], __m128i>([v, 0, 0, 0]) },
-        }
-    }
-}
-
-#[inline(always)]
-fn add_internal(a: &AVX512GF2_128, b: &AVX512GF2_128) -> AVX512GF2_128 {
-    AVX512GF2_128 {
-        v: unsafe { _mm_xor_si128(a.v, b.v) },
-    }
-}
-
-#[inline(always)]
-fn sub_internal(a: &AVX512GF2_128, b: &AVX512GF2_128) -> AVX512GF2_128 {
-    AVX512GF2_128 {
-        v: unsafe { _mm_xor_si128(a.v, b.v) },
-    }
-}
-
-#[inline(always)]
-fn mul_internal(a: &AVX512GF2_128, b: &AVX512GF2_128) -> AVX512GF2_128 {
-    AVX512GF2_128 {
-        v: unsafe { gfmul(a.v, b.v) },
-    }
-}

From 8fc7519d4a15dad0321509ed7f521b59d108d405 Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 10:21:18 -0400
Subject: [PATCH 5/7] system test

---
 tests/system.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 tests/system.rs

diff --git a/tests/system.rs b/tests/system.rs
new file mode 100644
index 00000000..9467dc54
--- /dev/null
+++ b/tests/system.rs
@@ -0,0 +1,21 @@
+#[test]
+fn test_mutually_exclusive_flags() {
+    let mut enabled_ctr = 0;
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        enabled_ctr += 1;
+    }
+
+    #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+    {
+        enabled_ctr += 1;
+    }
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+    {
+        enabled_ctr += 1;
+    }
+
+    assert_eq!(enabled_ctr, 1);
+}

From 83845aeeb97ed588c6165968904ed3612ba720c5 Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 10:40:17 -0400
Subject: [PATCH 6/7] fix

---
 arith/benches/ext_field.rs                    |  4 ----
 arith/benches/field.rs                        |  6 ------
 arith/src/field/m31.rs                        | 20 ++++++++++++-------
 arith/src/field/m31/m31x16.rs                 | 12 -----------
 .../m31/{m31_avx256.rs => m31x16_avx256.rs}   |  0
 .../m31/{m31_avx512.rs => m31x16_avx512.rs}   |  0
 .../field/m31/{m31_neon.rs => m31x16_neon.rs} |  0
 7 files changed, 13 insertions(+), 29 deletions(-)
 delete mode 100644 arith/src/field/m31/m31x16.rs
 rename arith/src/field/m31/{m31_avx256.rs => m31x16_avx256.rs} (100%)
 rename arith/src/field/m31/{m31_avx512.rs => m31x16_avx512.rs} (100%)
 rename arith/src/field/m31/{m31_neon.rs => m31x16_neon.rs} (100%)

diff --git a/arith/benches/ext_field.rs b/arith/benches/ext_field.rs
index 681ed116..05e6d281 100644
--- a/arith/benches/ext_field.rs
+++ b/arith/benches/ext_field.rs
@@ -1,5 +1,3 @@
-#[cfg(target_arch = "x86_64")]
-use arith::GF2_128x8_256;
 use arith::{ExtensionField, Field, GF2_128x8, M31Ext3, M31Ext3x16, GF2_128};
 use ark_std::test_rng;
 use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
@@ -156,8 +154,6 @@ fn ext_by_base_benchmark(c: &mut Criterion) {
     bench_field::<M31Ext3x16>(c);
     bench_field::<GF2_128>(c);
     bench_field::<GF2_128x8>(c);
-    #[cfg(target_arch = "x86_64")]
-    bench_field::<GF2_128x8_256>(c);
 }
 
 criterion_group!(ext_by_base_benches, ext_by_base_benchmark);
diff --git a/arith/benches/field.rs b/arith/benches/field.rs
index 5369b85e..c2904d39 100644
--- a/arith/benches/field.rs
+++ b/arith/benches/field.rs
@@ -1,8 +1,6 @@
 // this module benchmarks the performance of different field operations
 
 use arith::{Field, GF2_128x8, GF2x8, M31Ext3, M31Ext3x16, M31x16, GF2, GF2_128, M31};
-#[cfg(target_arch = "x86_64")]
-use arith::{GF2_128x8_256, M31x16_256};
 use ark_std::test_rng;
 use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
 use halo2curves::bn256::Fr;
@@ -176,8 +174,6 @@ pub(crate) fn bench_field<F: Field>(c: &mut Criterion) {
 fn criterion_benchmark(c: &mut Criterion) {
     bench_field::<M31>(c);
     bench_field::<M31x16>(c);
-    #[cfg(target_arch = "x86_64")]
-    bench_field::<M31x16_256>(c);
     bench_field::<M31Ext3>(c);
     bench_field::<M31Ext3x16>(c);
     bench_field::<Fr>(c);
@@ -185,8 +181,6 @@ fn criterion_benchmark(c: &mut Criterion) {
     bench_field::<GF2x8>(c);
     bench_field::<GF2_128>(c);
     bench_field::<GF2_128x8>(c);
-    #[cfg(target_arch = "x86_64")]
-    bench_field::<GF2_128x8_256>(c);
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/arith/src/field/m31.rs b/arith/src/field/m31.rs
index c263b955..87a2c0ca 100644
--- a/arith/src/field/m31.rs
+++ b/arith/src/field/m31.rs
@@ -1,14 +1,20 @@
-mod m31x16;
-pub use m31x16::M31x16;
+#[cfg(target_arch = "aarch64")]
+pub mod m31x16_neon;
+#[cfg(target_arch = "aarch64")]
+pub type M31x16 = m31x16_neon::NeonM31;
 
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
-pub(crate) mod m31_avx256;
 
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-pub(crate) mod m31_avx512;
+pub mod m31x16_avx512;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub type M31x16 = m31x16_avx512::AVXM31;
+
+
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub mod m31x16_avx256;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub type M31x16 = m31x16_avx256::AVXM31;
 
-#[cfg(target_arch = "aarch64")]
-pub mod m31_neon;
 
 use rand::RngCore;
 
diff --git a/arith/src/field/m31/m31x16.rs b/arith/src/field/m31/m31x16.rs
deleted file mode 100644
index b111b6e7..00000000
--- a/arith/src/field/m31/m31x16.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-// A M31x16 stores 512 bits of data.
-// With AVX it stores a single __m512i element.
-// With NEON it stores four uint32x4_t elements.
-
-#[cfg(target_arch = "aarch64")]
-pub type M31x16 = super::m31_neon::NeonM31;
-
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-pub type M31x16 = super::m31_avx512::AVXM31;
-
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
-pub type M31x16 = super::m31_avx256::AVXM31;
diff --git a/arith/src/field/m31/m31_avx256.rs b/arith/src/field/m31/m31x16_avx256.rs
similarity index 100%
rename from arith/src/field/m31/m31_avx256.rs
rename to arith/src/field/m31/m31x16_avx256.rs
diff --git a/arith/src/field/m31/m31_avx512.rs b/arith/src/field/m31/m31x16_avx512.rs
similarity index 100%
rename from arith/src/field/m31/m31_avx512.rs
rename to arith/src/field/m31/m31x16_avx512.rs
diff --git a/arith/src/field/m31/m31_neon.rs b/arith/src/field/m31/m31x16_neon.rs
similarity index 100%
rename from arith/src/field/m31/m31_neon.rs
rename to arith/src/field/m31/m31x16_neon.rs

From 6bc1347f3c1516418177aeb66b08aab26e9e7472 Mon Sep 17 00:00:00 2001
From: zhenfei <zhenfei.zhang@hotmail.com>
Date: Thu, 26 Sep 2024 11:04:49 -0400
Subject: [PATCH 7/7] Update m31.rs

---
 arith/src/field/m31.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arith/src/field/m31.rs b/arith/src/field/m31.rs
index 87a2c0ca..6ab8191a 100644
--- a/arith/src/field/m31.rs
+++ b/arith/src/field/m31.rs
@@ -3,19 +3,16 @@ pub mod m31x16_neon;
 #[cfg(target_arch = "aarch64")]
 pub type M31x16 = m31x16_neon::NeonM31;
 
-
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
 pub mod m31x16_avx512;
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
 pub type M31x16 = m31x16_avx512::AVXM31;
 
-
 #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
 pub mod m31x16_avx256;
 #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
 pub type M31x16 = m31x16_avx256::AVXM31;
 
-
 use rand::RngCore;
 
 use crate::{field_common, Field, FieldForECC, FieldSerde, FieldSerdeResult};