Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. #111538

Open
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

Rin18
Copy link
Contributor

@Rin18 Rin18 commented Oct 8, 2024

Introduce a description of late forwarding to the Neoverse-V1 scheduling model.

@llvmbot
Copy link
Collaborator

llvmbot commented Oct 8, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Rin Dobrescu (Rin18)

Changes

Introduce a description of late forwarding to the Neoverse-V1 scheduling model.


Patch is 108.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111538.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td (+153-50)
  • (added) llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s (+1421)
  • (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s (+69-69)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index f7e6545f0dd386..1d7cb699f731aa 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -469,6 +469,87 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
                                             V1UnitV, V1UnitV, V1UnitV,
                                             V1UnitV, V1UnitV, V1UnitV]>;
 
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for
+// consumers of 64 bit multiply high operations?
+def V1WriteIM : SchedWriteVariant<
+                  [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
+                   SchedVar<NoSchedPred,          [V1Write_2c_1M0]>]>;
+def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;
+
+def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;
+
+def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
+def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>;
+
+def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>;
+
+def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>;
+
+def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>;
+
+def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>;
+
+def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
+def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>;
+
+def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>;
+
+def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>;
+
+def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
+def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>;
+
+def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>;
+
+def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
+def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>;
+
+def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>;
+
+def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
+def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>;
+
+def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
+def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>;
+
+def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>;
+
+def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; }
+def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>;
+
+def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
+def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>;
+
+let Latency = 5, NumMicroOps = 2 in
+def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>;
+def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>;
+
+def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>;
+
+def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
+def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>;
+
+def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
+def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>;
+def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>;
+def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>;
 
 // Miscellaneous Instructions
 // -----------------------------------------------------------------------------
@@ -553,16 +634,22 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
 def : SchedAlias<WriteID32, V1Write_12c5_1M0>;
 def : SchedAlias<WriteID64, V1Write_20c5_1M0>;
 
+def           : SchedAlias<WriteIM32, V1Write_2c_1M>;
+def           : SchedAlias<WriteIM64, V1Write_2c_1M>;
+
 // Multiply
-// Multiply accumulate
-// Multiply accumulate, long
-// Multiply long
-def V1WriteIM : SchedWriteVariant<
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+/*def V1WriteIM : SchedWriteVariant<
                   [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
                    SchedVar<NoSchedPred,          [V1Write_2c_1M0]>]>;
-def           : SchedAlias<WriteIM32, V1WriteIM>;
-def           : SchedAlias<WriteIM64, V1WriteIM>;
+def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;*/
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA], (instregex "^M(ADD|SUB)[WX]rrr$")>;
 
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA],
+             (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
 // Multiply high
 def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
 
@@ -680,10 +767,10 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>;
 def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>;
 
 // FP multiply
-def : SchedAlias<WriteFMul, V1Write_3c_1V>;
+def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; }
 
 // FP multiply accumulate
-def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
 
 // FP round to integral
 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -824,7 +911,7 @@ def : SchedAlias<WriteVq, V1Write_2c_1V>;
 // ASIMD absolute diff accum
 // ASIMD absolute diff accum long
 // ASIMD pairwise add and accumulate long
-def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
+def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
 
 // ASIMD arith, reduce, 4H/4S
 // ASIMD max/min, reduce, 4H/4S
@@ -843,23 +930,25 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
-def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
+def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
 
-// ASIMD matrix multiply- accumulate
-def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
 
 // ASIMD multiply
+def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
 // ASIMD multiply accumulate
+def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>;
+
 // ASIMD multiply accumulate long
+def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
 // ASIMD multiply accumulate high
+def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
 // ASIMD multiply accumulate saturating long
-def : InstRW<[V1Write_4c_1V02], 
-             (instregex "^MUL(v[148]i16|v[124]i32)$",
-                        "^SQR?DMULH(v[48]i16|v[24]i32)$",
-                        "^ML[AS](v[148]i16|v[124]i32)$",
-                        "^[SU]ML[AS]Lv",
-                        "^SQRDML[AS]H(v[148]i16|v[124]i32)$",
-                        "^SQDML[AS]Lv")>;
+def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
 
 // ASIMD multiply/multiply long (8x8) polynomial
 def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
@@ -868,11 +957,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
 def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>;
 
 // ASIMD shift accumulate
+def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;
+
 // ASIMD shift by immed, complex
 // ASIMD shift by register, complex
 def : InstRW<[V1Write_4c_1V13],
-             (instregex "^[SU]R?SRAv",
-                        "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
+             (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
                         "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
                         "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv", 
                         "^[SU]Q?RSHLv", "^[SU]QSHLv")>;
@@ -890,16 +980,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
 // ASIMD FP absolute value/difference
 // ASIMD FP arith, normal
 // ASIMD FP compare
-// ASIMD FP complex add
 // ASIMD FP max/min, normal
 // ASIMD FP max/min, pairwise
 // ASIMD FP negate
 // Covered by "SchedAlias (WriteV[dq]...)" above
 
+// ASIMD FP complex add
+def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>;
+
 // ASIMD FP complex multiply add
+def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP multiply
+def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>;
+
 // ASIMD FP multiply accumulate
-def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$",
-                                         "^FML[AS]v")>;
+def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>;
 
 // ASIMD FP convert, long (F16 to F32)
 def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
@@ -953,12 +1052,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
 // ASIMD FP max/min, reduce, Q-form F16
 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
 
-// ASIMD FP multiply
-def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>;
-
-// ASIMD FP multiply accumulate long
-def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>;
-
 // ASIMD FP round, D-form F32 and Q-form F64
 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
 
@@ -976,13 +1069,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
 def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>;
 
 // ASIMD dot product
-def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
+def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
 
 // ASIMD matrix multiply accumulate
-def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>;
+def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>;
 
 // ASIMD multiply accumulate long
-def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>;
+def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>;
 
 // Scalar convert, F32 to BF16
 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>;
@@ -1300,7 +1393,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
 // -----------------------------------------------------------------------------
 
 // CRC checksum ops
-def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>;
+def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
 
 
 // SVE Predicate instructions
@@ -1440,13 +1533,13 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
                                              "^[SU]DIV_ZPZZ_D")>;
 
 // Dot product, 8 bit
-def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>;
+def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;
 
 // Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
+def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
 
 // Dot product, 16 bit
-def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>;
+def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;
 
 // Duplicate, immediate and indexed form
 def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$",
@@ -1488,7 +1581,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
                                            "^MOVPRFX_ZZ$")>;
 
 // Matrix multiply-accumulate
-def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
 
 // Multiply, B, H, S element size
 def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
@@ -1497,12 +1590,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
                                           "^[SU]MULH_ZPZZ_[BHS]")>;
 
 // Multiply, D element size
-// Multiply accumulate, D element size
 def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
                                           "^MUL_ZPZZ_D",
                                           "^[SU]MULH_(ZPmZ|ZZZ)_D",
-                                          "^[SU]MULH_ZPZZ_D",
-                                          "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
+                                          "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD],
+             (instregex "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD],
+             (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
 
 // Multiply accumulate, B, H, S element size
 // NOTE: This is not specified in the SOG.
@@ -1583,8 +1680,10 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
 def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;
 
 // Floating point complex multiply add
-def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
-                                           "^FCMLA_ZZZI_[HS]$")>;
+/*def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
+                                           "^FCMLA_ZZZI_[HS]$")>;*/
+def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA],              (instregex "^FCMLA_ZZZI_[HS]")>;
 
 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
 // Floating point convert to integer, F32
@@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
                                            "^FMUL_ZPZ[IZ]_[HSD]")>;
 
 // Floating point multiply accumulate
+def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA],
+             (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+                        "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA],
+             (instregex "^FML[AS]_ZZZI_[HSD]",
+                        "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
 // Floating point reciprocal step
-def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
-                                           "^FN?ML[AS]_ZPZZZ_[HSD]",
-                                           "^FML[AS]_ZZZI_[HSD]$",
-                                           "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>;
+def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
 
 // Floating point reciprocal estimate, F16
 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>;
@@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$",
 def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
 
 // Dot product
-def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
 
 // Matrix multiply accumulate
-def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>;
+def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;
 
 // Multiply accumulate long
-def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
+def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
 
 
 // SVE Load instructions
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
new file mode 100644
index 00000000000000..4de37f96000520
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
@@ -0,0 +1,1421 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul  x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul    x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul  v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul    v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sdot
+mul  v0.4s, v0.4s,  v0.4s
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smmla
+mul   v0.4s, v0.4s,  v0.4s
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul    v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ssra
+mul  v0.4s, v0.4s, v0.4s
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v0.2d, #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fcmla
+fmul  v0.4s, v0.4s, v0.4s
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v0.2d, v1.2d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmlal
+fmul  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fadd  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfdot
+fmul  v0.2d, v0.2d, v0.2d
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmmla
+fmul   v0.2d, v0.2d, v0.2d
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul    v0.2d, v0.2d, v0.2d
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32cb
+mul    w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.s
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sudot
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z0.b, z1.b[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.d
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smmla
+mul z0.d, p0/m, z0.d, z0.d
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.d
+mul z0.d, p0/m, z0.d, z0.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mad.d
+mul z0.d, p0/m, z0.d, z0.d
+mad z0.d, p0/m, z1.d, z2.d
+mad z0.d, p0/m, z1.d, z2.d
+mad z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z msb.d
+mul z0.d, p0/m, z0.d, z0.d
+msb z0.d, p0/m, z1.d, z2.d
+msb z0.d, p0/m, z1.d, z2.d
+msb z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZPmZZ
+fmul  z0.d, z0.d, z0.d
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z0.d, z1.d, 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZZZI
+fmul  z0.d, z0.d, z0.d
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z0.s, z1.s[1], 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fmla z0.d, p0/m, z1.d, z2....
[truncated]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants