From c16bae32b5e6963d0efa8d3674b2501b5ec6f266 Mon Sep 17 00:00:00 2001
From: Andrzej Janik <vosen@vosen.pl>
Date: Wed, 21 Aug 2024 03:38:43 +0200
Subject: [PATCH] Add rcp, sqrt, rsqrt

---
 ptx_parser/src/ast.rs  |  50 ++++++++++++++++++++
 ptx_parser/src/main.rs | 101 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+)
diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs
index 4251d97d..944341c1 100644
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@@ -258,6 +258,30 @@ gen::generate_instruction_type!(
                 src2: T,
             }
         },
+        Rcp {
+            type: { Type::from(data.type_) },
+            data: RcpData,
+            arguments<T>: {
+                dst: T,
+                src: T,
+            }
+        },
+        Sqrt {
+            type: { Type::from(data.type_) },
+            data: RcpData,
+            arguments<T>: {
+                dst: T,
+                src: T,
+            }
+        },
+        Rsqrt {
+            type: { Type::from(data.type_) },
+            data: RsqrtData,
+            arguments<T>: {
+                dst: T,
+                src: T,
+            }
+        },
         Trap { }
     }
 );
@@ -1117,3 +1141,29 @@ pub struct MinMaxFloat {
     pub nan: bool,
     pub type_: ScalarType,
 }
+
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub enum DivFloatKind {
+    Approx,
+    Full,
+    Rounding(RoundingMode),
+}
+
+#[derive(Copy, Clone)]
+pub struct RcpData {
+    pub kind: RcpKind,
+    pub flush_to_zero: Option<bool>,
+    pub type_: ScalarType,
+}
+
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub enum RcpKind {
+    Approx,
+    Full(RoundingMode),
+}
+
+#[derive(Copy, Clone)]
+pub struct RsqrtData {
+    pub flush_to_zero: Option<bool>,
+    pub type_: ScalarType,
+}
diff --git a/ptx_parser/src/main.rs b/ptx_parser/src/main.rs
index 71d8dced..159b918e 100644
--- a/ptx_parser/src/main.rs
+++ b/ptx_parser/src/main.rs
@@ -2244,6 +2244,107 @@ derive_parser!(
     }
     ScalarType = { .f16, .f16x2, .bf16, .bf16x2 };
 
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64
+    rcp.approx{.ftz}.type   d, a => {
+        ast::Instruction::Rcp {
+            data: ast::RcpData {
+                kind: ast::RcpKind::Approx,
+                flush_to_zero: Some(ftz),
+                type_
+            },
+            arguments: RcpArgs { dst: d, src: a  }
+        }
+    }
+    rcp.rnd{.ftz}.f32       d, a => {
+        ast::Instruction::Rcp {
+            data: ast::RcpData {
+                kind: ast::RcpKind::Full(rnd.into()),
+                flush_to_zero: Some(ftz),
+                type_: f32
+            },
+            arguments: RcpArgs { dst: d, src: a  }
+        }
+    }
+    rcp.rnd.f64             d, a => {
+        ast::Instruction::Rcp {
+            data: ast::RcpData {
+                kind: ast::RcpKind::Full(rnd.into()),
+                flush_to_zero: None,
+                type_: f64
+            },
+            arguments: RcpArgs { dst: d, src: a  }
+        }
+    }
+    .type: ScalarType =        { .f32, .f64 };
+    .rnd: RawRoundingMode = { .rn, .rz, .rm, .rp };
+    ScalarType =        { .f32, .f64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt
+    sqrt.approx{.ftz}.f32  d, a => {
+        ast::Instruction::Sqrt {
+            data: ast::RcpData {
+                kind: ast::RcpKind::Approx,
+                flush_to_zero: Some(ftz),
+                type_: f32
+            },
+            arguments: SqrtArgs { dst: d, src: a  }
+        }
+    }
+    sqrt.rnd{.ftz}.f32     d, a => {
+        ast::Instruction::Sqrt {
+            data: ast::RcpData {
+                kind: ast::RcpKind::Full(rnd.into()),
+                flush_to_zero: Some(ftz),
+                type_: f32
+            },
+            arguments: SqrtArgs { dst: d, src: a  }
+        }
+    }
+    sqrt.rnd.f64           d, a => {
+        ast::Instruction::Sqrt {
+            data: ast::RcpData {
+                kind: ast::RcpKind::Full(rnd.into()),
+                flush_to_zero: None,
+                type_: f64
+            },
+            arguments: SqrtArgs { dst: d, src: a  }
+        }
+    }
+    .rnd: RawRoundingMode = { .rn, .rz, .rm, .rp };
+    ScalarType =        { .f32, .f64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64
+    rsqrt.approx{.ftz}.f32  d, a => {
+        ast::Instruction::Rsqrt {
+            data: ast::RsqrtData {
+                flush_to_zero: Some(ftz),
+                type_: f32
+            },
+            arguments: RsqrtArgs { dst: d, src: a  }
+        }
+    }
+    rsqrt.approx.f64        d, a => {
+        ast::Instruction::Rsqrt {
+            data: ast::RsqrtData {
+                flush_to_zero: None,
+                type_: f64
+            },
+            arguments: RsqrtArgs { dst: d, src: a  }
+        }
+    }
+    rsqrt.approx.ftz.f64 d, a => {
+        ast::Instruction::Rsqrt {
+            data: ast::RsqrtData {
+                flush_to_zero: None,
+                type_: f64
+            },
+            arguments: RsqrtArgs { dst: d, src: a  }
+        }
+    }
+    ScalarType =        { .f32, .f64 };
+
     // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
     ret{.uni} => {
         Instruction::Ret { data: RetData { uniform: uni } }