Add fp16 vector add asm examples

ROCm · Aug 5, 2016 · c00bbe4 · c00bbe4
1 parent a842dbc
commit c00bbe4
Show file tree

Hide file tree

Showing 7 changed files with 534 additions and 0 deletions.
diff --git a/examples/common/half.cpp b/examples/common/half.cpp
@@ -0,0 +1,70 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Naive implementation of float <-> half conversion.
+// Inf, NaN and denormal values are not supported.
+
+#include "half.h"
+
+uint16_t f2h(float f)  // float -> 16-bit half bit pattern (1 sign / 5 exponent / 10 mantissa bits); normal values only
+{
+  if (f == 0.0f)  // also true for -0.0f, so both zeros are returned as 0x0000
+    return 0;
+
+  uint32_t t = reinterpret_cast<uint32_t&>(f);  // raw float bits; NOTE(review): punning through a reference cast, memcpy would sidestep aliasing concerns
+  t = ((t>>16) & 0x8000) | ((t>>13) & 0x03ff) | ((((t&0x7f800000)-0x38000000)>>13) & 0x7c00);  // sign 31->15 | top 10 mantissa bits (truncated, not rounded) | exponent minus 0x38000000 (112 << 23), masked to 5 bits so out-of-range exponents wrap
+
+  return t & 0xffff;  // only the low 16 bits carry the result
+}
+
+float h2f(uint16_t h)  // 16-bit half bit pattern -> float; counterpart of f2h, normal values only
+{
+  if (!(h & 0x7fff))  // exponent and mantissa bits all zero: +0 or -0, both returned as +0.0f
+    return 0.0f;
+  float f;  // filled in below through its bit pattern
+  uint32_t sign = h & 0x8000;  // half sign bit, still at bit 15
+  uint32_t t = h;
+  t = ((t & 0x7c00) + 0x1c000) | (t & 0x3ff);  // exponent field plus 0x1c000 (112 << 10), recombined with the 10 mantissa bits; sign dropped
+  reinterpret_cast<uint32_t&>(f) = sign << 16 | t << 13;  // sign 15->31, exponent/mantissa moved up 13 bits; NOTE(review): same reference-cast punning as f2h
+  return f;
+}
+
diff --git a/examples/common/half.h b/examples/common/half.h
@@ -0,0 +1,51 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HALF_H__
+#define HALF_H__
+
+#include "stdint.h"
+
+uint16_t f2h(float f);  // float -> 16-bit half bit pattern
+float h2f(uint16_t h);  // 16-bit half bit pattern -> float
+
+#endif // HALF_H__
diff --git a/examples/gfx8/CMakeLists.txt b/examples/gfx8/CMakeLists.txt
@@ -43,4 +43,6 @@
 asm_dispatch_example(ds_bpermute)
 asm_dispatch_example(dpp_reduce)
 asm_dispatch_example(s_memrealtime)
+asm_dispatch_example(fp16_storage)  # NOTE(review): presumably uses fp16_storage.cpp / fp16_storage.s - confirm both are part of this change
+asm_dispatch_example(fp16_native)  # name matches fp16_native.cpp / fp16_native.s; what the macro builds is defined elsewhere
 inline_asm_dispatch_example(s_memrealtime_inline)
diff --git a/examples/gfx8/fp16_native.cpp b/examples/gfx8/fp16_native.cpp
@@ -0,0 +1,102 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "dispatch.hpp"
+#include "half.h"
+
+using namespace amd::dispatch;
+
+class HalfVectorAdd : public Dispatch {  // fp16 vector-add example: fills two half inputs, then checks out[i] against a host-computed sum
+private:
+  Buffer* in1;      // first operand; element i is f2h(i)
+  Buffer* in2;      // second operand; element i is f2h(i * 1.25f)
+  Buffer* out;      // result buffer, read back in Verify()
+  unsigned length;  // element count; fixed to 64 by the constructor
+
+public:
+  HalfVectorAdd(int argc, const char **argv)
+    : Dispatch(argc, argv), length(64) { }  // forwards the command line to Dispatch; buffers are created later in Setup()
+
+  bool SetupCodeObject() override {  // returns whatever loading "fp16_native.co" reports
+    return LoadCodeObjectFromFile("fp16_native.co");
+  }
+
+  bool Setup() override {  // allocates kernarg space and buffers, fills the inputs, sets launch sizes; false on any reported failure
+    if (!AllocateKernarg(3 * sizeof(Buffer*))) { return false; }  // room for three pointer-sized arguments: in1, in2, out
+    in1 = AllocateBuffer(length * sizeof(float) / 2);  // sizeof(float) / 2 bytes per element, i.e. 2 bytes when float is 4 bytes
+    in2 = AllocateBuffer(length * sizeof(float) / 2);  // NOTE(review): AllocateBuffer results are used unchecked - confirm it cannot fail
+    for (unsigned i = 0; i < length; ++i) {
+      in1->Data<uint16_t>(i) = f2h(i);  // i converts implicitly to float
+      in2->Data<uint16_t>(i) = f2h(i * 1.25f);
+    }
+    if (!CopyTo(in1)) { output << "Error: failed to copy to local" << std::endl; return false; }
+    if (!CopyTo(in2)) { output << "Error: failed to copy to local" << std::endl; return false; }
+    out = AllocateBuffer(length * sizeof(float)/2);  // same size as the inputs; contents are not initialised here
+    Kernarg(in1);  // argument order in1, in2, out
+    Kernarg(in2);
+    Kernarg(out);
+    SetGridSize(64);  // NOTE(review): literal 64 equals length today but is not derived from it
+    SetWorkgroupSize(64);
+    return true;
+  }
+
+  bool Verify() override {  // reads `out` back and compares every element with a host-side reference; true only if all match
+    if (!CopyFrom(out)) { output << "Error: failed to copy from local" << std::endl; return false; }
+    bool ok = true;  // cleared on the first mismatch, never set back
+    for (unsigned i = 0; i < length; ++i) {
+      float f1 = h2f(in1->Data<uint16_t>(i));
+      float f2 = h2f(in2->Data<uint16_t>(i));
+      float res = h2f(out->Data<uint16_t>(i));  // value produced by the kernel
+      float expected = h2f(f2h(f1 + f2));  // add in float, then pass through f2h/h2f so the reference is quantised like res
+      if (expected != res){  // exact comparison, no tolerance
+        output << "Error: validation failed at " << i << ": got " << res << " expected " << expected << std::endl;
+        ok = false;  // keep scanning so every mismatch is reported
+      }
+    }
+    return ok;
+  }
+};
+
+int main(int argc, const char** argv)  // entry point: builds the example from the command line and returns RunMain()'s result
+{
+  return HalfVectorAdd(argc, argv).RunMain();
+}
diff --git a/examples/gfx8/fp16_native.s b/examples/gfx8/fp16_native.s
@@ -0,0 +1,100 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+//
+// Vector add example using fp16 storage data type and fp16 add instruction
+//
+
+.hsa_code_object_version 2,0
+.hsa_code_object_isa 8, 0, 3, "AMD", "AMDGPU"
+
+.text
+.p2align 8
+.amdgpu_hsa_kernel hello_world // NOTE(review): symbol is still named hello_world although this kernel is the fp16 vector add
+
+hello_world:
+
+   .amd_kernel_code_t
+      enable_sgpr_kernarg_segment_ptr = 1
+      is_ptr64 = 1
+      compute_pgm_rsrc1_vgprs = 0
+      compute_pgm_rsrc1_sgprs = 0
+      compute_pgm_rsrc2_user_sgpr = 2
+      kernarg_segment_byte_size = 24
+      wavefront_sgpr_count = 8
+      workitem_vgpr_count = 4
+  .end_amd_kernel_code_t
+
+  // read kernel arguments (register contents once both loads below have completed):
+  // s[0:1] = half *in1  (argument offset 0x00)
+  // s[2:3] = half *in2  (argument offset 0x08)
+  // s[4:5] = half *out  (argument offset 0x10)
+  s_load_dwordx2 s[4:5], s[0:1], 0x10 // issued first: the next load overwrites the s[0:1] base used by both loads
+  s_load_dwordx4 s[0:3], s[0:1], 0x00
+
+  v_lshlrev_b32 v0, 1, v0 // v0 = v0 * 2, a byte offset for 2-byte elements (v0 presumably holds the work-item id on entry - confirm)
+  s_waitcnt     0 // wait for the argument loads before s[0:5] are used
+
+  // v[1:2] = &in1[i]  (64-bit add: low dword + offset, carry into the high dword)
+  v_add_u32     v1, vcc, s0, v0 // low dword, carry-out in vcc
+  v_mov_b32     v2, s1
+  v_addc_u32    v2, vcc, v2, 0, vcc // high dword plus carry
+  flat_load_ushort v3, v[1:2] // v3 = in1[i]
+
+  // v[1:2] = &in2[i]  (same pattern with the in2 base in s[2:3])
+  v_add_u32     v1, vcc, s2, v0
+  v_mov_b32     v2, s3
+  v_addc_u32    v2, vcc, v2, 0, vcc  
+  flat_load_ushort v2, v[1:2] // v2 = in2[i]; v2 is both the address high dword and the load destination
+
+  // v[0:1] = &out[i]  (v0 is overwritten here; the element offset is not needed afterwards)
+  v_add_u32     v0, vcc, s4, v0
+  v_mov_b32     v1, s5
+  v_addc_u32    v1, vcc, v1, 0, vcc
+
+  // wait for memory operations to complete, so v3 and v2 hold the loaded values
+  s_waitcnt     0
+
+  v_add_f16     v3, v3, v2 // v3 = in1[i] + in2[i]
+
+  flat_store_short v[0:1], v3 // out[i] = v3
+  s_endpgm // end of the kernel