Skip to content
This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Permalink
Add fp16 vector add asm examples
Browse files Browse the repository at this point in the history
  • Loading branch information
Kirpich30000 committed Aug 5, 2016
1 parent a842dbc commit c00bbe4
Show file tree
Hide file tree
Showing 7 changed files with 534 additions and 0 deletions.
70 changes: 70 additions & 0 deletions examples/common/half.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

// Naive implementation of float <-> half conversion.
// Inf, NaN and denormals are not supported; the mantissa is truncated, not rounded.

#include "half.h"

#include <cstring>

// Converts a single-precision float to the bit pattern of a half-precision
// (binary16) value.
//
// Only zero and values that are normal numbers in half precision are handled:
// inf, NaN, denormals and out-of-range exponents produce meaningless bits.
// The 13 low mantissa bits are dropped (truncation, no rounding).
//
// f: value to convert.
// Returns the 16-bit encoding: sign (bit 15), exponent (bits 14-10),
// mantissa (bits 9-0).
uint16_t f2h(float f)
{
  // +0.0f and -0.0f both compare equal to zero and both map to half +0.
  if (f == 0.0f)
    return 0;

  // Copy the bit pattern out with memcpy: reading a float object through a
  // uint32_t reference violates the strict-aliasing rule (undefined behaviour).
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));

  // Sign: float bit 31 -> half bit 15.
  const uint32_t sign = (bits >> 16) & 0x8000;
  // Mantissa: keep the top 10 of the 23 float mantissa bits.
  const uint32_t mantissa = (bits >> 13) & 0x03ff;
  // Exponent: rebias from 127 to 15 (0x38000000 == (127 - 15) << 23), then
  // move it into half bits 14-10.
  const uint32_t exponent = (((bits & 0x7f800000) - 0x38000000) >> 13) & 0x7c00;

  return static_cast<uint16_t>(sign | exponent | mantissa);
}

// Converts a half-precision (binary16) bit pattern to a single-precision float.
//
// Only zero and normal half values are handled: inf, NaN and denormal
// encodings are not interpreted specially and yield meaningless results.
//
// h: 16-bit encoding (sign bit 15, exponent bits 14-10, mantissa bits 9-0).
// Returns the corresponding float; both +0 and -0 encodings return +0.0f.
float h2f(uint16_t h)
{
  // Exponent and mantissa all zero: signed zero, returned as +0.0f.
  if (!(h & 0x7fff))
    return 0.0f;

  const uint32_t half_bits = h;
  const uint32_t sign = half_bits & 0x8000;
  // Rebias the exponent from 15 to 127 while it is still in half position
  // (0x1c000 == (127 - 15) << 10) and re-attach the 10 mantissa bits.
  const uint32_t exp_mant = ((half_bits & 0x7c00) + 0x1c000) | (half_bits & 0x3ff);
  // Sign: half bit 15 -> float bit 31; exponent/mantissa shift up by 13.
  const uint32_t bits = (sign << 16) | (exp_mant << 13);

  // Copy the bit pattern in with memcpy: writing a float object through a
  // uint32_t reference violates the strict-aliasing rule (undefined behaviour).
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

51 changes: 51 additions & 0 deletions examples/common/half.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

// Include guard. The name avoids a double underscore, which is reserved for
// the implementation in C++.
#ifndef HALF_H_
#define HALF_H_

#include <stdint.h>

// Converts a float to the 16-bit encoding of a half-precision value.
uint16_t f2h(float f);

// Converts the 16-bit encoding of a half-precision value to a float.
float h2f(uint16_t h);

#endif // HALF_H_
2 changes: 2 additions & 0 deletions examples/gfx8/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,6 @@
asm_dispatch_example(ds_bpermute)
asm_dispatch_example(dpp_reduce)
asm_dispatch_example(s_memrealtime)
# fp16 vector-add examples.
# NOTE(review): asm_dispatch_example is defined outside this fragment; presumably
# it builds <name>.s together with <name>.cpp in this directory - confirm.
asm_dispatch_example(fp16_storage)
asm_dispatch_example(fp16_native)
inline_asm_dispatch_example(s_memrealtime_inline)
102 changes: 102 additions & 0 deletions examples/gfx8/fp16_native.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#include "dispatch.hpp"
#include "half.h"

using namespace amd::dispatch;

class HalfVectorAdd : public Dispatch {
private:
Buffer* in1;
Buffer* in2;
Buffer* out;
unsigned length;

public:
HalfVectorAdd(int argc, const char **argv)
: Dispatch(argc, argv), length(64) { }

bool SetupCodeObject() override {
return LoadCodeObjectFromFile("fp16_native.co");
}

bool Setup() override {
if (!AllocateKernarg(3 * sizeof(Buffer*))) { return false; }
in1 = AllocateBuffer(length * sizeof(float) / 2);
in2 = AllocateBuffer(length * sizeof(float) / 2);
for (unsigned i = 0; i < length; ++i) {
in1->Data<uint16_t>(i) = f2h(i);
in2->Data<uint16_t>(i) = f2h(i * 1.25f);
}
if (!CopyTo(in1)) { output << "Error: failed to copy to local" << std::endl; return false; }
if (!CopyTo(in2)) { output << "Error: failed to copy to local" << std::endl; return false; }
out = AllocateBuffer(length * sizeof(float)/2);
Kernarg(in1);
Kernarg(in2);
Kernarg(out);
SetGridSize(64);
SetWorkgroupSize(64);
return true;
}

bool Verify() override {
if (!CopyFrom(out)) { output << "Error: failed to copy from local" << std::endl; return false; }
bool ok = true;
for (unsigned i = 0; i < length; ++i) {
float f1 = h2f(in1->Data<uint16_t>(i));
float f2 = h2f(in2->Data<uint16_t>(i));
float res = h2f(out->Data<uint16_t>(i));
float expected = h2f(f2h(f1 + f2));
if (expected != res){
output << "Error: validation failed at " << i << ": got " << res << " expected " << expected << std::endl;
ok = false;
}
}
return ok;
}
};

int main(int argc, const char** argv)
{
return HalfVectorAdd(argc, argv).RunMain();
}
100 changes: 100 additions & 0 deletions examples/gfx8/fp16_native.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2016, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

//
// Vector add example using fp16 storage data type and fp16 add instruction
//

// Code object version 2.0, targeting ISA version 8.0.3
// (vendor "AMD", architecture "AMDGPU").
.hsa_code_object_version 2,0
.hsa_code_object_isa 8, 0, 3, "AMD", "AMDGPU"

.text
// Align what follows (the kernel entry) to 2^8 = 256 bytes.
.p2align 8
// Mark the hello_world symbol as an HSA kernel.
// NOTE(review): the name looks carried over from another example; the host
// side may look the kernel up by this name - confirm before renaming.
.amdgpu_hsa_kernel hello_world

// Kernel: out[i] = in1[i] + in2[i] on 16-bit floats, one element per work-item.
//
// Kernel arguments (24 bytes = three 8-byte pointers):
//   0x00  half *in1
//   0x08  half *in2
//   0x10  half *out
//
// Registers on entry:
//   s[0:1] - used as the base address of the argument loads; presumably the
//            kernarg segment pointer requested by
//            enable_sgpr_kernarg_segment_ptr = 1 (two user SGPRs).
//   v0     - assumed to hold the work-item index i - TODO confirm against the
//            AMDGPU HSA ABI.
//
// Register use: s[0:5] and vcc; v0-v3 (matches workitem_vgpr_count = 4).
hello_world:

.amd_kernel_code_t
enable_sgpr_kernarg_segment_ptr = 1
is_ptr64 = 1
compute_pgm_rsrc1_vgprs = 0
compute_pgm_rsrc1_sgprs = 0
compute_pgm_rsrc2_user_sgpr = 2
kernarg_segment_byte_size = 24
wavefront_sgpr_count = 8
workitem_vgpr_count = 4
.end_amd_kernel_code_t

// read kernel arguments:
// s[0:1] = half *in1
// s[2:3] = half *in2
// s[4:5] = half *out
// The out pointer is loaded first because the second load overwrites s[0:1],
// the base address both loads read from.
s_load_dwordx2 s[4:5], s[0:1], 0x10
s_load_dwordx4 s[0:3], s[0:1], 0x00

// v0 = i * 2: byte offset of element i (each element is 2 bytes)
v_lshlrev_b32 v0, 1, v0
// wait for the argument loads before s[0:5] are used
s_waitcnt 0

// v[1:2] = &in1[i]
// 64-bit add: low dword plus offset, carry out in vcc, then carry into the high dword
v_add_u32 v1, vcc, s0, v0
v_mov_b32 v2, s1
v_addc_u32 v2, vcc, v2, 0, vcc
flat_load_ushort v3, v[1:2] // v3 = in1[i]

// v[1:2] = &in2[i]
v_add_u32 v1, vcc, s2, v0
v_mov_b32 v2, s3
v_addc_u32 v2, vcc, v2, 0, vcc
// the destination v2 reuses the high half of the address pair, which is not needed afterwards
flat_load_ushort v2, v[1:2] // v2 = in2[i]

// v[0:1] = &out[i]
// v0 (the byte offset) is consumed here; it is not needed afterwards
v_add_u32 v0, vcc, s4, v0
v_mov_b32 v1, s5
v_addc_u32 v1, vcc, v1, 0, vcc

// wait for memory operations to complete
// (both flat loads must have returned before v3 and v2 are read)
s_waitcnt 0

v_add_f16 v3, v3, v2 // v3 = in1[i] + in2[i]

flat_store_short v[0:1], v3 // out[i] = v3
s_endpgm
Loading

0 comments on commit c00bbe4

Please sign in to comment.