Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gemv with half precision cannot get correct result. #561

Open
liangzelang opened this issue Oct 11, 2024 · 1 comment
Open

Gemv with half precision cannot get correct result. #561

liangzelang opened this issue Oct 11, 2024 · 1 comment

Comments

@liangzelang
Copy link

liangzelang commented Oct 11, 2024

Following this Issue, I selected the expected high-performance operator, but I ran into a problem: when the data type is float the calculation result is correct, but when the data type is half the calculation result is incorrect.

// API test
#include <CL/cl.h>
#include <clblast.h>
#include <clblast_half.h>
#include <vector>
#include <chrono>
#include <iostream>

// Prints the name, vendor, device version, driver version and OpenCL C
// version of `device` to stdout, one labelled line each.
//
// A property whose query fails is reported on stderr and skipped, so that an
// unfilled buffer is never printed.
static void printDeviceInfo(cl_device_id device) {
    // Queries one string-valued device property and prints it as "<label>: <value>".
    const auto printField = [device](const char* label, cl_device_info param) {
        char buffer[1024] = {};  // zero-initialised so it is always a valid C string
        const cl_int err = clGetDeviceInfo(device, param, sizeof(buffer), buffer, nullptr);
        if (err != CL_SUCCESS) {
            std::cerr << "[TEST] clGetDeviceInfo failed for '" << label << "': " << err << std::endl;
            return;
        }
        std::cout << label << ": " << buffer << std::endl;
    };

    printField("Device Name", CL_DEVICE_NAME);
    printField("Device Vendor", CL_DEVICE_VENDOR);
    printField("Device Version", CL_DEVICE_VERSION);
    printField("Driver Version", CL_DRIVER_VERSION);
    printField("OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
}

static void ConvertToHalf(const std::vector<float>& input, std::vector<cl_half>& output) {
    output.reserve(input.size());
    output.resize(input.size());
    for (size_t i = 0; i < input.size(); ++i) {
        output[i] = FloatToHalf(input[i]);
    }
}

static void ConvertToFloat(const std::vector<cl_half>& input, std::vector<float>& output) {
    output.reserve(input.size());
    output.resize(input.size());
    for (size_t i = 0; i < input.size(); ++i) {
        output[i] = HalfToFloat(input[i]);
    }
}

// Placeholder: no GEMM test is implemented; the body is intentionally empty.
void test_gemm() {}

// Runs a single-precision, row-major, transposed GEMV through CLBlast on an
// m x n matrix filled with 1.1 and an input vector filled with 1.0, times a
// fixed number of runs, and prints the first few output values.
//
// device  - device whose "XgemvFast" single-precision parameters are overridden
// context - OpenCL context in which the three buffers are created
// queue   - command queue on which GEMV and the read-back are executed
//
// Any OpenCL/CLBlast failure is reported on stderr; after a failed buffer
// creation, GEMV call or read-back the function releases its buffers and
// returns without printing results.
void test_gemv(cl_device_id device, cl_context context, cl_command_queue queue)
{
    const size_t m = 16384;
    const size_t n = 2048;
    const int iterations = 10;   // timed runs; the first one is treated as warm-up
    const size_t shown = 10;     // number of output values to print

    // For the transposed product the input vector has m elements and the output has n.
    std::vector<float> host_a(m * n, 1.1f);
    std::vector<float> host_x(m, 1.0f);
    std::vector<float> host_y(n, 0.0f);

    cl_int err_a = CL_SUCCESS;
    cl_int err_x = CL_SUCCESS;
    cl_int err_y = CL_SUCCESS;
    cl_mem a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, host_a.size() * sizeof(float), host_a.data(), &err_a);
    cl_mem x_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, host_x.size() * sizeof(float), host_x.data(), &err_x);
    // GEMV computes y = alpha*op(A)*x + beta*y, i.e. y is also an input. Create it
    // read-write and initialised so the beta*y term never touches undefined memory.
    cl_mem y_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, host_y.size() * sizeof(float), host_y.data(), &err_y);

    // Releases whichever buffers were successfully created.
    const auto releaseBuffers = [&]() {
        if (a_buffer != nullptr) { clReleaseMemObject(a_buffer); }
        if (x_buffer != nullptr) { clReleaseMemObject(x_buffer); }
        if (y_buffer != nullptr) { clReleaseMemObject(y_buffer); }
    };

    if (err_a != CL_SUCCESS || err_x != CL_SUCCESS || err_y != CL_SUCCESS) {
        std::cerr << "[TEST] clCreateBuffer error (a, x, y): " << err_a << ", " << err_x << ", " << err_y << std::endl;
        releaseBuffers();
        return;
    }
    clFinish(queue); // Ensure memory operations are complete

    using Parameters = std::unordered_map<std::string, size_t>;

    // params
    Parameters tuned_params = {
        {"WGS1", 64},
        {"WPT1", 1},
        {"VW2", 1},
        {"WGS2", 64},
        {"WPT2",1}
    };

    // use tuned params; a rejected override is only a warning, CLBlast then keeps its own values
    const auto override_status = clblast::OverrideParameters(device, "XgemvFast", clblast::Precision::kSingle, tuned_params);
    if (override_status != clblast::StatusCode::kSuccess) {
        std::cerr << "[TEST] OverrideParameters error: " << static_cast<int>(override_status) << std::endl;
    }

    // Performance measurement
    double steadyTime = 0.0;  // sum over the runs counted in the average
    int steadyRuns = 0;
    for (int i = 0; i < iterations; i++) {
        const auto start = std::chrono::steady_clock::now();

        const auto status = clblast::Gemv<float>(clblast::Layout::kRowMajor, clblast::Transpose::kYes, m, n,
                        1.0f,
                        a_buffer, 0, n,
                        x_buffer, 0, 1,
                        0.0f,
                        y_buffer, 0, 1,
                        &queue, nullptr);
        if (status != clblast::StatusCode::kSuccess) {
            std::cerr << "[TEST] Gemv error: " << static_cast<int>(status) << std::endl;
            releaseBuffers();
            return;  // nothing valid to time or print
        }

        clFinish(queue);
        const auto elapsed_time = std::chrono::duration<double,std::milli>(std::chrono::steady_clock::now() - start).count();
        std::cout << "No. "<< i << " GEMV execution time: " << elapsed_time << " ms" << std::endl;

        // The first run also pays for one-time kernel compilation, so it is
        // left out of the average unless it is the only run.
        if (i > 0 || iterations == 1) {
            steadyTime += elapsed_time;
            ++steadyRuns;
        }
    }

    const double averageTime = steadyTime / steadyRuns;
    std::cout << "GEMV average execution time (excluding warm-up run): " << averageTime << " ms" << std::endl;

    // Result (blocking read, so host_y is complete when the call returns)
    const cl_int read_err = clEnqueueReadBuffer(queue, y_buffer, CL_TRUE, 0, host_y.size() * sizeof(float), host_y.data(), 0, nullptr, nullptr);
    if (read_err != CL_SUCCESS) {
        std::cerr << "[TEST] clEnqueueReadBuffer error: " << read_err << std::endl;
        releaseBuffers();
        return;
    }

    std::cout << "Result : ";
    for (size_t i = 0; i < shown && i < host_y.size(); ++i) {
        std::cout << host_y[i] << " ";
    }
    std::cout << std::endl;

    // release
    releaseBuffers();
}

// Half-precision counterpart of the float GEMV benchmark: converts an m x n
// matrix of 1.1 and a vector of 1.0 to cl_half, runs a row-major, transposed
// GEMV through CLBlast a fixed number of times, and prints the first few
// output values converted back to float.
//
// device  - only referenced by the (disabled) parameter override below
// context - OpenCL context in which the three buffers are created
// queue   - command queue on which GEMV and the read-back are executed
//
// NOTE(review): every product here is about 1.1 and m = 16384 of them are
// summed per output. If the kernel accumulates in fp16, the running sum
// presumably stalls at 4096: the spacing between adjacent fp16 values there is
// 4, so 4096 + ~1.1 rounds back to 4096. That would yield an output of exactly
// 4096 without any overflow — confirm against the kernel's accumulator type.
//
// Any OpenCL/CLBlast failure is reported on stderr; after a failed buffer
// creation, GEMV call or read-back the function releases its buffers and
// returns without printing results.
void test_gemv_half(cl_device_id device, cl_context context, cl_command_queue queue)
{
    const size_t m = 16384;
    const size_t n = 2048;
    const int iterations = 10;   // timed runs; the first one is treated as warm-up
    const size_t shown = 10;     // number of output values to print

    // For the transposed product the input vector has m elements and the output has n.
    std::vector<float> host_a_float(m * n, 1.1f);
    std::vector<float> host_x_float(m, 1.0f);
    std::vector<float> host_y_float(n, 0.0f);

    std::vector<cl_half> host_a;
    std::vector<cl_half> host_x;
    std::vector<cl_half> host_y(n, FloatToHalf(0.0f));

    ConvertToHalf(host_a_float, host_a);
    ConvertToHalf(host_x_float, host_x);

    cl_int err_a = CL_SUCCESS;
    cl_int err_x = CL_SUCCESS;
    cl_int err_y = CL_SUCCESS;
    cl_mem a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, host_a.size() * sizeof(cl_half), host_a.data(), &err_a);
    cl_mem x_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, host_x.size() * sizeof(cl_half), host_x.data(), &err_x);
    // GEMV computes y = alpha*op(A)*x + beta*y, i.e. y is also an input. Create it
    // read-write and initialised so the beta*y term never touches undefined memory.
    cl_mem y_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, host_y.size() * sizeof(cl_half), host_y.data(), &err_y);

    // Releases whichever buffers were successfully created.
    const auto releaseBuffers = [&]() {
        if (a_buffer != nullptr) { clReleaseMemObject(a_buffer); }
        if (x_buffer != nullptr) { clReleaseMemObject(x_buffer); }
        if (y_buffer != nullptr) { clReleaseMemObject(y_buffer); }
    };

    if (err_a != CL_SUCCESS || err_x != CL_SUCCESS || err_y != CL_SUCCESS) {
        std::cerr << "[TEST] clCreateBuffer error (a, x, y): " << err_a << ", " << err_x << ", " << err_y << std::endl;
        releaseBuffers();
        return;
    }
    clFinish(queue); // Ensure memory operations are complete

    // params
    using Parameters = std::unordered_map<std::string, size_t>;
    Parameters tuned_params = {
        {"WGS1", 64},
        {"WPT1", 1},
        {"VW2", 1},
        {"WGS2", 64},
        {"WPT2",1}
    };

    // use tuned params (left disabled: tuned_params is not applied to the half-precision kernel)
    // clblast::OverrideParameters(device, "XgemvFast", clblast::Precision::kHalf, tuned_params);

    // Performance measurement
    double steadyTime = 0.0;  // sum over the runs counted in the average
    int steadyRuns = 0;
    for (int i = 0; i < iterations; i++) {
        const auto start = std::chrono::steady_clock::now();

        const auto status = clblast::Gemv<cl_half>(clblast::Layout::kRowMajor, clblast::Transpose::kYes, m, n,
                        FloatToHalf(1.0f),
                        a_buffer, 0, n,
                        x_buffer, 0, 1,
                        FloatToHalf(0.0f),
                        y_buffer, 0, 1,
                        &queue, nullptr);
        if (status != clblast::StatusCode::kSuccess) {
            std::cerr << "[TEST] Gemv error: " << static_cast<int>(status) << std::endl;
            releaseBuffers();
            return;  // nothing valid to time or print
        }

        clFinish(queue);
        const auto elapsed_time = std::chrono::duration<double,std::milli>(std::chrono::steady_clock::now() - start).count();
        std::cout << "No. "<< i << " GEMV execution time: " << elapsed_time << " ms" << std::endl;

        // The first run also pays for one-time kernel compilation, so it is
        // left out of the average unless it is the only run.
        if (i > 0 || iterations == 1) {
            steadyTime += elapsed_time;
            ++steadyRuns;
        }
    }

    const double averageTime = steadyTime / steadyRuns;
    std::cout << "GEMV average execution time (excluding warm-up run): " << averageTime << " ms" << std::endl;

    // Result (blocking read, so host_y is complete when the call returns)
    const cl_int read_err = clEnqueueReadBuffer(queue, y_buffer, CL_TRUE, 0, host_y.size() * sizeof(cl_half), host_y.data(), 0, nullptr, nullptr);
    if (read_err != CL_SUCCESS) {
        std::cerr << "[TEST] clEnqueueReadBuffer error: " << read_err << std::endl;
        releaseBuffers();
        return;
    }
    ConvertToFloat(host_y, host_y_float);

    std::cout << "Result : ";
    for (size_t i = 0; i < shown && i < host_y_float.size(); ++i) {
        std::cout << host_y_float[i] << " ";
    }
    std::cout << std::endl;

    // release
    releaseBuffers();
}

int main() {

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;

    clGetPlatformIDs(1, &platform, nullptr);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
    printDeviceInfo(device);
    context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
    queue = clCreateCommandQueue(context, device, 0, nullptr);

    test_gemv(device, context, queue);
    test_gemv_half(device, context, queue);

    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}

I built with the VERBOSE=ON compile option and got the output log below; the fp16 output is obviously incorrect. I suspect data overflow, but 18000 does not actually exceed the range of FP16, so do you have any ideas?

[DEBUG] Compiling routine 'GEMV-32 (single)'
[DEBUG] Completed compilation in 150.38 ms
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 13.41 ms
No. 0 GEMV execution time: 169.471 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.29 ms
No. 1 GEMV execution time: 4.36427 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.34 ms
No. 2 GEMV execution time: 4.44021 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.20 ms
No. 3 GEMV execution time: 4.30474 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.79 ms
No. 4 GEMV execution time: 5.02594 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.48 ms
No. 5 GEMV execution time: 4.85479 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.64 ms
No. 6 GEMV execution time: 4.93505 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.87 ms
No. 7 GEMV execution time: 5.31823 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 5.06 ms
No. 8 GEMV execution time: 5.23974 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_1_64_1_XgemvFastRot_8_32_32_TrsvRoutine_32
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.83 ms
No. 9 GEMV execution time: 5.00667 ms
GEMV execution time: 21.2961 ms
Result : 18019.4 18019.4 18019.4 18019.4 18019.4 18019.4 18019.4 18019.4 18019.4 18019.4
[DEBUG] Searching database for kernel 'Xgemv'
[DEBUG] Device type 'GPU'; vendor 'QUALCOMM'
[DEBUG] Device name 'QUALCOMM Adreno(TM) 750'; architecture 'OpenCL C 3.0 Adreno(TM) 750'
[DEBUG] Found architectures of vendor 'QUALCOMM' and type 'GPU'
[DEBUG] Found devices of architecture type 'default'
[DEBUG] Found parameters for device type 'default'
[DEBUG] Searching database for kernel 'XgemvFast'
[DEBUG] Device type 'GPU'; vendor 'QUALCOMM'
[DEBUG] Device name 'QUALCOMM Adreno(TM) 750'; architecture 'OpenCL C 3.0 Adreno(TM) 750'
[DEBUG] Found architectures of vendor 'QUALCOMM' and type 'GPU'
[DEBUG] Found devices of architecture type 'default'
[DEBUG] Found parameters for device type 'default'
[DEBUG] Searching database for kernel 'XgemvFastRot'
[DEBUG] Device type 'GPU'; vendor 'QUALCOMM'
[DEBUG] Device name 'QUALCOMM Adreno(TM) 750'; architecture 'OpenCL C 3.0 Adreno(TM) 750'
[DEBUG] Found architectures of vendor 'QUALCOMM' and type 'GPU'
[DEBUG] Found devices of architecture type 'default'
[DEBUG] Found parameters for device type 'default'
[DEBUG] Searching database for kernel 'TrsvRoutine'
[DEBUG] Device type 'GPU'; vendor 'QUALCOMM'
[DEBUG] Device name 'QUALCOMM Adreno(TM) 750'; architecture 'OpenCL C 3.0 Adreno(TM) 750'
[DEBUG] Found architectures of vendor 'default' and type 'default'
[DEBUG] Found devices of architecture type 'default'
[DEBUG] Found parameters for device type 'default'
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Compiling routine 'GEMV-16 (half)'
[DEBUG] Completed compilation in 125.75 ms
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 14.59 ms
No. 0 GEMV execution time: 149.68 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.91 ms
No. 1 GEMV execution time: 4.99323 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.62 ms
No. 2 GEMV execution time: 4.75443 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.75 ms
No. 3 GEMV execution time: 4.83063 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.66 ms
No. 4 GEMV execution time: 4.86604 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.80 ms
No. 5 GEMV execution time: 4.88469 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.69 ms
No. 6 GEMV execution time: 4.83239 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 5.17 ms
No. 7 GEMV execution time: 5.66073 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.88 ms
No. 8 GEMV execution time: 5.12714 ms
[DEBUG] GEMV_Xgemv_64_1_XgemvFast_4_32_4_XgemvFastRot_8_16_8_TrsvRoutine_24
[DEBUG] Running kernel 'XgemvFast'
[DEBUG] Completed kernel in 4.86 ms
No. 9 GEMV execution time: 5.03026 ms
GEMV execution time: 19.4659 ms
Result : 4096 4096 4096 4096 4096 4096 4096 4096 4096 4096
@CNugteren
Copy link
Owner

CNugteren commented Oct 15, 2024

Thanks for sharing the complete test results here. I looked at your code briefly but I don't see anything obviously wrong. A few things to try:

  1. Add a clFinish(queue) after your clEnqueueReadBuffer(queue, ...) calls.
  2. Try with smaller input sizes.
  3. Try with different values.

But perhaps a better thing is to try to run the CLBlast tests themselves. Run CMake with -DTESTS=ON (make sure you have a reference BLAS installed for comparison, e.g. OpenBLAS or MKL) and then run the appropriate test, e.g.:

cmake -S . -B build -DTESTS=ON -DTUNERS=ON -DCLIENTS=ON
cmake --build build
./build/clblast_test_xgemv

For example on my test device the output would look like this (for the HGEMV part):

* Running on OpenCL device 'Intel(R) UHD Graphics 620 [0x3ea0]'.
* Starting tests for the 'HGEMV' routine. Legend:
   : -> Test produced correct results
   . -> Test returned the correct error code
   X -> Test produced incorrect results
   / -> Test returned an incorrect error code
   \ -> Test not executed: OpenCL-kernel compilation error
   o -> Test not executed: Unsupported precision
   - -> Test not completed: Reference CBLAS doesn't output error codes
* Testing with error margins of 8.0% (relative) and 0.150 (absolute)
* and a combined maximum allowed L2 error of 5.00e-02
* Testing 'regular behaviour' for '101 (row-major) 111 (regular)':
   ::::::::::::::::::-:-:-:-:-:-:-:-:-:::::::::::::::::::-:-:-:-:-:
   -:-:-:-:
   Pass rate  75.0%: 54 passed / 18 skipped / 0 failed
* Testing 'regular behaviour' for '101 (row-major) 112 (transposed)':
   ::::::::::::::::::-:-:-:-:-:-:-:-:-:::::::::::::::::::-:-:-:-:-:
   -:-:-:-:
   Pass rate  75.0%: 54 passed / 18 skipped / 0 failed
* Testing 'regular behaviour' for '102 (col-major) 111 (regular)':
   ::::::::::::::::::::::::::::::::::::-:-:-:-:-:-:-:-:-:-:-:-:-:-:
   -:-:-:-:
   Pass rate  75.0%: 54 passed / 18 skipped / 0 failed
* Testing 'regular behaviour' for '102 (col-major) 112 (transposed)':
   ::::::::::::::::::::::::::::::::::::-:-:-:-:-:-:-:-:-:-:-:-:-:-:
   -:-:-:-:
   Pass rate  75.0%: 54 passed / 18 skipped / 0 failed
* Completed all test-cases for this routine. Results:
   216 test(s) passed
   72 test(s) skipped
   0 test(s) failed

So no failures here. If you run that, at least we know if the problem is in your test code or in the combination of CLBlast with your device.

BTW, you can also test speed with the CLBlast 'clients':

./build/clblast_client_xgemv -m 16384 -n 2048 --precision 16

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants