From 03ff63a5800b144201cf390c0fab4bb817e1df81 Mon Sep 17 00:00:00 2001 From: Zhihua Dong Date: Fri, 17 Jun 2022 16:30:29 -0400 Subject: [PATCH 1/5] change rand kernel to fix wg size. also S*R kernle --- inc/WireCellGenSycl/SyclArray.h | 191 ++++++++++++++---------------- src/BinnedDiffusion_transform.cxx | 16 ++- src/ImpactTransform.cxx | 23 ++-- test/test_syclarray.cpp | 10 +- 4 files changed, 120 insertions(+), 120 deletions(-) diff --git a/inc/WireCellGenSycl/SyclArray.h b/inc/WireCellGenSycl/SyclArray.h index fa1b654..5ebe544 100644 --- a/inc/WireCellGenSycl/SyclArray.h +++ b/inc/WireCellGenSycl/SyclArray.h @@ -22,74 +22,102 @@ namespace WireCell { template < class T > class Array1D { + struct A1d_s { + T* ptr; + size_t sz ; + } ; public : Array1D() { - q_ = GenSycl::SyclEnv::get_queue() ; - ptr_ = NULL ; - sz_ =0 ; + // q_ = GenSycl::SyclEnv::get_queue() ; + a_.ptr = NULL ; + a_.sz =0 ; } //~Array1D( ) { sycl::free( ptr_, q_ ) ; } ; ~Array1D( ) { } ; Array1D( T* ptr, size_t N ) { - ptr_ = ptr ; - sz_ = N ; - + a_.ptr = ptr ; + a_.sz = N ; + } + Array1D( struct A1d_s a ) { + a_.ptr = a.ptr ; + a_.sz = a.sz ; + } + auto a() { + return a_ ; } - void reset(){ sycl::free(ptr_, q_) ; ptr_=NULL ; sz_=0 ;}; + void reset(){ + auto q = GenSycl::SyclEnv::get_queue(); + sycl::free(a_.ptr, q) ; + a_.ptr=NULL ; + a_.sz=0 ; + }; Array1D ( size_t N , bool init = true ) { - q_ = GenSycl::SyclEnv::get_queue() ; - ptr_ = sycl::malloc_device ( N, q_ ) ; + auto q = GenSycl::SyclEnv::get_queue() ; + a_.ptr = sycl::malloc_device ( N, q ) ; if(init) { - q_.memset(ptr_, 0 , N*sizeof(T) ) ; + q.memset(a_.ptr, 0 , N*sizeof(T) ) ; } - sz_ = N ; + a_.sz = N ; } - T & operator[] (size_t i) const { return ptr_[i] ; }; - T & operator[] (size_t i) { return ptr_[i] ; }; - T &operator()(size_t i)const { return ptr_[i] ; }; - T &operator()(size_t i) { return ptr_[i] ; }; + T & operator[] (size_t i) const { return a_.ptr[i] ; }; + T & operator[] (size_t i) { return a_.ptr[i] ; }; + T &operator()(size_t i)const { return a_.ptr[i] ; }; + T &operator()(size_t i) { return a_.ptr[i] ; }; void set( T value ) { - auto ptr = ptr_ ; - q_.parallel_for( sz_ ,[=] (auto i ) { + auto ptr = a_.ptr ; + auto size = a_.sz ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.parallel_for( size ,[=] (auto i ) { ptr[i] = value ; }).wait() ; } void copy_to( T* d_ptr) { - q_.memcpy(d_ptr , ptr_ , sz_* sizeof(T) ).wait() ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy(d_ptr , a_.ptr , a_.sz* sizeof(T) ).wait() ; } void copy_from( T* s_ptr) { - q_.memcpy( (void * )ptr_ ,(void * ) s_ptr, sz_* sizeof(T) ).wait() ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy( (void * )a_.ptr ,(void * ) s_ptr, a_.sz* sizeof(T) ).wait() ; } void copy_from( void * s_ptr) { - q_.memcpy( ptr_ , s_ptr, sz_* sizeof(T) ).wait() ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy( a_.ptr, s_ptr, a_.sz* sizeof(T) ).wait() ; } T* to_host() { - T* ret = (T*) malloc( sizeof(T) * sz_) ; - q_.memcpy( (void *)ret, (void *)ptr_, sz_* sizeof(T) ).wait() ; + T* ret = (T*) malloc( sizeof(T) * a_.sz) ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy( (void *)ret, (void *)a_.ptr, a_.sz* sizeof(T) ).wait() ; return ret ; } size_t extent( int i ) const { if (i != 0 ){ std::cout<< "Array1D invalid dimmension: "<< i << std::endl ; exit(1) ; - } else return sz_ ; + } else return a_.sz ; } - T * data() const { return ptr_ ; } + T * data() const { return a_.ptr ; } void resize( size_t i ) { - T* ptr1 = sycl::malloc_device ( i , q_) ; - size_t j = sz_ > i ? i : sz_ ; - q_memcpy(ptr1, ptr_ , j * sizeof (T) ).wait() ; - if ( j > sz_ ) q_memset(ptr1+sz_, 0 , (j-sz_) * sizeof(T) ) ; - sycl::free (ptr_, q_) ; - ptr_ = ptr1 ; - sz_ = i ; + auto q = GenSycl::SyclEnv::get_queue() ; + T* ptr1 = sycl::malloc_device ( i , q) ; + size_t j = a_.sz > i ? i : a_.sz ; + q.memcpy(ptr1, a_.ptr , j * sizeof (T) ).wait() ; + if ( j > a_.sz ) q_memset(ptr1+a_.sz, 0 , (j-a_.sz) * sizeof(T) ) ; + sycl::free (a_.ptr, q) ; + a_.ptr = ptr1 ; + a_.sz_ = i ; } - void free() { sycl::free(ptr_,q_) ; } - auto get_queue() {return q_ ; } + void free() { + auto q = GenSycl::SyclEnv::get_queue() ; + sycl::free(a_.ptr,q) ; + } + auto get_queue() { + auto q = GenSycl::SyclEnv::get_queue() ; + return q ; + } + + private : - T* ptr_ ; - size_t sz_ ; - cl::sycl::queue q_ ; + struct A1d_s a_ ; } ; /// A 2D array @@ -97,7 +125,6 @@ namespace WireCell { { public : Array2D() { - q_ = GenSycl::SyclEnv::get_queue() ; ptr_ = NULL ; sz1_ =0 ; sz2_ =0 ; @@ -112,10 +139,10 @@ namespace WireCell { } Array2D ( size_t N , size_t M , bool init = true ) { - q_ = GenSycl::SyclEnv::get_queue() ; - ptr_ = sycl::malloc_device ( N*M, q_ ) ; + auto q = GenSycl::SyclEnv::get_queue() ; + ptr_ = sycl::malloc_device ( N*M, q ) ; if(init) { - q_.memset(ptr_, 0 , N*M*sizeof(T)) ; + q.memset(ptr_, 0 , N*M*sizeof(T)) ; } sz1_ = N ; sz2_ = M ; @@ -124,34 +151,39 @@ namespace WireCell { ptr_ = ar.data() ; sz1_ = ar.extent(0) ; sz2_ = ar.extent(1) ; - q_ = ar.get_queue() ; } void alloc(size_t N, size_t M) { - ptr_ = sycl::malloc_device ( N*M, q_ ) ; + auto q = GenSycl::SyclEnv::get_queue() ; + ptr_ = sycl::malloc_device ( N*M, q ) ; sz1_ = N ; sz2_ = M ; } T& operator()(size_t i, size_t j ) const { return ptr_[j*sz1_ + i ] ; }; void set( const T value ) { + auto q = GenSycl::SyclEnv::get_queue() ; auto ptr = ptr_ ; auto v = value ; - q_.parallel_for( sz1_*sz2_ ,[=] (auto i ) { + q.parallel_for( sz1_*sz2_ ,[=] (auto i ) { ptr[i] = v ; }).wait() ; } void copy_to( T* d_ptr) { - q_.memcpy(d_ptr , ptr_ , sz1_*sz2_* sizeof(T) ).wait() ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy(d_ptr , ptr_ , sz1_*sz2_* sizeof(T) ).wait() ; } void copy_from( T* s_ptr) { - q_.memcpy( ptr_ ,(void*) s_ptr, sz1_*sz2_* sizeof(T) ).wait() ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy( ptr_ ,(void*) s_ptr, sz1_*sz2_* sizeof(T) ).wait() ; } void copy_from( void* s_ptr) { - q_.memcpy( ptr_ , s_ptr, sz1_*sz2_* sizeof(T) ).wait() ; + auto q = GenSycl::SyclEnv::get_queue() ; + q.memcpy( ptr_ , s_ptr, sz1_*sz2_* sizeof(T) ).wait() ; } T* to_host() { + auto q = GenSycl::SyclEnv::get_queue() ; T* ret = (T*) malloc( sizeof(T) * sz1_*sz2_) ; - q_.memcpy( ret, ptr_, sz1_*sz2_ * sizeof(T) ).wait() ; + q.memcpy( ret, ptr_, sz1_*sz2_ * sizeof(T) ).wait() ; return ret ; } @@ -165,27 +197,33 @@ namespace WireCell { } T * data() const { return ptr_ ; } void resize( size_t i, size_t j, T value ) { - T* ptr1 = sycl::malloc_device ( i*j , q_) ; + auto q = GenSycl::SyclEnv::get_queue() ; + T* ptr1 = sycl::malloc_device ( i*j , q) ; auto sz1 = sz1_ ; auto sz2 = sz2_ ; auto ptr = ptr_ ; - q_.parallel_for( {i , j} , [=] ( auto item ) { + q.parallel_for( {i , j} , [=] ( auto item ) { auto ii = item.get_id(0) ; auto jj = item.get_id(1) ; ptr1[ii + jj * i ] = (ii < sz1 && jj class WComplex //try own with simple wc needed method only - { - public : - WComplex() { x_=0 ; y_=0 ; } - ~WComplex() {} ; - WComplex( T a , T b) { x_= a; y_= b; } - WComplex(const T a) {x_= a; y_ = 0.0 ; } - WComplex(const WComplex & a ) { - x_ = a.Real() ; - y_ = a.Imag() ; - } - T Real()const { return x_ ; } - T Imag()const { return y_ ; } - - - WComplex & operator+ ( const WComplex & a) { - WComplex ret(x_+ a.Real(), y_ + a.Imag() ) ; - return ret ; - } - void operator *= (const WComplex & a ) { - x_ = x_ * a.Real() - y_ * a.Imag() ; - y_ = x_ * a.Imag() + y_ * a.Real() ; - } - void operator /= (const T& a ) { - if(a == 0.0 ) { - printf( "divided by 0\n" ) ; - } else { - x_ /= a ; - y_ /= a ; - } - } - WComplex & operator- ( const WComplex & a ) { - WComplex ret(x_- a.Real(), y_ - a.Imag() ) ; - return ret ; - } - - void operator = ( const WComplex & a ) { - x_ = a.Real() ; - y_ = a.Imag() ; - } - - -// private : - T x_ ; - T y_ ; - }; - typedef Array1D array_xf; /// A complex, 1D array diff --git a/src/BinnedDiffusion_transform.cxx b/src/BinnedDiffusion_transform.cxx index 7b4e912..afd5e2a 100644 --- a/src/BinnedDiffusion_transform.cxx +++ b/src/BinnedDiffusion_transform.cxx @@ -253,7 +253,10 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array auto patch_d_ptr = patch_d.data() ; // make a 1Darray pointing to random numbers unsigned long size = result[0] ; + wg_size = 32 ; if( size % 2 ) size++ ; + n_wg = (size/2 + wg_size -1 )/wg_size ; + size = n_wg*wg_size*2 ; SyclArray::Array1D normals(size, false) ; auto normals_ptr = normals.data() ; // temp space to hold uniform rn @@ -264,10 +267,15 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array oneapi::mkl::rng::uniform distr(0.0, 1.0); oneapi::mkl::rng::generate(distr, engine, size, temp_ur); //box muller approx to normal distribution - q.parallel_for(size/2 , [=](auto i) { + //q.parallel_for(size/2 , [=](auto i) { + // normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); + // normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); + // } ).wait() ; + q.parallel_for(sycl::nd_range<1>(size/2, wg_size) , [=](sycl::nd_item<1> item) { + auto i = item.get_global_linear_id() ; normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); - } ) ; + } ).wait() ; sycl::free(temp_ur, q) ; t1 = omp_get_wtime() ; @@ -351,6 +359,7 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array set_sampling_bat( npatches, nt_d, np_d, patch_idx, pvecs_d , tvecs_d, patch_d, normals,gdata ) ; // std::cout << "patch_d: " << typeid(patch_d).name() << std::endl; // std::cout << "patch_idx: " << typeid(patch_idx).name() << std::endl; + q.wait() ; wend = omp_get_wtime(); g_get_charge_vec_time_part4 += wend-wstart ; std::cout<<"SetSample: SetSamplingBatch :"<(end_pitch - start_pitch, m_end_tick - m_start_tick);; m_bd.get_charge_matrix_sycl(f_data, m_vec_impact, start_pitch, m_start_tick); -// std::cout << SyclArray::dump_2d_view(f_data, 10); double wend = omp_get_wtime(); td1 += wend-wstart ; g_get_charge_vec_time += wend - wstart; - //log->debug("ImpactTransform::ImpactTransform() : get_charge_vec() Total running time : {}", g_get_charge_vec_time); log->debug("ImpactTransform::ImpactTransform() : get_charge_matrix() Total_Time : {}", g_get_charge_vec_time); wstart = omp_get_wtime(); - //SyclArray::array_xxc acc_data_f_w = SyclArray::Zero(end_ch - start_ch + 2 * npad_wire, m_end_tick - m_start_tick); SyclArray::array_xxc acc_data_f_w(end_ch - start_ch + 2 * npad_wire, m_end_tick - m_start_tick); SyclArray::array_xxf acc_data_t_w = SyclArray::Zero(end_ch - start_ch + 2 * npad_wire, m_end_tick - m_start_tick); log->info("yuhw: pitch {} {} {} {}",start_pitch, end_pitch, f_data.extent(0), f_data.extent(1)); @@ -393,8 +390,6 @@ bool GenSycl::ImpactTransform::transform_matrix() auto ip = m_pir->closest(impact); Waveform::compseq_t sp_f = ip->spectrum(); - - //memcpy((void*) &sps_h(0, jimp) , (void*) &sp_f[0] , sp_size*2*sizeof(float) ) ; memcpy((void*) &sps_h[jimp * sp_size] , (void*) &sp_f[0] , sp_size*sizeof(std::complex) ) ; // 0 and right on the left; left on the right @@ -432,22 +427,22 @@ bool GenSycl::ImpactTransform::transform_matrix() auto data_d = SyclArray::dft_rc(f_data, 0); auto data_d_ptr = data_d.data() ; f_data.free() ; + q.wait(); SyclArray::array_xxc data_c(data_d.extent(0), data_d.extent(1) ) ; SyclArray::dft_cc(data_d, data_c, 1); auto data_c_ptr = data_c.data() ; + q.wait(); SyclArray::dft_cc(resp_f_w_k, data_d, 1); resp_f_w_k.free() ; - // S(f) * R(f) - q.parallel_for( {data_c.extent(0), data_c.extent(1)} , [=] (auto item ) { - auto i0 = item.get_id(0) ; - auto i1 = item.get_id(1) ; - auto d0 = item.get_range(0) ; - float a = data_c_ptr[i0 + i1*d0].x * data_d_ptr[i0 + i1*d0].x - data_c_ptr[i0 + i1*d0].y * data_d_ptr[i0 + i1*d0].y ; - float b = data_c_ptr[i0 + i1*d0].x * data_d_ptr[i0 + i1*d0].y + data_c_ptr[i0 + i1*d0].y * data_d_ptr[i0 + i1*d0].x ; - data_c_ptr[i0 + i1*d0].x = a ; - data_c_ptr[i0 + i1*d0].y = b ; + + // S(f) * R(f) + q.parallel_for( data_c.extent(0)*data_c.extent(1) , [=] (auto i0 ) { + float a = data_c_ptr[i0].x * data_d_ptr[i0].x - data_c_ptr[i0].y * data_d_ptr[i0].y ; + float b = data_c_ptr[i0].x * data_d_ptr[i0].y + data_c_ptr[i0].y * data_d_ptr[i0].x ; + data_c_ptr[i0].x = a ; + data_c_ptr[i0].y = b ; }).wait(); SyclArray::idft_cc(data_c,data_d, 1); diff --git a/test/test_syclarray.cpp b/test/test_syclarray.cpp index 6e4f553..80108e2 100644 --- a/test/test_syclarray.cpp +++ b/test/test_syclarray.cpp @@ -18,11 +18,15 @@ int main () { a_d.copy_from( a_host ) ; //q.memcpy(a_d.data() , a_host, N*sizeof(float) ).wait() ; - auto a_d_ptr = a_d.data() ; + //auto a_d_ptr = a_d.data() ; + auto a_s = a_d.a() ; q.parallel_for(N, [=](auto i) { int ii = i.get_id() ; - if(ii < 10 ) printf( " a_d[%d]=%f \n" , ii, a_d_ptr[ii] ) ; - a_d_ptr[ii] += float(ii*ii) ; + WireCell::SyclArray::array_xf a_d_k(a_s) ; + if(ii < 10 ) printf( " a_d[%d]=%f \n" , ii, a_d_k[ii] ) ; + a_d_k(ii) += float(ii*ii) ; + // if(ii < 10 ) printf( " a_d[%d]=%f \n" , ii, a_d[ii] ) ; + //a_d[ii] += float(ii*ii) ; } ).wait() ; auto rst = a_d.to_host() ; //float * rst = (float * ) malloc (sizeof(float) * N) ; From 22c7fdbf6d1631605740a485a5d6127a6d3b8eae Mon Sep 17 00:00:00 2001 From: Zhihua Dong Date: Fri, 1 Jul 2022 13:19:34 -0400 Subject: [PATCH 2/5] Using OMPRNG https://github.com/GKNB/test-benchmark-OpenMP-RNG --- .cmake-sycl/CMakeLists.txt | 7 +++-- inc/WireCellGenSycl/syclcommon.h | 2 +- src/BinnedDiffusion_transform.cxx | 46 +++++++++++++++++++++---------- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/.cmake-sycl/CMakeLists.txt b/.cmake-sycl/CMakeLists.txt index 985d190..83184b9 100644 --- a/.cmake-sycl/CMakeLists.txt +++ b/.cmake-sycl/CMakeLists.txt @@ -26,6 +26,7 @@ add_library(WireCellGenSycl SHARED ${all_files}) target_include_directories(WireCellGenSycl PRIVATE ${PROJECT_SOURCE_DIR}/../inc + $ENV{OMPRNG} $ENV{EIGEN_INC} $ENV{JSONCPP_INC} $ENV{JSONNET_INC} @@ -33,9 +34,9 @@ target_include_directories(WireCellGenSycl $ENV{WIRECELL_INC} ) set_target_properties(WireCellGenSycl - PROPERTIES COMPILE_OPTIONS "-DEIGEN_NO_CUDA;-DEIGEN_DONT_VECTORIZE") + PROPERTIES COMPILE_OPTIONS "-DEIGEN_NO_CUDA;-DEIGEN_DONT_VECTORIZE;-DSYCL_TARGET_CUDA") -target_link_directories(WireCellGenSycl PRIVATE $ENV{JSONCPP_LIB} $ENV{WIRECELL_LIB} $ENV{ONEMKL_LIB}) -target_link_libraries(WireCellGenSycl PRIVATE jsoncpp WireCellIface WireCellUtil onemkl Boost::headers ${CUDA_CUFFT_LIBRARIES}) +target_link_directories(WireCellGenSycl PRIVATE $ENV{JSONCPP_LIB} $ENV{WIRECELL_LIB} }) +target_link_libraries(WireCellGenSycl PRIVATE jsoncpp WireCellIface WireCellUtil Boost::headers ${CUDA_CUFFT_LIBRARIES} ${CUDA_curand_LIBRARY}) add_subdirectory(test) diff --git a/inc/WireCellGenSycl/syclcommon.h b/inc/WireCellGenSycl/syclcommon.h index 0ee8ccb..d041cf3 100644 --- a/inc/WireCellGenSycl/syclcommon.h +++ b/inc/WireCellGenSycl/syclcommon.h @@ -2,7 +2,7 @@ #define GENSYCL_SYCLCOMMON_H_ #include -#define SYCL_TARGET_CUDA +//#define SYCL_TARGET_CUDA namespace WireCell::GenSycl::syclcommon { diff --git a/src/BinnedDiffusion_transform.cxx b/src/BinnedDiffusion_transform.cxx index afd5e2a..cb054ab 100644 --- a/src/BinnedDiffusion_transform.cxx +++ b/src/BinnedDiffusion_transform.cxx @@ -9,7 +9,17 @@ #include #include "WireCellGenSycl/SyclEnv.h" -#include "oneapi/mkl/rng.hpp" +//#include "oneapi/mkl/rng.hpp" + +#if defined SYCL_TARGET_CUDA +#define ARCH_CUDA +#elif defined SYCL_TARGET_HIP +#define ARCH_HIP +#else +#error "need define SYCL_TARGET_CUDA or SYCL_TARGET_HIP" +#endif + +#include "openmp_rng.h" #define MAX_PATCH_SIZE 512 #define P_BLOCK_SIZE 512 @@ -259,9 +269,13 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array size = n_wg*wg_size*2 ; SyclArray::Array1D normals(size, false) ; auto normals_ptr = normals.data() ; - // temp space to hold uniform rn - auto temp_ur = sycl::malloc_device(size, q) ; + //// temp space to hold uniform rn + //auto temp_ur = sycl::malloc_device(size, q) ; uint64_t seed = 2020; + + generator_enum gen_type = generator_enum::philox; + omp_get_rng_normal_double(normals_ptr, n_wg*wg_size, 0.0f, 1.0f, seed, gen_type); + /* oneapi::mkl::rng::philox4x32x10 engine(q, seed); //generate Uniform distribution oneapi::mkl::rng::uniform distr(0.0, 1.0); @@ -276,7 +290,9 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); } ).wait() ; - sycl::free(temp_ur, q) ; + + */ + //sycl::free(temp_ur, q) ; t1 = omp_get_wtime() ; std::cout<<"SetSample: rand :"< normals(size, false) ; auto normals_ptr = normals.data() ; // temp space to hold uniform rn - auto temp_ur = sycl::malloc_device(size, q) ; + //auto temp_ur = sycl::malloc_device(size, q) ; uint64_t seed = 2020; - oneapi::mkl::rng::philox4x32x10 engine(q, seed); + generator_enum gen_type = generator_enum::philox; + omp_get_rng_normal_double(normals_ptr, size, 0.0f, 1.0f, seed, gen_type); + //oneapi::mkl::rng::philox4x32x10 engine(q, seed); //generate Uniform distribution - oneapi::mkl::rng::uniform distr(0.0, 1.0); - oneapi::mkl::rng::generate(distr, engine, size, temp_ur); - //box muller approx to normal distribution - q.parallel_for(size/2 , [=](auto i) { - normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); - normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); - } ) ; - sycl::free(temp_ur, q) ; + // oneapi::mkl::rng::uniform distr(0.0, 1.0); + // oneapi::mkl::rng::generate(distr, engine, size, temp_ur); + // //box muller approx to normal distribution + // q.parallel_for(size/2 , [=](auto i) { + // normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); + // normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); + // } ) ; + // sycl::free(temp_ur, q) ; // decide weight calculation int weightstrat = m_calcstrat; From a073ea03270e80372c1f0e14ce33391ad358a3d3 Mon Sep 17 00:00:00 2001 From: Zhihua Dong Date: Thu, 14 Jul 2022 10:09:57 -0400 Subject: [PATCH 3/5] add amd cmake files --- .cmake-sycl-amd/CMakeLists.txt | 44 ++ .cmake-sycl-amd/test/CMakeLists.txt | 21 + inc/WireCellGenSycl/SyclArray_fftw.h | 99 +++++ inc/WireCellGenSycl/SyclArray_hip.h | 611 +++++++++++++++++++++++++++ 4 files changed, 775 insertions(+) create mode 100644 .cmake-sycl-amd/CMakeLists.txt create mode 100644 .cmake-sycl-amd/test/CMakeLists.txt create mode 100644 inc/WireCellGenSycl/SyclArray_fftw.h create mode 100644 inc/WireCellGenSycl/SyclArray_hip.h diff --git a/.cmake-sycl-amd/CMakeLists.txt b/.cmake-sycl-amd/CMakeLists.txt new file mode 100644 index 0000000..6882b38 --- /dev/null +++ b/.cmake-sycl-amd/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.1.14 FATAL_ERROR) +project(wire-cell-gen-sycl CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS Off) + + +file(GLOB all_files ${PROJECT_SOURCE_DIR}/../src/*.cxx) + + + +set(BOOST_ROOT $ENV{BOOST_DIR} ) +#set(BOOST_INCLUDEDIR $ENV{BOOST_INC}) +#set(BOOST_LIBRARYDIR $ENV{BOOST_LIB}) + +find_package(Boost REQUIRED COMPONENTS) +include_directories(Boost_INCLUDE_DIRS}) +set(Boost_USE_MULTITHREADED ON) + +find_package(hipSYCL REQUIRED) + +string(APPEND CMAKE_CXX_FLAGS "-g -O3 -fopenmp -pedantic -Wall ") +string(APPEND CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined") + +add_library(WireCellGenSycl SHARED ${all_files}) +add_sycl_to_target(TARGET WireCellGenSycl SOURCES ${all_files}) + + +target_include_directories(WireCellGenSycl + PRIVATE + ${PROJECT_SOURCE_DIR}/../inc + $ENV{HIPFFT_INC} + $ENV{EIGEN_INC} + $ENV{JSONCPP_INC} + $ENV{JSONNET_INC} + $ENV{SPDLOG_INC} + $ENV{WIRECELL_INC} +) +set_target_properties(WireCellGenSycl + PROPERTIES COMPILE_OPTIONS "-DEIGEN_NO_CUDA;-DEIGEN_DONT_VECTORIZE;-DSYCL_TARGET_HIP") +target_link_directories(WireCellGenSycl PRIVATE $ENV{JSONCPP_LIB} $ENV{WIRECELL_LIB} $ENV{ONEMKL_LIB} $ENV{HIPFFT_LIB}) +target_link_libraries(WireCellGenSycl PRIVATE jsoncpp WireCellIface WireCellUtil onemkl Boost::headers hipfft) + +add_subdirectory(test) diff --git a/.cmake-sycl-amd/test/CMakeLists.txt b/.cmake-sycl-amd/test/CMakeLists.txt new file mode 100644 index 0000000..ebda7e3 --- /dev/null +++ b/.cmake-sycl-amd/test/CMakeLists.txt @@ -0,0 +1,21 @@ +set(all_execs + test_syclarray.cpp +) + +set(source_dir ${PROJECT_SOURCE_DIR}/../test) + +foreach(source_file IN LISTS all_execs) + get_filename_component(exec_name ${source_file} NAME_WE) + add_executable(${exec_name} ${source_dir}/${source_file}) + add_sycl_to_target(TARGET ${exec_name} SOURCES ${source_file} ) + target_link_directories(${exec_name} PRIVATE $ENV{WIRECELL_LIB}) + target_link_libraries(${exec_name} PRIVATE WireCellGenSycl) + target_include_directories(${exec_name} + PRIVATE + ${PROJECT_SOURCE_DIR}/../inc + $ENV{WIRECELL_INC} + ) +set_target_properties(${exec_name} + PROPERTIES COMPILE_OPTIONS "-DSYCL_TARGET_HIP") + +endforeach() diff --git a/inc/WireCellGenSycl/SyclArray_fftw.h b/inc/WireCellGenSycl/SyclArray_fftw.h new file mode 100644 index 0000000..8f214f5 --- /dev/null +++ b/inc/WireCellGenSycl/SyclArray_fftw.h @@ -0,0 +1,99 @@ +/** + * Wrappers for FFTW based FFT + */ + +#ifndef WIRECELL_SYCLARRAY_FFTW +#define WIRECELL_KOKKOSARRAY_FFTW + +#include // tmp solution +#include //debug + +namespace WireCell { + + namespace KokkosArray { + /** + * using eigen based WireCell::Array for fast prototyping + * this may introduce extra data copying, will investigate later + * by default eigen is left layout (column major) + * https://eigen.tuxfamily.org/dox/group__TopicStorageOrders.html + * FIXME: float should be Scalar + */ + thread_local static Eigen::FFT gEigenFFT; + inline array_xc dft(const array_xf& in) + { + Eigen::Map in_eigen((float*) in.data(), in.extent(0)); + Eigen::VectorXcf out_eigen = gEigenFFT.fwd(in_eigen); + array_xc out(out_eigen.size(),false) ; + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.size()*sizeof(float) * 2); + return out; + } + inline array_xf idft(const array_xc& in) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0)); + Eigen::VectorXf out_eigen = gEigenFFT.inv(in_eigen); + array_xf out(out_eigen.size(),false) ; + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.size()*sizeof(float)); + return out; + } + inline array_xxc dft_rc(const array_xxf& in, int dim = 0) + { + std::cout << "WIRECELL_KOKKOSARRAY_FFTW" << std::endl; + Eigen::Map in_eigen((float*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::dft_rc(in_eigen, dim); + auto out = gen_2d_view(out_eigen.rows(), out_eigen.cols(), 0); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); + + return out; + } + inline array_xxc dft_cc(const array_xxc& in, int dim = 0) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::dft_cc(in_eigen, dim); + auto out = Zero(out_eigen.rows(), out_eigen.cols()); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); + + return out; + } + inline void dft_cc(const array_xxc& in, array_xxc& out, int dim = 0) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::dft_cc(in_eigen, dim); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); + + } + inline array_xxc idft_cc(const array_xxc& in, int dim = 0) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::idft_cc(in_eigen, dim); + auto out = Zero(out_eigen.rows(), out_eigen.cols()); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); + + return out; + } + inline void idft_cc(const array_xxc& in, array_xxc& out, int dim = 0) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::idft_cc(in_eigen, dim); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); + + } + inline array_xxf idft_cr(const array_xxc& in, int dim = 0) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::idft_cr(in_eigen, dim); + auto out = Zero(out_eigen.rows(), out_eigen.cols()); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar)); + + return out; + } + inline void idft_cr(const array_xxc& in, array_xxf& out, int dim = 0) + { + Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::idft_cr(in_eigen, dim); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar)); + } + + } // namespace KokkosArray +} // namespace WireCell + +#endif diff --git a/inc/WireCellGenSycl/SyclArray_hip.h b/inc/WireCellGenSycl/SyclArray_hip.h new file mode 100644 index 0000000..f140760 --- /dev/null +++ b/inc/WireCellGenSycl/SyclArray_hip.h @@ -0,0 +1,611 @@ +/** + * Wrappers for cuFFT based FFT + */ + +#ifndef WIRECELL_SYCLARRAY_CUDA +#define WIRECELL_SYCLARRAY_CUDA + +#include +#include + +namespace WireCell { + + namespace SyclArray { + inline array_xc dft(const array_xf& in) + { + Index N0 = in.extent(0); + //auto out = gen_1d_Array(N0, 0); + //auto out = array_xc(N0, false); + auto out = array_xc(N0 ); + hipfftHandle plan = NULL; + hipfftResult status ; + status = hipfftPlan1d( &plan, N0, HIPFFT_R2C, 1) ; + assert(status == HIPFFT_SUCCESS) ; + status = hipfftExecR2C( plan, (float * )in.data(), (float2 *) out.data() ) ; + assert(status == HIPFFT_SUCCESS) ; + hipfftDestroy(plan) ; + return out; + } + inline array_xf idft(const array_xc& in) + { + Index N0 = in.extent(0); + auto out = array_xf(N0); + hipfftHandle plan = NULL; + hipfftResult status ; + status = hipfftPlan1d( &plan, N0, HIPFFT_C2R, 1) ; + assert(status == HIPFFT_SUCCESS) ; + status = hipfftExecC2R( plan, (float2 * )in.data(), (float *) out.data() ) ; + assert(status == HIPFFT_SUCCESS) ; + hipfftDestroy(plan) ; + auto q = out.get_queue() ; + auto ptr = out.data() ; + q.parallel_for(sycl::range<1> (N0), [=] ( auto ii0) { auto i0 = ii0.get_id(0) ; ptr[i0] /= N0 ; } ).wait() ; + return out; + } + inline array_xxc dft_rc(const array_xxf& in, int dim = 0) + { + std::cout << "WIRECELL_SYCLARRAY_HIP" << std::endl; + Index N0 = in.extent(0); + Index N1 = in.extent(1); + auto out = array_xxc(N0, N1); + //std::cout<<"(N0,N1)="< +#define __CUDA_ARCH__ +#else +#include +#endif + #include "WireCellUtil/Pimpos.h" #include "WireCellUtil/Point.h" #include "WireCellUtil/Units.h" diff --git a/inc/WireCellGenSycl/DepoTransform.h b/inc/WireCellGenSycl/DepoTransform.h index 3387dcb..cdfee5c 100644 --- a/inc/WireCellGenSycl/DepoTransform.h +++ b/inc/WireCellGenSycl/DepoTransform.h @@ -4,6 +4,11 @@ #ifndef WIRECELL_GENSYCL_DEPOTRANSFORM #define WIRECELL_GENSYCL_DEPOTRANSFORM +#ifdef __CUDA_ARCH__ +#undef __CUDA_ARCH__ +#define HAVE_CUDA +#endif + #include "WireCellIface/IDepoFramer.h" #include "WireCellIface/IConfigurable.h" #include "WireCellIface/IRandom.h" @@ -13,6 +18,11 @@ #include "WireCellIface/IDepo.h" #include "WireCellUtil/Logging.h" +#ifdef HAVE_CUDA +#undef HAVE_CUDA +#define __CUDA_ARCH__ +#endif + #include namespace WireCell { diff --git a/inc/WireCellGenSycl/GaussianDiffusion.h b/inc/WireCellGenSycl/GaussianDiffusion.h index 32206c6..fe76822 100644 --- a/inc/WireCellGenSycl/GaussianDiffusion.h +++ b/inc/WireCellGenSycl/GaussianDiffusion.h @@ -1,11 +1,25 @@ #ifndef WIRECELL_GENSYCL_GAUSSIANDIFFUSION #define WIRECELL_GENSYCL_GAUSSIANDIFFUSION + +#ifdef __CUDA_ARCH__ +#define HAVE_CUDA +#undef __CUDA_ARCH__ +#endif + + #include "WireCellUtil/Array.h" #include "WireCellUtil/Binning.h" #include "WireCellIface/IDepo.h" #include "WireCellIface/IRandom.h" + +#ifdef HAVE_CUDA +#define __CUDA_ARCH_ +#undef HAVE_CUDA +#endif + + #include #include diff --git a/inc/WireCellGenSycl/ImpactData.h b/inc/WireCellGenSycl/ImpactData.h index 14ec04e..de8b7b0 100644 --- a/inc/WireCellGenSycl/ImpactData.h +++ b/inc/WireCellGenSycl/ImpactData.h @@ -3,9 +3,21 @@ point in space where a field response function begins. */ +#ifdef __CUDA_ARCH__ +#undef __CUDA_ARCH__ +#define HAVE_CUDA +#endif + #include "WireCellUtil/Waveform.h" + +#ifdef HAVE_CUDA +#undef HAVE_CUDA +#define __CUDA_ARCH__ +#endif + #include "WireCellGenSycl/GaussianDiffusion.h" + #include #include diff --git a/inc/WireCellGenSycl/ImpactTransform.h b/inc/WireCellGenSycl/ImpactTransform.h index 21b37e0..a6503eb 100644 --- a/inc/WireCellGenSycl/ImpactTransform.h +++ b/inc/WireCellGenSycl/ImpactTransform.h @@ -1,13 +1,25 @@ #ifndef WIRECELL_GENSYCL_IMPACTTRANSFORM #define WIRECELL_GENSYCL_IMPACTTRANSFORM +#ifdef __CUDA_ARCH__ +#define __CUDA_PASS__ +#undef __CUDA_ARCH__ +#endif + #include "WireCellIface/IPlaneImpactResponse.h" -#include "WireCellGenSycl/BinnedDiffusion_transform.h" #include "WireCellUtil/Array.h" #include "WireCellUtil/Logging.h" - #include +#ifdef __CUDA_PASS__ +#undef __cuda_PASS__ +#define __CUDA_ARCH__ +#endif + + + +#include "WireCellGenSycl/BinnedDiffusion_transform.h" + namespace WireCell { namespace GenSycl { diff --git a/inc/WireCellGenSycl/SyclArray.h b/inc/WireCellGenSycl/SyclArray.h index 5ebe544..771d399 100644 --- a/inc/WireCellGenSycl/SyclArray.h +++ b/inc/WireCellGenSycl/SyclArray.h @@ -418,19 +418,12 @@ namespace WireCell { } // namespace KokkosArray } // namespace WireCell -/* -#if defined ENABLE_CUDA - #include "WireCellGenKokkos/SyclArray_cuda.h" -#elif defined ENABLE_HIP - #include "WireCellGenKokkos/SyclArray_hip.h" +#if defined(SYCL_TARGET_CUDA) + #include "WireCellGenSycl/SyclArray_cuda.h" +#elif defined(SYCL_TARGET_HIP) + #include "WireCellGenSycl/SyclArray_hip.h" #else - #include "WireCellGenKokkos/SyclArray_fftw.h" + #include "WireCellGenSycl/SyclArray_fftw.h" #endif -} // namespace SyclArray -} // namespace WireCell - -*/ -#include "WireCellGenSycl/SyclArray_cuda.h" - #endif diff --git a/inc/WireCellGenSycl/SyclArray_fftw.h b/inc/WireCellGenSycl/SyclArray_fftw.h index 8f214f5..ef4db70 100644 --- a/inc/WireCellGenSycl/SyclArray_fftw.h +++ b/inc/WireCellGenSycl/SyclArray_fftw.h @@ -3,14 +3,14 @@ */ #ifndef WIRECELL_SYCLARRAY_FFTW -#define WIRECELL_KOKKOSARRAY_FFTW +#define WIRECELL_SYCLARRAY_FFTW #include // tmp solution #include //debug namespace WireCell { - namespace KokkosArray { + namespace SyclArray { /** * using eigen based WireCell::Array for fast prototyping * this may introduce extra data copying, will investigate later @@ -23,7 +23,7 @@ namespace WireCell { { Eigen::Map in_eigen((float*) in.data(), in.extent(0)); Eigen::VectorXcf out_eigen = gEigenFFT.fwd(in_eigen); - array_xc out(out_eigen.size(),false) ; + array_xc out(out_eigen.size()) ; memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.size()*sizeof(float) * 2); return out; } @@ -31,20 +31,31 @@ namespace WireCell { { Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0)); Eigen::VectorXf out_eigen = gEigenFFT.inv(in_eigen); - array_xf out(out_eigen.size(),false) ; + array_xf out(out_eigen.size()) ; memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.size()*sizeof(float)); return out; } inline array_xxc dft_rc(const array_xxf& in, int dim = 0) { - std::cout << "WIRECELL_KOKKOSARRAY_FFTW" << std::endl; + std::cout << "WIRECELL_SYCLARRAY_FFTW" << std::endl; Eigen::Map in_eigen((float*) in.data(), in.extent(0), in.extent(1)); auto out_eigen = WireCell::Array::dft_rc(in_eigen, dim); - auto out = gen_2d_view(out_eigen.rows(), out_eigen.cols(), 0); + //auto out = gen_2d_Array(out_eigen.rows(), out_eigen.cols(), 0); + auto out = array_xxc(out_eigen.rows(), out_eigen.cols()); memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); return out; } + inline void dft_rc(const array_xxf& in, array_xxc& out,int dim = 0) + { + std::cout << "WIRECELL_SYCLARRAY_FFTW" << std::endl; + Eigen::Map in_eigen((float*) in.data(), in.extent(0), in.extent(1)); + auto out_eigen = WireCell::Array::dft_rc(in_eigen, dim); + //auto out = gen_2d_Array(out_eigen.rows(), out_eigen.cols(), 0); + //auto out = array_xxc(out_eigen.rows(), out_eigen.cols()); + memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar) * 2); + + } inline array_xxc dft_cc(const array_xxc& in, int dim = 0) { Eigen::Map in_eigen((std::complex*) in.data(), in.extent(0), in.extent(1)); @@ -93,7 +104,7 @@ namespace WireCell { memcpy( (void*)out.data(), (void*)out_eigen.data(), out_eigen.rows()*out_eigen.cols()*sizeof(Scalar)); } - } // namespace KokkosArray + } // namespace SyclArray } // namespace WireCell #endif diff --git a/inc/WireCellGenSycl/SyclArray_hip.h b/inc/WireCellGenSycl/SyclArray_hip.h index f140760..c8af9af 100644 --- a/inc/WireCellGenSycl/SyclArray_hip.h +++ b/inc/WireCellGenSycl/SyclArray_hip.h @@ -2,10 +2,22 @@ * Wrappers for cuFFT based FFT */ -#ifndef WIRECELL_SYCLARRAY_CUDA -#define WIRECELL_SYCLARRAY_CUDA +#ifndef WIRECELL_SYCLARRAY_HIP +#define WIRECELL_SYCLARRAY_HIP + +#ifdef __CUDA_ARCH__ +#define __CUDA_PASS__ +#undef __CUDA_ARCH__ +#endif #include + +#ifdef __CUDA_PASS__ +#undef __CUDA_PASS__ +#define __CUDA_ARCH__ +#endif + + #include namespace WireCell { diff --git a/src/BinnedDiffusion_transform.cxx b/src/BinnedDiffusion_transform.cxx index cb054ab..b166463 100644 --- a/src/BinnedDiffusion_transform.cxx +++ b/src/BinnedDiffusion_transform.cxx @@ -11,30 +11,34 @@ #include "WireCellGenSycl/SyclEnv.h" //#include "oneapi/mkl/rng.hpp" + +//This part if need for OMP_RNG #if defined SYCL_TARGET_CUDA #define ARCH_CUDA #elif defined SYCL_TARGET_HIP #define ARCH_HIP #else -#error "need define SYCL_TARGET_CUDA or SYCL_TARGET_HIP" +#define USE_RANDOM123 #endif + + #include "openmp_rng.h" -#define MAX_PATCH_SIZE 512 -#define P_BLOCK_SIZE 512 -#define MAX_PATCHES 50000 -#define MAX_NPSS_DEVICE 1000 -#define MAX_NTSS_DEVICE 1000 -#define FULL_MASK 0xffffffff -#define RANDOM_BLOCK_SIZE (1024*1024) -#define RANDOM_BLOCK_NUM 512 +//#define MAX_PATCH_SIZE 512 +//#define P_BLOCK_SIZE 512 +//#define MAX_PATCHES 50000 +//#define MAX_NPSS_DEVICE 1000 +//#define MAX_NTSS_DEVICE 1000 +//#define FULL_MASK 0xffffffff +//#define RANDOM_BLOCK_SIZE (1024*1024) +//#define RANDOM_BLOCK_NUM 512 //#define MAX_RANDOM_LENGTH (RANDOM_BLOCK_NUM*RANDOM_BLOCK_SIZE) -#define MAX_RANDOM_LENGTH (MAX_PATCH_SIZE*MAX_PATCHES) +//#define MAX_RANDOM_LENGTH (MAX_PATCH_SIZE*MAX_PATCHES) #define PI 3.14159265358979323846 -#define MAX_P_SIZE 50 -#define MAX_T_SIZE 50 +#define MAX_P_SIZE 100 +#define MAX_T_SIZE 100 @@ -194,6 +198,7 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array db_vt tvecs_d(npatches * MAX_T_SIZE,false); db_vt qweights_d(npatches * MAX_P_SIZE,false); + t0 = omp_get_wtime() ; std::cout<<"SetSample: View creation:"< normals(size, false) ; + std::cout<<"rand number size: " << size << std::endl ; auto normals_ptr = normals.data() ; - //// temp space to hold uniform rn - //auto temp_ur = sycl::malloc_device(size, q) ; uint64_t seed = 2020; generator_enum gen_type = generator_enum::philox; - omp_get_rng_normal_double(normals_ptr, n_wg*wg_size, 0.0f, 1.0f, seed, gen_type); - /* - oneapi::mkl::rng::philox4x32x10 engine(q, seed); - //generate Uniform distribution - oneapi::mkl::rng::uniform distr(0.0, 1.0); - oneapi::mkl::rng::generate(distr, engine, size, temp_ur); - //box muller approx to normal distribution - //q.parallel_for(size/2 , [=](auto i) { - // normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); - // normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); - // } ).wait() ; - q.parallel_for(sycl::nd_range<1>(size/2, wg_size) , [=](sycl::nd_item<1> item) { - auto i = item.get_global_linear_id() ; - normals_ptr[2*i] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::cos(2*PI*temp_ur[i+1]); - normals_ptr[2*i + 1] = sqrt(-2*sycl::log(temp_ur[i])) * sycl::sin(2*PI*temp_ur[i+1]); - } ).wait() ; - - */ - //sycl::free(temp_ur, q) ; + omp_get_rng_normal_double(normals_ptr, size, 0.0f, 1.0f, seed, gen_type); t1 = omp_get_wtime() ; std::cout<<"SetSample: rand :"< g ) { - int ip = g.get_group_id() ; + //int ip = g.get_group_id() ; + int ip = g.get_id() ; const double sqrt2 = sqrt(2.0); int np = np_d_ptr[ip] ; int nt = nt_d_ptr[ip]; @@ -369,13 +371,13 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix_sycl(SyclArray::array } } ) ; }); + q.wait() ; t0 = omp_get_wtime() ; std::cout<<"SetSample: Pvec,Tvec,Weight :"<* >& vec_spmatrix, std::vector& vec_impact){ const auto ib = m_pimpos.impact_binning(); @@ -563,6 +571,7 @@ void GenSycl::BinnedDiffusion_transform::get_charge_matrix(std::vector > >& vec_vec_charge, std::vector& vec_impact){ @@ -723,7 +732,14 @@ void GenSycl::BinnedDiffusion_transform::get_charge_vec(std::vector(size, q) ; uint64_t seed = 2020; +//#if defined(SYCL_TARGET_CUDA) || defined(SYCL_TARGET_HIP) +//#warning: SYCL_TARGET_CUDA || SYCL_TARGET_HIP generator_enum gen_type = generator_enum::philox; +//#else +//#warning: host-target +// generator_enum gen_type = generator_enum::mt19937 ; +//#endif + omp_get_rng_normal_double(normals_ptr, size, 0.0f, 1.0f, seed, gen_type); //oneapi::mkl::rng::philox4x32x10 engine(q, seed); //generate Uniform distribution @@ -748,7 +764,8 @@ void GenSycl::BinnedDiffusion_transform::get_charge_vec(std::vector g ) { - int ip = g.get_group_id() ; + //int ip = g.get_group_id() ; + int ip = g.get_id() ; const double sqrt2 = sqrt(2.0); int np = np_d_ptr[ip] ; int nt = nt_d_ptr[ip]; @@ -943,7 +960,7 @@ void GenSycl::BinnedDiffusion_transform::get_charge_vec(std::vector(t) << ", " << get<1>(t) << ", " << get<2>(t) << "\n"; + // std::cout << get<0>(t) << ", " << get<1>(t) << ", " << get<2>(t) << "\n"; // } // } //wend = omp_get_wtime(); @@ -989,7 +1006,10 @@ void GenSycl::BinnedDiffusion_transform::set_sampling_bat(const unsigned long np auto patch_d_ptr = patch_d.data() ; auto normals_ptr = normals.data() ; - q.parallel_for(sycl::nd_range<1>(npatches*wg_size, wg_size) , [=]( sycl::nd_item<1> item ) { + + q.submit( [&] ( sycl::handler &h ) { +// sycl::stream out(npatches*wg_size*2048,2048, h ) ; + h.parallel_for(sycl::nd_range<1>(npatches*wg_size, wg_size) , [=]( sycl::nd_item<1> item ) { auto wg = item.get_group() ; int ip = item.get_group_linear_id() ; int id = item.get_local_id(0) ; @@ -1003,34 +1023,41 @@ void GenSycl::BinnedDiffusion_transform::set_sampling_bat(const unsigned long np double gsum = 0.0 ; double lsum = 0.0 ; + double charge=gdata_ptr[ip].charge ; + double charge_abs = abs(charge) ; + + float p_v =0.0 ; for ( int it = 0 ; it < n_it ; it ++ ) { int ii = it * wg_size + id ; double v = ii < patch_size ? pvecs_d_ptr[ip*MAX_P_SIZE+ ii%np]*tvecs_d_ptr[ip * MAX_T_SIZE + ii/np] : 0.0 ; - lsum += v; - if( ii < patch_size) patch_d_ptr[ii + p0 ] = (float) v ; + if( ii()); + double r =charge/gsum ; + - double charge=gdata_ptr[ip].charge ; - double charge_abs = abs(charge) ; // normalize to total = charge ; for ( int it = 0 ; it < n_it ; it ++ ) { int ii = it * wg_size + id ; - if (ii < patch_size ) patch_d_ptr[ii+p0] *= float(charge/gsum) ; + if (ii < patch_size ) { + patch_d_ptr[ii + p0 ] *= (float)r ; + } } - if( fl ) { int n=(int) charge_abs; - gsum =0.0 ; + gsum =0.0 ; lsum =0.0 ; for ( int it = 0 ; it < n_it ; it ++ ) { int ii = it * wg_size + id ; if( ii < patch_size ) { - double p = patch_d_ptr[ii+p0]/charge ; - double q = 1-p ; + double p = (double)p_v/charge ; //normalized to total 1 + //if( p <0 || p>1.0 ) out<<"sqrt-: "<()); if( fl ) { for ( int it = 0 ; it < n_it ; it ++ ) { @@ -1047,8 +1075,10 @@ void GenSycl::BinnedDiffusion_transform::set_sampling_bat(const unsigned long np } } - } ) ; + } ) ; + } ) ; + q.wait() ; } diff --git a/src/ImpactTransform.cxx b/src/ImpactTransform.cxx index 56d6225..1057552 100644 --- a/src/ImpactTransform.cxx +++ b/src/ImpactTransform.cxx @@ -429,6 +429,8 @@ bool GenSycl::ImpactTransform::transform_matrix() f_data.free() ; q.wait(); SyclArray::array_xxc data_c(data_d.extent(0), data_d.extent(1) ) ; + + SyclArray::dft_cc(data_d, data_c, 1); auto data_c_ptr = data_c.data() ; @@ -448,14 +450,19 @@ bool GenSycl::ImpactTransform::transform_matrix() SyclArray::idft_cc(data_c,data_d, 1); data_c.free() ; +// std::cout<<"Size: data_d: " << data_d.extent(0)<<","<< data_d.extent(1)<<" ; acc_data_f_w: "<< acc_data_f_w.extent(0) <<";" << acc_data_f_w.extent(1) < acc_data_t_w_eigen((float*) acc_data_t_w_h, acc_data_t_w.extent(0), acc_data_t_w.extent(1)); m_decon_data = acc_data_t_w_eigen; // FIXME: reduce this copy - std::cout << "mdeconn_data: row, colum" << m_decon_data.rows()<<","< Date: Mon, 22 Aug 2022 17:34:01 -0400 Subject: [PATCH 5/5] Fix bug in set_sampling_batch kernel --- src/BinnedDiffusion_transform.cxx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/BinnedDiffusion_transform.cxx b/src/BinnedDiffusion_transform.cxx index b166463..c16b693 100644 --- a/src/BinnedDiffusion_transform.cxx +++ b/src/BinnedDiffusion_transform.cxx @@ -1026,7 +1026,6 @@ void GenSycl::BinnedDiffusion_transform::set_sampling_bat(const unsigned long np double charge=gdata_ptr[ip].charge ; double charge_abs = abs(charge) ; - float p_v =0.0 ; for ( int it = 0 ; it < n_it ; it ++ ) { int ii = it * wg_size + id ; @@ -1055,7 +1054,7 @@ void GenSycl::BinnedDiffusion_transform::set_sampling_bat(const unsigned long np for ( int it = 0 ; it < n_it ; it ++ ) { int ii = it * wg_size + id ; if( ii < patch_size ) { - double p = (double)p_v/charge ; //normalized to total 1 + double p = (double)patch_d_ptr[ii + p0 ]/charge ; //normalized to total 1 //if( p <0 || p>1.0 ) out<<"sqrt-: "<