Skip to content

Commit

Permalink
Various GPU bug fixes.
Browse files Browse the repository at this point in the history
  • Loading branch information
himanshugoel2797 committed Jan 15, 2024
1 parent 02637c5 commit 3cfb632
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 22 deletions.
2 changes: 1 addition & 1 deletion cpp/gcc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ CUDA_LIBS = -L$(CUDA_PATH)/lib64 -L$(CUDA_MATHLIBS_PATH)/lib64

SRW_SRC_DEF += -D_OFFLOAD_GPU -DUSE_CUDA -D_FFTW3
SRW_INCLUDES += $(CUDA_INCLUDES)
SRW_CFLAGS += -std=c++17
#SRW_CFLAGS += -std=c++17 #HG01012024
LDFLAGS += $(CUDA_LIBS) -lcudart_static -lcudadevrt -lcufft -lrt
NVCFLAGS = -O3 -arch=sm_80 -dlto -rdc=true
CUDA_OBJ=gmfft_gpu.o srradstr_gpu.o sroptelm_gpu.o sroptdrf_gpu.o sroptgtr_gpu.o srradmnp_gpu.o
Expand Down
5 changes: 4 additions & 1 deletion cpp/py/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
if 'MODE' in os.environ:
sMode = str(os.environ['MODE'])
if sMode == 'cuda': # HG30112023
ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno']})
ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno', '-D_OFFLOAD_GPU']})
ext_kwargs['define_macros'].clear() #HG13012024
ext_kwargs['include_dirs'].append('{0}/include'.format(os.environ['CUDA_PATH'])) #HG13012024
ext_kwargs['include_dirs'].append('{0}/include'.format(os.environ['CUDA_MATHLIBS_PATH'])) #HG13012024
ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_PATH']))
ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_MATHLIBS_PATH']))
elif sMode == 'omp':
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/core/sroptel2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAcces

//if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE, pBuf)) return result; //from derived classes
//OC01102019 (restored)
if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE)) return result; //from derived classes
//if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE)) return result; //from derived classes
if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE, pvGPU)) return result; //from derived classes //HG12012024

if(pRadDataSingleE != pRadAccessData)
{
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/core/sroptelm.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ class srTGenOptElem : public CGenObject {
void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023
//void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&);
#ifdef _OFFLOAD_GPU //HG01122023
void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU);
void MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU); //HG13012024 Remove 'srTGenOptElem::' at the beginning of the function name to comply with the C++ standard
#endif

int SetupWfrEdgeCorrData1D(srTRadSect1D*, float*, float*, srTDataPtrsForWfrEdgeCorr1D&);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/core/sroptgtr.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ class srTGenTransmission : public srTFocusingElem {
{
//if(ParPrecWfrPropag.AnalTreatment == 1)
//{// Treating linear terms analytically
pRadAccessData->CheckAndSubtractPhaseTermsLin(TransvCenPoint.x, TransvCenPoint.y);
//pRadAccessData->CheckAndSubtractPhaseTermsLin(TransvCenPoint.x, TransvCenPoint.y);
pRadAccessData->CheckAndSubtractPhaseTermsLin(TransvCenPoint.x, TransvCenPoint.y, pvGPU); //HG12012024
//}

char &MethNo = ParPrecWfrPropag.MethNo;
Expand All @@ -99,7 +100,8 @@ class srTGenTransmission : public srTFocusingElem {

//if(ParPrecWfrPropag.AnalTreatment == 1)
//{// Treating linear terms analytically
if(!ParPrecWfrPropag.DoNotResetAnalTreatTermsAfterProp) pRadAccessData->CheckAndResetPhaseTermsLin();
//if(!ParPrecWfrPropag.DoNotResetAnalTreatTermsAfterProp) pRadAccessData->CheckAndResetPhaseTermsLin();
if(!ParPrecWfrPropag.DoNotResetAnalTreatTermsAfterProp) pRadAccessData->CheckAndResetPhaseTermsLin(pvGPU); //HG12012024
//}

return result;
Expand Down
77 changes: 67 additions & 10 deletions cpp/src/core/srradmnp_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,33 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtra

int Int_or_ReE = RadExtract.Int_or_Phase;
if (Int_or_ReE == 7) Int_or_ReE = 0; //OC150813: time/phot. energy integrated single-e intensity requires "normal" intensity here

if (Int_or_ReE != 2) //HG13012024 Fixed bug: Output array was not allocated properly
{
if (allStokesReq)
{
RadExtract.pExtractedData = (float*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedData, 4*RadAccessData.nx*RadAccessData.nz*sizeof(float), true);
//CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedData);
}
else
{
RadExtract.pExtractedData = (float*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedData, RadAccessData.nx*RadAccessData.nz*sizeof(float), true);
//CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedData);
}
}
else
{
if (allStokesReq)
{
RadExtract.pExtractedDataD = (double*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedDataD, 4*RadAccessData.nx*RadAccessData.nz*sizeof(double), true);
//CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedDataD);
}
else
{
RadExtract.pExtractedDataD = (double*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedDataD, RadAccessData.nx*RadAccessData.nz*sizeof(double), true);
//CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedDataD);
}
}

if (allStokesReq)
if (intOverEnIsRequired)
Expand All @@ -251,23 +278,53 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtra
else
ExtractSingleElecIntensity2DvsXZ_GPUSub<false, false> (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE);

CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true);
CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true);
CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false);
CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false);
if(Int_or_ReE != 2) //HG13012024 Fixed bug: Output array was not allocated properly
{
if(RadExtract.pExtractedData != NULL)
CAuxGPU::MarkUpdated(pGPU, RadExtract.pExtractedData, true, false);
}
else
{
if(RadExtract.pExtractedDataD != NULL)
CAuxGPU::MarkUpdated(pGPU, RadExtract.pExtractedDataD, true, false);
}

#ifndef _DEBUG
if (RadAccessData.pBaseRadX != NULL)
RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX);
if (RadAccessData.pBaseRadZ != NULL)
RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ);
if(Int_or_ReE != 2)
{
if (RadExtract.pExtractedData != NULL)
RadExtract.pExtractedData = (float*)CAuxGPU::GetHostPtr(pGPU, RadExtract.pExtractedData);
}
else
{
if (RadExtract.pExtractedDataD != NULL)
RadExtract.pExtractedDataD = (double*)CAuxGPU::GetHostPtr(pGPU, RadExtract.pExtractedDataD);
}
#endif

#ifdef _DEBUG
if(Int_or_ReE != 2)
{
if (RadExtract.pExtractedData != NULL)
RadExtract.pExtractedData = (float*)CAuxGPU::ToHostAndFree(pGPU, RadExtract.pExtractedData, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float));
}
else
{
if (RadExtract.pExtractedDataD != NULL)
RadExtract.pExtractedDataD = (double*)CAuxGPU::ToHostAndFree(pGPU, RadExtract.pExtractedDataD, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(double));
}
#endif

CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true);
CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true);
//CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false);
//CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false);
if (RadAccessData.pBaseRadX != NULL)
RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float));
RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float), true); //HG13012024 Original wavefront data does not need to be copied back to CPU
if (RadAccessData.pBaseRadZ != NULL)
RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float));
RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float), true); //HG13012024 Original wavefront data does not need to be copied back to CPU

#ifdef _DEBUG
cudaStreamSynchronize(0);
auto err = cudaGetLastError();
printf("%s\r\n", cudaGetErrorString(err));
Expand Down
12 changes: 8 additions & 4 deletions cpp/src/core/srradstr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2607,7 +2607,8 @@ void srTSRWRadStructAccessData::AddStokesAtPoint(srTEXZ& EXZ, float* pStokesVal)

//*************************************************************************

void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, double newZc)
//void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, double newZc)
void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, double newZc, void *pvGPU) //HG12012024
{
const double ratAllowSubtract = 0.2;
const double minNumOptCycles = 10;
Expand Down Expand Up @@ -2666,12 +2667,14 @@ void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, doub

if((xMult == 0) && (zMult == 0)) return;

MultiplyElFieldByPhaseLin(xMult, zMult);
//MultiplyElFieldByPhaseLin(xMult, zMult);
MultiplyElFieldByPhaseLin(xMult, zMult, pvGPU); //HG12012024
}

//*************************************************************************

void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin()
//void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin()
void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin(void* pvGPU) //HG12012024
{
if((!m_xLinOnlyPhaseTermWasSubtracted) && (!m_zLinOnlyPhaseTermWasSubtracted)) return;

Expand All @@ -2695,7 +2698,8 @@ void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin()

if((xMult == 0) && (zMult == 0)) return;

MultiplyElFieldByPhaseLin(xMult, zMult);
//MultiplyElFieldByPhaseLin(xMult, zMult);
MultiplyElFieldByPhaseLin(xMult, zMult, pvGPU); //HG12012024
}

//*************************************************************************
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/core/srradstr.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,10 @@ class srTSRWRadStructAccessData : public CGenObject {
int FindAverageDistanceToSource(srTTrjDat& TrjDat, double& Robs, double& RobsAbsErr, double& xElAtYsrc, double& zElAtYsrc, double* precPar);
void AddStokesAtPoint(srTEXZ& EXZ, float* pStokesVal);

void CheckAndSubtractPhaseTermsLin(double newXc, double newZc);
void CheckAndResetPhaseTermsLin();
//void CheckAndSubtractPhaseTermsLin(double newXc, double newZc);
void CheckAndSubtractPhaseTermsLin(double newXc, double newZc, void* pvGPU=0); //HG12012024
//void CheckAndResetPhaseTermsLin();
void CheckAndResetPhaseTermsLin(void* pvGPU=0); //HG12012024
void EstimateOversamplingFactors(double& estimOverSampX, double& estimOverSampZ);

void MirrorFieldData(int sx, int sz, void* pvGPU=0); //OC28072023
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/ext/genmath/gmfft.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ class CGenMathFFT2D : public CGenMathFFT {
#endif
}

#ifdef _OFFLOAD_GPU //HG13012024
~CGenMathFFT2D()
{
if(Plan2DFFT_cu != 0) cufftDestroy(Plan2DFFT_cu);
if(dPlan2DFFT_cu != 0) cufftDestroy(dPlan2DFFT_cu);
}
#endif

//int Make2DFFT(CGenMathFFT2DInfo&);
//Modification by S.Yakubov for parallelizing SRW via OpenMP:
#ifdef _FFTW3 //28012019
Expand Down Expand Up @@ -604,6 +612,14 @@ class CGenMathFFT1D : public CGenMathFFT {
#endif
}

#ifdef _OFFLOAD_GPU //HG13012024
~CGenMathFFT1D()
{
if(Plan1DFFT_cu != 0) cufftDestroy(Plan1DFFT_cu);
if(dPlan1DFFT_cu != 0) cufftDestroy(dPlan1DFFT_cu);
}
#endif

int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023
int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023

Expand Down

0 comments on commit 3cfb632

Please sign in to comment.