diff --git a/CMakeLists.txt b/CMakeLists.txt index 2537b556a..e83478314 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,12 +165,6 @@ else (ENABLE_SSE3) set(SSE3_MACRO "0") endif (ENABLE_SSE3) -if (QBCAPPOW GREATER 6) - set(QRACK_CUDA_COMPILE_OPTS -O3 -use_fast_math -Xcompiler -fpermissive --ptxas-options -O3,) -else (QBCAPPOW GREATER 6) - set(QRACK_CUDA_COMPILE_OPTS -O3 -use_fast_math -Werror all-warnings --ptxas-options -O3,) -endif (QBCAPPOW GREATER 6) - if (MSVC) if (CPP_STD GREATER_EQUAL 23) set(QRACK_CPP_STD_OPT /std:c++23) @@ -197,6 +191,12 @@ else (MSVC) endif () endif (MSVC) +if (QBCAPPOW GREATER 6) + set(QRACK_CUDA_COMPILE_OPTS -O3 -use_fast_math -Xcompiler -fpermissive ${QRACK_CPP_STD_OPT} --ptxas-options -O3,) +else (QBCAPPOW GREATER 6) + set(QRACK_CUDA_COMPILE_OPTS -O3 -use_fast_math -Werror all-warnings ${QRACK_CPP_STD_OPT} --ptxas-options -O3,) +endif (QBCAPPOW GREATER 6) + if (MSVC) set(QRACK_COMPILE_OPTS ${QRACK_CPP_STD_OPT} /Wall) set(TEST_COMPILE_OPTS ${QRACK_CPP_STD_OPT} /Wall) diff --git a/src/common/qengine.cl b/src/common/qengine.cl index 26cac73db..7ecb6cf86 100644 --- a/src/common/qengine.cl +++ b/src/common/qengine.cl @@ -588,7 +588,9 @@ void kernel decomposeprob(global cmplx* stateVec, constant bitCapIntOcl4* bitCap const cmplx amp = stateVec[j | (k << start)]; const real1 nrm = dot(amp, amp); partProb += nrm; - partStateAngle[k] += arg(amp) * nrm; + if (nrm > REAL1_EPSILON) { + partStateAngle[k] += arg(amp) * nrm; + } } remainderStateProb[lcv] = partProb; @@ -606,7 +608,9 @@ void kernel decomposeprob(global cmplx* stateVec, constant bitCapIntOcl4* bitCap const cmplx amp = stateVec[l]; const real1 nrm = dot(amp, amp); partProb += nrm; - remainderStateAngle[k] += arg(amp) * nrm; + if (nrm > REAL1_EPSILON) { + remainderStateAngle[k] += arg(amp) * nrm; + } } if (partProb > REAL1_EPSILON) { @@ -667,7 +671,10 @@ void kernel disposeprob(global cmplx* stateVec, constant bitCapIntOcl4* bitCapIn l |= j | ((k ^ l) << len); const cmplx amp = stateVec[l]; - remainderStateAngle[k] += arg(amp) * dot(amp, amp); + const real1 nrm = dot(amp, amp); + if (nrm > REAL1_EPSILON) { + remainderStateAngle[k] += arg(amp) * nrm; + } } } diff --git a/src/common/qengine.cu b/src/common/qengine.cu index c42589014..cfa1f37a4 100644 --- a/src/common/qengine.cu +++ b/src/common/qengine.cu @@ -677,7 +677,9 @@ __global__ void decomposeprob(qCudaCmplx* stateVec, bitCapIntOcl* bitCapIntOclPt const qCudaCmplx amp = stateVec[j | (k << start)]; const qCudaReal1_f nrm = (qCudaReal1_f)qCudaDot(amp, amp); partProb += nrm; - partStateAngle[k] += qCudaArg(amp) * (qCudaReal1)nrm; + if (nrm > REAL1_EPSILON_CUDA) { + partStateAngle[k] += qCudaArg(amp) * (qCudaReal1)nrm; + } } remainderStateProb[lcv] = partProb; @@ -695,7 +697,9 @@ __global__ void decomposeprob(qCudaCmplx* stateVec, bitCapIntOcl* bitCapIntOclPt const qCudaCmplx amp = stateVec[l]; const qCudaReal1_f nrm = (qCudaReal1_f)qCudaDot(amp, amp); partProb += nrm; - remainderStateAngle[k] += qCudaArg(amp) * (qCudaReal1)nrm; + if (nrm > REAL1_EPSILON_CUDA) { + remainderStateAngle[k] += qCudaArg(amp) * (qCudaReal1)nrm; + } } if (partProb > REAL1_EPSILON_CUDA) { @@ -734,8 +738,6 @@ __global__ void disposeprob(qCudaCmplx* stateVec, bitCapIntOcl* bitCapIntOclPtr, const bitLenInt start = (bitLenInt)bitCapIntOclPtr[2]; const bitCapIntOcl startMask = (1U << start) - 1U; const bitLenInt len = bitCapIntOclPtr[3]; - const qCudaReal1_f angleThresh = -8 * PI_R1_CUDA; - const qCudaReal1_f initAngle = -16 * PI_R1_CUDA; for (bitCapIntOcl lcv = ID; lcv < remainderPower; lcv += Nthreads) { bitCapIntOcl j = lcv & startMask; @@ -755,15 +757,15 @@ __global__ void disposeprob(qCudaCmplx* stateVec, bitCapIntOcl* bitCapIntOclPtr, for (bitCapIntOcl lcv = ID; lcv < partPower; lcv += Nthreads) { const bitCapIntOcl j = lcv << start; - qCudaReal1_f firstAngle = initAngle; - for (bitCapIntOcl k = 0U; k < remainderPower; ++k) { bitCapIntOcl l = k & startMask; l |= j | ((k ^ l) << len); const qCudaCmplx amp = stateVec[l]; const qCudaReal1_f nrm = (qCudaReal1_f)qCudaDot(amp, amp); - remainderStateAngle[k] += qCudaArg(amp) * (qCudaReal1)nrm; + if (nrm > REAL1_EPSILON_CUDA) { + remainderStateAngle[k] += qCudaArg(amp) * (qCudaReal1)nrm; + } } } diff --git a/src/qengine/state.cpp b/src/qengine/state.cpp index bbca14d7c..b89c81bb7 100644 --- a/src/qengine/state.cpp +++ b/src/qengine/state.cpp @@ -1183,7 +1183,9 @@ void QEngineCPU::DecomposeDispose(bitLenInt start, bitLenInt length, QEngineCPUP const complex amp = stateVec->read(j | (k << start)); const real1 nrm = norm(amp); remainderStateProb[lcv] += nrm; - partStateAngle[k] += arg(amp) * nrm; + if (nrm > REAL1_EPSILON) { + partStateAngle[k] += arg(amp) * nrm; + } } }); @@ -1198,7 +1200,9 @@ void QEngineCPU::DecomposeDispose(bitLenInt start, bitLenInt length, QEngineCPUP const complex amp = stateVec->read(l); const real1 nrm = norm(amp); partStateProb[lcv] += nrm; - remainderStateAngle[k] += arg(amp) * nrm; + if (nrm > REAL1_EPSILON) { + remainderStateAngle[k] += arg(amp) * nrm; + } } const real1 prob = partStateProb[lcv]; @@ -1231,7 +1235,10 @@ void QEngineCPU::DecomposeDispose(bitLenInt start, bitLenInt length, QEngineCPUP l |= j | ((k ^ l) << length); const complex amp = stateVec->read(l); - remainderStateAngle[k] += arg(amp) * norm(amp); + const real1 nrm = norm(amp); + if (nrm > REAL1_EPSILON) { + remainderStateAngle[k] += arg(amp) * nrm; + } } }); par_for(0U, remainderPower, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {