From bc4680fb12bdd9d6c6e442760f29bc88aa3115fe Mon Sep 17 00:00:00 2001 From: Lennart ten Wolde Date: Sun, 11 Feb 2018 00:18:43 +0100 Subject: [PATCH 1/4] experimental allium implementation --- algos.h | 2 + allium.cu | 532 ++++++++++++++++++++++++++++++++++++++++ allium0.cu | 298 ++++++++++++++++++++++ bench.cpp | 1 + ccminer.cpp | 6 + ccminer.vcxproj | 3 +- ccminer.vcxproj.filters | 5 +- miner.h | 3 + util.cpp | 3 + 9 files changed, 851 insertions(+), 2 deletions(-) create mode 100644 allium.cu create mode 100644 allium0.cu diff --git a/algos.h b/algos.h index c77b5e829a..f12a03a10f 100644 --- a/algos.h +++ b/algos.h @@ -30,6 +30,7 @@ enum sha_algos { ALGO_LBRY, ALGO_LUFFA, ALGO_LYRA2, + ALGO_ALLIUM, ALGO_LYRA2v2, ALGO_LYRA2Z, ALGO_MJOLLNIR, /* Hefty hash */ @@ -99,6 +100,7 @@ static const char *algo_names[] = { "lbry", "luffa", "lyra2", + "allium", "lyra2v2", "lyra2z", "mjollnir", diff --git a/allium.cu b/allium.cu new file mode 100644 index 0000000000..ce719dd4c9 --- /dev/null +++ b/allium.cu @@ -0,0 +1,532 @@ +extern "C" { +//#include +#include "lyra2/Lyra2.h" +} + +#include +#include +//#include +#include + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti); + +//extern void blake2s_setBlock(const uint32_t* input, const uint32_t ptarget7); +//extern void blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce); +//extern void blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t ptarget7); + +// Blake2s + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + +#include "cuda_helper.h" + +#ifdef __CUDA_ARCH__ + +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} + +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} + +#else +#define ROR8(u) (u >> 8) +#define ROL16(u) (u << 16) +#endif + +__device__ __forceinline__ +uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b), "r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA +#else + result = a^b^c; +#endif + return result; +} + +static const uint32_t blake2s_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + + +#define TPB 1024 +#define NPT 256 +#define maxResults 16 +#define NBN 1 +__constant__ uint32_t _ALIGN(32) midstate[20]; + +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = SPH_ROTR32(d ^ a, 16); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = SPH_ROTR32(d ^ a, 8); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[0],v[4],v[ 8],v[12]); \ + G(r,1,v[1],v[5],v[ 9],v[13]); \ + G(r,2,v[2],v[6],v[10],v[14]); \ + G(r,3,v[3],v[7],v[11],v[15]); \ + G(r,4,v[0],v[5],v[10],v[15]); \ + G(r,5,v[1],v[6],v[11],v[12]); \ + G(r,6,v[2],v[7],v[ 8],v[13]); \ + G(r,7,v[3],v[4],v[ 9],v[14]); \ + } while(0) + +#define GS4(a,b,c,d,e,f,a1,b1,c1,d1,e1,f1,a2,b2,c2,d2,e2,f2,a3,b3,c3,d3,e3,f3){ \ + a += b + e; a1+= b1 + e1; a2+= b2 + e2; a3+= b3 + e3; \ + d = ROL16( d ^ a); d1 = ROL16(d1 ^ a1); d2 = ROL16(d2 ^ a2); d3 = ROL16(d3 ^ a3); \ + c +=d; c1+=d1; c2+=d2; c3+=d3;\ + b = ROTR32(b ^ c, 12); b1 = ROTR32(b1^c1, 12); b2 = ROTR32(b2^c2, 12); b3 = ROTR32(b3^c3, 12); \ + a += b + f; a1+= b1 + f1; a2+= b2 + f2; a3+= b3 + f3; \ + d = ROR8(d ^ a); d1 = ROR8(d1^a1); d2 = ROR8(d2^a2); d3 = ROR8(d3^a3); \ + c += d; c1 += d1; c2 += d2; c3 += d3;\ + b = ROTR32(b ^ c, 7); b1 = ROTR32(b1^c1, 7); b2 = ROTR32(b2^c2, 7); b3 = ROTR32(b3^c3, 7); \ + } + + +static void allium_blake2s_setBlock(const uint32_t* input, const uint32_t ptarget7) +{ + uint32_t _ALIGN(64) m[16]; + uint32_t _ALIGN(64) v[16]; + uint32_t _ALIGN(64) h[21]; + + // COMPRESS + for (int i = 0; i < 16; ++i) + m[i] = input[i]; + + h[0] = 0x01010020 ^ blake2s_IV[0]; + h[1] = blake2s_IV[1]; + h[2] = blake2s_IV[2]; h[3] = blake2s_IV[3]; + h[4] = blake2s_IV[4]; h[5] = blake2s_IV[5]; + h[6] = blake2s_IV[6]; h[7] = blake2s_IV[7]; + + for (int i = 0; i < 8; ++i) + v[i] = h[i]; + + v[8] = blake2s_IV[0]; v[9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; v[11] = blake2s_IV[3]; + v[12] = 64 ^ blake2s_IV[4]; v[13] = blake2s_IV[5]; + v[14] = blake2s_IV[6]; v[15] = blake2s_IV[7]; + + ROUND(0); ROUND(1); + ROUND(2); ROUND(3); + ROUND(4); ROUND(5); + ROUND(6); ROUND(7); + ROUND(8); ROUND(9); + + for (int i = 0; i < 8; ++i) + h[i] ^= v[i] ^ v[i + 8]; + + h[16] = input[16]; + h[17] = input[17]; + h[18] = input[18]; + + h[8] = 0x6A09E667; h[9] = 0xBB67AE85; + h[10] = 0x3C6EF372; h[11] = 0xA54FF53A; + h[12] = 0x510E522F; h[13] = 0x9B05688C; + h[14] = ~0x1F83D9AB; h[15] = 0x5BE0CD19; + + h[0] += h[4] + h[16]; + h[12] = SPH_ROTR32(h[12] ^ h[0], 16); + h[8] += h[12]; + h[4] = SPH_ROTR32(h[4] ^ h[8], 12); + h[0] += h[4] + h[17]; + h[12] = SPH_ROTR32(h[12] ^ h[0], 8); + h[8] += h[12]; + h[4] = SPH_ROTR32(h[4] ^ h[8], 7); + + h[1] += h[5] + h[18]; + h[13] = SPH_ROTR32(h[13] ^ h[1], 16); + h[9] += h[13]; + h[5] = ROTR32(h[5] ^ h[9], 12); + + h[2] += h[6]; + h[14] = SPH_ROTR32(h[14] ^ h[2], 16); + h[10] += h[14]; + h[6] = SPH_ROTR32(h[6] ^ h[10], 12); + h[2] += h[6]; + h[14] = SPH_ROTR32(h[14] ^ h[2], 8); + h[10] += h[14]; + h[6] = SPH_ROTR32(h[6] ^ h[10], 7); + + h[19] = h[7]; //constant h[7] for nonce check + + h[3] += h[7]; + h[15] = SPH_ROTR32(h[15] ^ h[3], 16); + h[11] += h[15]; + h[7] = SPH_ROTR32(h[7] ^ h[11], 12); + h[3] += h[7]; + h[15] = SPH_ROTR32(h[15] ^ h[3], 8); + h[11] += h[15]; + h[7] = SPH_ROTR32(h[7] ^ h[11], 7); + + h[1] += h[5]; + h[3] += h[4]; + h[14] = SPH_ROTR32(h[14] ^ h[3], 16); + + h[2] += h[7]; + if (ptarget7 == 0){ + h[19] = SPH_ROTL32(h[19], 7); //align the rotation with v[7] v[15]; + } + cudaMemcpyToSymbol(midstate, h, 20 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} + +__global__ __launch_bounds__(TPB, 1) +void allium_blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce) +{ + const uint32_t step = gridDim.x * blockDim.x; + + uint32_t m[3]; + uint32_t v[16]; + + m[0] = midstate[16]; + m[1] = midstate[17]; + m[2] = midstate[18]; + + const uint32_t h7 = midstate[19]; + + for (uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; thread data; + uint32_t *ptarget = work->target; + uint32_t _ALIGN(64) endiandata[20]; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t nonce = first_nonce; + + int dev_id = device_map[thr_id]; + int rc = 0; + + // blake2s + uint32_t *resNonces; + + if (opt_benchmark) + ptarget[7] = 0x006fff; + + const dim3 grid((throughput + (NPT*TPB) - 1) / (NPT*TPB)); + const dim3 block(TPB); + + static __thread bool gtx750ti; + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + + cuda_get_arch(thr_id); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults * sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults * sizeof(uint32_t)), -1); + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; + if (device_sm[dev_id] <= 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + gtx750ti = (strstr(props.name, "750 Ti") != NULL); + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + resNonces = h_resNonce[thr_id]; + + for (int k = 0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + allium_blake2s_setBlock(endiandata, ptarget[7]); + + uint32_t _ALIGN(64) hash[8]; + do { + be32enc(&endiandata[19], nonce); + + if (ptarget[7]) { + allium_blake2s_gpu_hash_nonce<<>>(throughput, nonce, d_resNonce[thr_id], ptarget[7]); + } + else { + allium_blake2s_gpu_hash_nonce<<>>(throughput, nonce, d_resNonce[thr_id]); + } + + + be32enc(&d_hash[thr_id], (uint32_t) d_resNonce[thr_id]); + //d_hash[thr_id] = (uint32_t)d_resNonce[thr_id]; + + lyra2_cpu_hash_32(thr_id, throughput, nonce, d_hash[thr_id], gtx750ti); + + cudaMemcpy(resNonces, d_hash[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + if (resNonces[0]) + { + cudaMemcpy(resNonces, d_hash[thr_id], maxResults*sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemset(d_hash[thr_id], 0x00, sizeof(uint32_t)); + + if (resNonces[0] >= maxResults) { + gpulog(LOG_WARNING, thr_id, "candidates flood: %u", resNonces[0]); + resNonces[0] = maxResults - 1; + } + + nonce = sph_bswap32(resNonces[1]); + be32enc(&endiandata[19], nonce); + allium_hash(hash, endiandata); + + if (hash[7] <= Htarg && fulltest(hash, ptarget)) { + gpulog(LOG_INFO, thr_id, "Found valid nonce"); + work->nonces[0] = nonce; + work->valid_nonces = 1; + work_set_target_ratio(work, hash); + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + return work->valid_nonces; + } + } + + if (nonce + throughput > max_nonce) { + nonce = max_nonce; + break; + } + + nonce += throughput; + } while (!work_restart[thr_id].restart); + + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce + 1; + + return 0; +} + +// cleanup +extern "C" void free_allium(int thr_id) +{ + int dev_id = device_map[thr_id]; + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + if (device_sm[dev_id] >= 350) + cudaFree(d_matrix[thr_id]); + //lyra2Z_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/allium0.cu b/allium0.cu new file mode 100644 index 0000000000..87cbdd0a32 --- /dev/null +++ b/allium0.cu @@ -0,0 +1,298 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "lyra2/Lyra2.h" +#include "sph/blake2s.h" +} + +#include +#include + +static uint64_t* d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +//extern void blake256_cpu_init(int thr_id, uint32_t threads); +//extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +//extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); +// +//extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); + +//extern void groestl256_cpu_init(int thr_id, uint32_t threads); +//extern void groestl256_cpu_free(int thr_id); +//extern void groestl256_setTarget(const void *ptarget); +//extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); +//extern uint32_t groestl256_getSecNonce(int thr_id, int num); +__constant__ uint32_t _ALIGN(32) midstate[20]; + +extern "C" void allium_hash(void *state, const void *input) +{ + uint32_t hashB[8]; + uint32_t hashA[8]; + + //sph_blake256_context ctx_blake; + //sph_keccak256_context ctx_keccak; + //sph_skein256_context ctx_skein; + //sph_groestl256_context ctx_groestl; + + //sph_blake256_set_rounds(14); + + //sph_blake256_init(&ctx_blake); + //sph_blake256(&ctx_blake, input, 80); + //sph_blake256_close(&ctx_blake, hashA); + + //sph_keccak256_init(&ctx_keccak); + //sph_keccak256(&ctx_keccak, hashA, 32); + //sph_keccak256_close(&ctx_keccak, hashB); + + LYRA2(hashB, 32, input, 32, input, 32, 1, 8, 8); + blake2s_hash(hashA, hashB); + + //blake2s(out, in, NULL, 32, inlen, 0) + + //blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + //blake2s_simple(hashA, hashB, 32); + //blake2s_hash(hashA, hashB); + + //sph_skein256_init(&ctx_skein); + //sph_skein256(&ctx_skein, hashA, 32); + //sph_skein256_close(&ctx_skein, hashB); + + //sph_groestl256_init(&ctx_groestl); + //sph_groestl256(&ctx_groestl, hashB, 32); + //sph_groestl256_close(&ctx_groestl, hashA); + + memcpy(state, hashA, 32); +} + +static void blake2s_setBlock(const uint32_t* input, const uint32_t ptarget7) +{ + uint32_t _ALIGN(64) m[16]; + uint32_t _ALIGN(64) v[16]; + uint32_t _ALIGN(64) h[21]; + + // COMPRESS + for (int i = 0; i < 16; ++i) + m[i] = input[i]; + + h[0] = 0x01010020 ^ blake2s_IV[0]; + h[1] = blake2s_IV[1]; + h[2] = blake2s_IV[2]; h[3] = blake2s_IV[3]; + h[4] = blake2s_IV[4]; h[5] = blake2s_IV[5]; + h[6] = blake2s_IV[6]; h[7] = blake2s_IV[7]; + + for (int i = 0; i < 8; ++i) + v[i] = h[i]; + + v[8] = blake2s_IV[0]; v[9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; v[11] = blake2s_IV[3]; + v[12] = 64 ^ blake2s_IV[4]; v[13] = blake2s_IV[5]; + v[14] = blake2s_IV[6]; v[15] = blake2s_IV[7]; + + ROUND(0); ROUND(1); + ROUND(2); ROUND(3); + ROUND(4); ROUND(5); + ROUND(6); ROUND(7); + ROUND(8); ROUND(9); + + for (int i = 0; i < 8; ++i) + h[i] ^= v[i] ^ v[i + 8]; + + h[16] = input[16]; + h[17] = input[17]; + h[18] = input[18]; + + h[8] = 0x6A09E667; h[9] = 0xBB67AE85; + h[10] = 0x3C6EF372; h[11] = 0xA54FF53A; + h[12] = 0x510E522F; h[13] = 0x9B05688C; + h[14] = ~0x1F83D9AB; h[15] = 0x5BE0CD19; + + h[0] += h[4] + h[16]; + h[12] = SPH_ROTR32(h[12] ^ h[0], 16); + h[8] += h[12]; + h[4] = SPH_ROTR32(h[4] ^ h[8], 12); + h[0] += h[4] + h[17]; + h[12] = SPH_ROTR32(h[12] ^ h[0], 8); + h[8] += h[12]; + h[4] = SPH_ROTR32(h[4] ^ h[8], 7); + + h[1] += h[5] + h[18]; + h[13] = SPH_ROTR32(h[13] ^ h[1], 16); + h[9] += h[13]; + h[5] = ROTR32(h[5] ^ h[9], 12); + + h[2] += h[6]; + h[14] = SPH_ROTR32(h[14] ^ h[2], 16); + h[10] += h[14]; + h[6] = SPH_ROTR32(h[6] ^ h[10], 12); + h[2] += h[6]; + h[14] = SPH_ROTR32(h[14] ^ h[2], 8); + h[10] += h[14]; + h[6] = SPH_ROTR32(h[6] ^ h[10], 7); + + h[19] = h[7]; //constant h[7] for nonce check + + h[3] += h[7]; + h[15] = SPH_ROTR32(h[15] ^ h[3], 16); + h[11] += h[15]; + h[7] = SPH_ROTR32(h[7] ^ h[11], 12); + h[3] += h[7]; + h[15] = SPH_ROTR32(h[15] ^ h[3], 8); + h[11] += h[15]; + h[7] = SPH_ROTR32(h[7] ^ h[11], 7); + + h[1] += h[5]; + h[3] += h[4]; + h[14] = SPH_ROTR32(h[14] ^ h[3], 16); + + h[2] += h[7]; + if (ptarget7 == 0){ + h[19] = SPH_ROTL32(h[19], 7); //align the rotation with v[7] v[15]; + } + cudaMemcpyToSymbol(midstate, h, 20 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} + +static bool init[MAX_GPUS] = { 0 }; +static __thread uint32_t throughput = 0; + +extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + static __thread bool gtx750ti; + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + CUDA_LOG_ERROR(); + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + //blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); + //skein256_cpu_init(thr_id, throughput); + //groestl256_cpu_init(thr_id, throughput); + + //cuda_get_arch(thr_id); + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + uint32_t _ALIGN(128) endiandata[20]; + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + //blake256_cpu_setBlock_80(pdata); + //groestl256_setTarget(ptarget); + + do { + int order = 0; + + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + //skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + //work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + work->nonces[0] = lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + allium_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + //work->nonces[1] = groestl256_getSecNonce(thr_id, 1); + if (work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + allium_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } + else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_allium(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + //keccak256_sm3_free(thr_id); + groestl256_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/bench.cpp b/bench.cpp index baa999dcb1..ac3466d167 100644 --- a/bench.cpp +++ b/bench.cpp @@ -49,6 +49,7 @@ void bench_free() void algo_free_all(int thr_id) { // only initialized algos will be freed + free_allium(thr_id); free_bastion(thr_id); free_bitcore(thr_id); free_blake256(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 6c6d33f64a..47b7aea90e 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -236,6 +236,7 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the hash algorithm to use\n\ + allium Lyra2 blake2s\n\ bastion Hefty bastion\n\ bitcore Timetravel-10\n\ blake Blake 256 (SFR)\n\ @@ -1703,6 +1704,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LBRY: case ALGO_LYRA2v2: case ALGO_LYRA2Z: + case ALGO_ALLIUM: case ALGO_TIMETRAVEL: case ALGO_BITCORE: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); @@ -2255,6 +2257,7 @@ static void *miner_thread(void *userdata) break; case ALGO_LYRA2: case ALGO_LYRA2Z: + case ALGO_ALLIUM: case ALGO_NEOSCRYPT: case ALGO_SIB: case ALGO_SCRYPT: @@ -2412,6 +2415,9 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2Z: rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_ALLIUM: + rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_NEOSCRYPT: rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 2a37505893..bf8efa4822 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -273,6 +273,7 @@ 76 + 64 @@ -615,4 +616,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index a508e2222c..b61159f410 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -967,6 +967,9 @@ Source Files\equi + + Source Files\CUDA + @@ -983,4 +986,4 @@ Ressources - + \ No newline at end of file diff --git a/miner.h b/miner.h index 0ad85230de..12f22c0342 100644 --- a/miner.h +++ b/miner.h @@ -273,6 +273,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); struct work; +extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -336,6 +337,7 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc /* free device allocated memory per algo */ void algo_free_all(int thr_id); +extern void free_allium(int thr_id); extern void free_bastion(int thr_id); extern void free_bitcore(int thr_id); extern void free_blake256(int thr_id); @@ -881,6 +883,7 @@ void applog_hash64(void *hash); void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); +void allium_hash(void *output, const void *input); void bastionhash(void* output, const unsigned char* input); void blake256hash(void *output, const void *input, int8_t rounds); void blake2b_hash(void *output, const void *input); diff --git a/util.cpp b/util.cpp index 152093d9ae..17d47600ec 100644 --- a/util.cpp +++ b/util.cpp @@ -2160,6 +2160,9 @@ void print_hash_tests(void) printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); + allium_hash(&hash[0], &buf[0]); + printpfx("allium", hash); + bastionhash(&hash[0], &buf[0]); printpfx("bastion", hash); From b97ad50dafa2843b43ca66fbda6fb72a5009efb0 Mon Sep 17 00:00:00 2001 From: Lennart ten Wolde Date: Sun, 11 Feb 2018 03:26:12 +0100 Subject: [PATCH 2/4] Fix compilation errors --- Makefile.am | 1 + allium.cu | 58 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/Makefile.am b/Makefile.am index 01fc6b140e..85ac64ed50 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,6 +24,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ nvsettings.cpp \ equi/equi-stratum.cpp equi/equi.cpp equi/blake2/blake2bx.cpp \ equi/equihash.cpp equi/cuda_equi.cu \ + allium.cu \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \ diff --git a/allium.cu b/allium.cu index ce719dd4c9..864dc42066 100644 --- a/allium.cu +++ b/allium.cu @@ -391,6 +391,8 @@ static bool init[MAX_GPUS] = { 0 }; static __thread uint32_t throughput = 0; static __thread bool gtx750ti = false; +static uint32_t *h_GNonces[16]; // this need to get fixed as the rest of that routine + extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; @@ -398,7 +400,6 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce uint32_t _ALIGN(64) endiandata[20]; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; int dev_id = device_map[thr_id]; int rc = 0; @@ -446,67 +447,73 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + // nonce + cudaMallocHost(&h_GNonces[thr_id], 2 * sizeof(uint32_t)); + init[thr_id] = true; } resNonces = h_resNonce[thr_id]; for (int k = 0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); + allium_blake2s_setBlock(endiandata, ptarget[7]); + cudaMemset(d_resNonce[thr_id], 0x00, maxResults*sizeof(uint32_t)); uint32_t _ALIGN(64) hash[8]; + do { - be32enc(&endiandata[19], nonce); + //be32enc(&endiandata[19], nonce); if (ptarget[7]) { - allium_blake2s_gpu_hash_nonce<<>>(throughput, nonce, d_resNonce[thr_id], ptarget[7]); + allium_blake2s_gpu_hash_nonce<<>>(throughput, pdata[19], d_resNonce[thr_id], ptarget[7]); } else { - allium_blake2s_gpu_hash_nonce<<>>(throughput, nonce, d_resNonce[thr_id]); + allium_blake2s_gpu_hash_nonce<<>>(throughput, pdata[19], d_resNonce[thr_id]); } + *hashes_done = pdata[19] - first_nonce + throughput; - be32enc(&d_hash[thr_id], (uint32_t) d_resNonce[thr_id]); - //d_hash[thr_id] = (uint32_t)d_resNonce[thr_id]; - - lyra2_cpu_hash_32(thr_id, throughput, nonce, d_hash[thr_id], gtx750ti); + cudaMemcpy(&d_hash[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyHostToHost); - cudaMemcpy(resNonces, d_hash[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); - if (resNonces[0]) + cudaMemcpy(h_GNonces[thr_id], d_hash[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + work->nonces[0] = *h_GNonces[thr_id]; + if (work->nonces[0]) { - cudaMemcpy(resNonces, d_hash[thr_id], maxResults*sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemset(d_hash[thr_id], 0x00, sizeof(uint32_t)); + //gpulog(LOG_INFO, thr_id, "Running on nonce %u", work->nonces[0]); + //cudaMemcpy(resNonces, d_hash[thr_id], maxResults*sizeof(uint32_t), cudaMemcpyDeviceToHost); + //cudaMemset(d_hash[thr_id], 0x00, sizeof(uint32_t)); - if (resNonces[0] >= maxResults) { - gpulog(LOG_WARNING, thr_id, "candidates flood: %u", resNonces[0]); - resNonces[0] = maxResults - 1; - } + //if (resNonces[0] >= maxResults) { + // gpulog(LOG_WARNING, thr_id, "candidates flood: %u", resNonces[0]); + // resNonces[0] = maxResults - 1; + //} - nonce = sph_bswap32(resNonces[1]); - be32enc(&endiandata[19], nonce); + pdata[19] = work->nonces[0]; + be32enc(&endiandata[19], pdata[19]); allium_hash(hash, endiandata); if (hash[7] <= Htarg && fulltest(hash, ptarget)) { gpulog(LOG_INFO, thr_id, "Found valid nonce"); - work->nonces[0] = nonce; + //work->nonces[0] = pdata[19]; work->valid_nonces = 1; work_set_target_ratio(work, hash); - pdata[19] = nonce; *hashes_done = pdata[19] - first_nonce; return work->valid_nonces; } } - if (nonce + throughput > max_nonce) { - nonce = max_nonce; + if (pdata[19] + throughput > max_nonce) { + pdata[19] = max_nonce; break; } - nonce += throughput; + pdata[19] += throughput; } while (!work_restart[thr_id].restart); - pdata[19] = nonce; + //pdata[19] = nonce; *hashes_done = pdata[19] - first_nonce + 1; return 0; @@ -524,7 +531,8 @@ extern "C" void free_allium(int thr_id) cudaFree(d_hash[thr_id]); if (device_sm[dev_id] >= 350) cudaFree(d_matrix[thr_id]); - //lyra2Z_cpu_free(thr_id); + // nonce + cudaFreeHost(h_GNonces[thr_id]); init[thr_id] = false; From 7882b2b369b70df071e24ca213a8f841ba9c5a40 Mon Sep 17 00:00:00 2001 From: Lennart ten Wolde Date: Sun, 11 Feb 2018 17:29:37 +0100 Subject: [PATCH 3/4] Implement final allium spec --- allium.cu | 550 +++++++++++------------------------------------------- 1 file changed, 112 insertions(+), 438 deletions(-) diff --git a/allium.cu b/allium.cu index 864dc42066..5cad1d7975 100644 --- a/allium.cu +++ b/allium.cu @@ -1,443 +1,119 @@ extern "C" { -//#include +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" #include "lyra2/Lyra2.h" } #include #include -//#include -#include -extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti); - -//extern void blake2s_setBlock(const uint32_t* input, const uint32_t ptarget7); -//extern void blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce); -//extern void blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t ptarget7); - -// Blake2s - -#ifdef __INTELLISENSE__ -#define __byte_perm(x, y, b) x -#endif - -#include "cuda_helper.h" - -#ifdef __CUDA_ARCH__ +static uint64_t* d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; -__device__ __forceinline__ -uint32_t ROR8(const uint32_t a) { - return __byte_perm(a, 0, 0x0321); -} +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -__device__ __forceinline__ -uint32_t ROL16(const uint32_t a) { - return __byte_perm(a, 0, 0x1032); -} +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); -#else -#define ROR8(u) (u >> 8) -#define ROL16(u) (u << 16) -#endif +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -__device__ __forceinline__ -uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) -{ - uint32_t result; -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b), "r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA -#else - result = a^b^c; -#endif - return result; -} +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void skein256_cpu_init(int thr_id, uint32_t threads); -static const uint32_t blake2s_IV[8] = { - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - - -#define TPB 1024 -#define NPT 256 -#define maxResults 16 -#define NBN 1 -__constant__ uint32_t _ALIGN(32) midstate[20]; - -static uint32_t *d_resNonce[MAX_GPUS]; -static uint32_t *h_resNonce[MAX_GPUS]; - -static const uint8_t blake2s_sigma[10][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, -}; - -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2s_sigma[r][2*i+0]]; \ - d = SPH_ROTR32(d ^ a, 16); \ - c = c + d; \ - b = SPH_ROTR32(b ^ c, 12); \ - a = a + b + m[blake2s_sigma[r][2*i+1]]; \ - d = SPH_ROTR32(d ^ a, 8); \ - c = c + d; \ - b = SPH_ROTR32(b ^ c, 7); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[0],v[4],v[ 8],v[12]); \ - G(r,1,v[1],v[5],v[ 9],v[13]); \ - G(r,2,v[2],v[6],v[10],v[14]); \ - G(r,3,v[3],v[7],v[11],v[15]); \ - G(r,4,v[0],v[5],v[10],v[15]); \ - G(r,5,v[1],v[6],v[11],v[12]); \ - G(r,6,v[2],v[7],v[ 8],v[13]); \ - G(r,7,v[3],v[4],v[ 9],v[14]); \ - } while(0) - -#define GS4(a,b,c,d,e,f,a1,b1,c1,d1,e1,f1,a2,b2,c2,d2,e2,f2,a3,b3,c3,d3,e3,f3){ \ - a += b + e; a1+= b1 + e1; a2+= b2 + e2; a3+= b3 + e3; \ - d = ROL16( d ^ a); d1 = ROL16(d1 ^ a1); d2 = ROL16(d2 ^ a2); d3 = ROL16(d3 ^ a3); \ - c +=d; c1+=d1; c2+=d2; c3+=d3;\ - b = ROTR32(b ^ c, 12); b1 = ROTR32(b1^c1, 12); b2 = ROTR32(b2^c2, 12); b3 = ROTR32(b3^c3, 12); \ - a += b + f; a1+= b1 + f1; a2+= b2 + f2; a3+= b3 + f3; \ - d = ROR8(d ^ a); d1 = ROR8(d1^a1); d2 = ROR8(d2^a2); d3 = ROR8(d3^a3); \ - c += d; c1 += d1; c2 += d2; c3 += d3;\ - b = ROTR32(b ^ c, 7); b1 = ROTR32(b1^c1, 7); b2 = ROTR32(b2^c2, 7); b3 = ROTR32(b3^c3, 7); \ - } +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); +extern void groestl256_cpu_init(int thr_id, uint32_t threads); +extern void groestl256_cpu_free(int thr_id); +extern void groestl256_setTarget(const void *ptarget); +extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); +extern uint32_t groestl256_getSecNonce(int thr_id, int num); -static void allium_blake2s_setBlock(const uint32_t* input, const uint32_t ptarget7) -{ - uint32_t _ALIGN(64) m[16]; - uint32_t _ALIGN(64) v[16]; - uint32_t _ALIGN(64) h[21]; - - // COMPRESS - for (int i = 0; i < 16; ++i) - m[i] = input[i]; - - h[0] = 0x01010020 ^ blake2s_IV[0]; - h[1] = blake2s_IV[1]; - h[2] = blake2s_IV[2]; h[3] = blake2s_IV[3]; - h[4] = blake2s_IV[4]; h[5] = blake2s_IV[5]; - h[6] = blake2s_IV[6]; h[7] = blake2s_IV[7]; - - for (int i = 0; i < 8; ++i) - v[i] = h[i]; - - v[8] = blake2s_IV[0]; v[9] = blake2s_IV[1]; - v[10] = blake2s_IV[2]; v[11] = blake2s_IV[3]; - v[12] = 64 ^ blake2s_IV[4]; v[13] = blake2s_IV[5]; - v[14] = blake2s_IV[6]; v[15] = blake2s_IV[7]; - - ROUND(0); ROUND(1); - ROUND(2); ROUND(3); - ROUND(4); ROUND(5); - ROUND(6); ROUND(7); - ROUND(8); ROUND(9); - - for (int i = 0; i < 8; ++i) - h[i] ^= v[i] ^ v[i + 8]; - - h[16] = input[16]; - h[17] = input[17]; - h[18] = input[18]; - - h[8] = 0x6A09E667; h[9] = 0xBB67AE85; - h[10] = 0x3C6EF372; h[11] = 0xA54FF53A; - h[12] = 0x510E522F; h[13] = 0x9B05688C; - h[14] = ~0x1F83D9AB; h[15] = 0x5BE0CD19; - - h[0] += h[4] + h[16]; - h[12] = SPH_ROTR32(h[12] ^ h[0], 16); - h[8] += h[12]; - h[4] = SPH_ROTR32(h[4] ^ h[8], 12); - h[0] += h[4] + h[17]; - h[12] = SPH_ROTR32(h[12] ^ h[0], 8); - h[8] += h[12]; - h[4] = SPH_ROTR32(h[4] ^ h[8], 7); - - h[1] += h[5] + h[18]; - h[13] = SPH_ROTR32(h[13] ^ h[1], 16); - h[9] += h[13]; - h[5] = ROTR32(h[5] ^ h[9], 12); - - h[2] += h[6]; - h[14] = SPH_ROTR32(h[14] ^ h[2], 16); - h[10] += h[14]; - h[6] = SPH_ROTR32(h[6] ^ h[10], 12); - h[2] += h[6]; - h[14] = SPH_ROTR32(h[14] ^ h[2], 8); - h[10] += h[14]; - h[6] = SPH_ROTR32(h[6] ^ h[10], 7); - - h[19] = h[7]; //constant h[7] for nonce check - - h[3] += h[7]; - h[15] = SPH_ROTR32(h[15] ^ h[3], 16); - h[11] += h[15]; - h[7] = SPH_ROTR32(h[7] ^ h[11], 12); - h[3] += h[7]; - h[15] = SPH_ROTR32(h[15] ^ h[3], 8); - h[11] += h[15]; - h[7] = SPH_ROTR32(h[7] ^ h[11], 7); - - h[1] += h[5]; - h[3] += h[4]; - h[14] = SPH_ROTR32(h[14] ^ h[3], 16); - - h[2] += h[7]; - if (ptarget7 == 0){ - h[19] = SPH_ROTL32(h[19], 7); //align the rotation with v[7] v[15]; - } - cudaMemcpyToSymbol(midstate, h, 20 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); -} +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); -__global__ __launch_bounds__(TPB, 1) -void allium_blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce) +extern "C" void allium_hash(void *state, const void *input) { - const uint32_t step = gridDim.x * blockDim.x; + uint32_t hashA[8], hashB[8]; - uint32_t m[3]; - uint32_t v[16]; + sph_blake256_context ctx_blake; + sph_keccak256_context ctx_keccak; + sph_skein256_context ctx_skein; + sph_groestl256_context ctx_groestl; + sph_cubehash256_context ctx_cube; - m[0] = midstate[16]; - m[1] = midstate[17]; - m[2] = midstate[18]; + sph_blake256_set_rounds(14); - const uint32_t h7 = midstate[19]; - - for (uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; thread data; uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; - int dev_id = device_map[thr_id]; - int rc = 0; - - // blake2s - uint32_t *resNonces; - if (opt_benchmark) - ptarget[7] = 0x006fff; - - const dim3 grid((throughput + (NPT*TPB) - 1) / (NPT*TPB)); - const dim3 block(TPB); + ptarget[7] = 0x00ff; static __thread bool gtx750ti; if (!init[thr_id]) { + int dev_id = device_map[thr_id]; cudaSetDevice(dev_id); - if (opt_cudaschedule == -1 && gpu_threads == 1) { - cudaDeviceReset(); - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - CUDA_LOG_ERROR(); - } - - cuda_get_arch(thr_id); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults * sizeof(uint32_t)), -1); - CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults * sizeof(uint32_t)), -1); + CUDA_LOG_ERROR(); - int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; - if (device_sm[dev_id] <= 500) intensity = 15; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); cudaDeviceProp props; cudaGetDeviceProperties(&props, dev_id); - gtx750ti = (strstr(props.name, "750 Ti") != NULL); + + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); + skein256_cpu_init(thr_id, throughput); + groestl256_cpu_init(thr_id, throughput); + + //cuda_get_arch(thr_id); if (device_sm[dev_id] >= 500) { size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; @@ -447,92 +123,90 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); - // nonce - cudaMallocHost(&h_GNonces[thr_id], 2 * sizeof(uint32_t)); - init[thr_id] = true; } - resNonces = h_resNonce[thr_id]; - for (int k = 0; k < 19; k++) + uint32_t _ALIGN(128) endiandata[20]; + for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - allium_blake2s_setBlock(endiandata, ptarget[7]); - - cudaMemset(d_resNonce[thr_id], 0x00, maxResults*sizeof(uint32_t)); - uint32_t _ALIGN(64) hash[8]; + blake256_cpu_setBlock_80(pdata); + groestl256_setTarget(ptarget); do { - //be32enc(&endiandata[19], nonce); + int order = 0; - if (ptarget[7]) { - allium_blake2s_gpu_hash_nonce<<>>(throughput, pdata[19], d_resNonce[thr_id], ptarget[7]); - } - else { - allium_blake2s_gpu_hash_nonce<<>>(throughput, pdata[19], d_resNonce[thr_id]); - } + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - *hashes_done = pdata[19] - first_nonce + throughput; - - cudaMemcpy(&d_hash[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyHostToHost); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); - cudaMemcpy(h_GNonces[thr_id], d_hash[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost); - work->nonces[0] = *h_GNonces[thr_id]; - if (work->nonces[0]) + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (work->nonces[0] != UINT32_MAX) { - //gpulog(LOG_INFO, thr_id, "Running on nonce %u", work->nonces[0]); - //cudaMemcpy(resNonces, d_hash[thr_id], maxResults*sizeof(uint32_t), cudaMemcpyDeviceToHost); - //cudaMemset(d_hash[thr_id], 0x00, sizeof(uint32_t)); - - //if (resNonces[0] >= maxResults) { - // gpulog(LOG_WARNING, thr_id, "candidates flood: %u", resNonces[0]); - // resNonces[0] = maxResults - 1; - //} - - pdata[19] = work->nonces[0]; - be32enc(&endiandata[19], pdata[19]); - allium_hash(hash, endiandata); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - gpulog(LOG_INFO, thr_id, "Found valid nonce"); - //work->nonces[0] = pdata[19]; + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + allium_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; - work_set_target_ratio(work, hash); - *hashes_done = pdata[19] - first_nonce; + work_set_target_ratio(work, vhash); + work->nonces[1] = groestl256_getSecNonce(thr_id, 1); + if (work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + allium_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } + else { + pdata[19] = work->nonces[0] + 1; // cursor + } return work->valid_nonces; } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } } - if (pdata[19] + throughput > max_nonce) { + if ((uint64_t)throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } - pdata[19] += throughput; - } while (!work_restart[thr_id].restart); - //pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; + } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; return 0; } // cleanup extern "C" void free_allium(int thr_id) { - int dev_id = device_map[thr_id]; if (!init[thr_id]) return; cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); - if (device_sm[dev_id] >= 350) - cudaFree(d_matrix[thr_id]); - // nonce - cudaFreeHost(h_GNonces[thr_id]); + cudaFree(d_matrix[thr_id]); + + //keccak256_sm3_free(thr_id); + groestl256_cpu_free(thr_id); init[thr_id] = false; From 81931e3272b681f81c0404120b8af53f7e1a72ca Mon Sep 17 00:00:00 2001 From: Lennart ten Wolde Date: Sun, 11 Feb 2018 17:49:45 +0100 Subject: [PATCH 4/4] Remove unused code --- allium.cu | 2 +- allium0.cu | 298 ----------------------------------------------------- 2 files changed, 1 insertion(+), 299 deletions(-) delete mode 100644 allium0.cu diff --git a/allium.cu b/allium.cu index 5cad1d7975..a3eeb722f4 100644 --- a/allium.cu +++ b/allium.cu @@ -86,7 +86,7 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce const uint32_t first_nonce = pdata[19]; if (opt_benchmark) - ptarget[7] = 0x00ff; + ptarget[7] = 0x0400; static __thread bool gtx750ti; if (!init[thr_id]) diff --git a/allium0.cu b/allium0.cu deleted file mode 100644 index 87cbdd0a32..0000000000 --- a/allium0.cu +++ /dev/null @@ -1,298 +0,0 @@ -extern "C" { -#include "sph/sph_blake.h" -#include "sph/sph_groestl.h" -#include "sph/sph_skein.h" -#include "sph/sph_keccak.h" -#include "lyra2/Lyra2.h" -#include "sph/blake2s.h" -} - -#include -#include - -static uint64_t* d_hash[MAX_GPUS]; -static uint64_t* d_matrix[MAX_GPUS]; - -//extern void blake256_cpu_init(int thr_id, uint32_t threads); -//extern void blake256_cpu_setBlock_80(uint32_t *pdata); -//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); - -//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); -//extern void keccak256_sm3_init(int thr_id, uint32_t threads); -//extern void keccak256_sm3_free(int thr_id); - -//extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -// -//extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); -//extern void skein256_cpu_init(int thr_id, uint32_t threads); - -extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); - -//extern void groestl256_cpu_init(int thr_id, uint32_t threads); -//extern void groestl256_cpu_free(int thr_id); -//extern void groestl256_setTarget(const void *ptarget); -//extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); -//extern uint32_t groestl256_getSecNonce(int thr_id, int num); -__constant__ uint32_t _ALIGN(32) midstate[20]; - -extern "C" void allium_hash(void *state, const void *input) -{ - uint32_t hashB[8]; - uint32_t hashA[8]; - - //sph_blake256_context ctx_blake; - //sph_keccak256_context ctx_keccak; - //sph_skein256_context ctx_skein; - //sph_groestl256_context ctx_groestl; - - //sph_blake256_set_rounds(14); - - //sph_blake256_init(&ctx_blake); - //sph_blake256(&ctx_blake, input, 80); - //sph_blake256_close(&ctx_blake, hashA); - - //sph_keccak256_init(&ctx_keccak); - //sph_keccak256(&ctx_keccak, hashA, 32); - //sph_keccak256_close(&ctx_keccak, hashB); - - LYRA2(hashB, 32, input, 32, input, 32, 1, 8, 8); - blake2s_hash(hashA, hashB); - - //blake2s(out, in, NULL, 32, inlen, 0) - - //blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) - //blake2s_simple(hashA, hashB, 32); - //blake2s_hash(hashA, hashB); - - //sph_skein256_init(&ctx_skein); - //sph_skein256(&ctx_skein, hashA, 32); - //sph_skein256_close(&ctx_skein, hashB); - - //sph_groestl256_init(&ctx_groestl); - //sph_groestl256(&ctx_groestl, hashB, 32); - //sph_groestl256_close(&ctx_groestl, hashA); - - memcpy(state, hashA, 32); -} - -static void blake2s_setBlock(const uint32_t* input, const uint32_t ptarget7) -{ - uint32_t _ALIGN(64) m[16]; - uint32_t _ALIGN(64) v[16]; - uint32_t _ALIGN(64) h[21]; - - // COMPRESS - for (int i = 0; i < 16; ++i) - m[i] = input[i]; - - h[0] = 0x01010020 ^ blake2s_IV[0]; - h[1] = blake2s_IV[1]; - h[2] = blake2s_IV[2]; h[3] = blake2s_IV[3]; - h[4] = blake2s_IV[4]; h[5] = blake2s_IV[5]; - h[6] = blake2s_IV[6]; h[7] = blake2s_IV[7]; - - for (int i = 0; i < 8; ++i) - v[i] = h[i]; - - v[8] = blake2s_IV[0]; v[9] = blake2s_IV[1]; - v[10] = blake2s_IV[2]; v[11] = blake2s_IV[3]; - v[12] = 64 ^ blake2s_IV[4]; v[13] = blake2s_IV[5]; - v[14] = blake2s_IV[6]; v[15] = blake2s_IV[7]; - - ROUND(0); ROUND(1); - ROUND(2); ROUND(3); - ROUND(4); ROUND(5); - ROUND(6); ROUND(7); - ROUND(8); ROUND(9); - - for (int i = 0; i < 8; ++i) - h[i] ^= v[i] ^ v[i + 8]; - - h[16] = input[16]; - h[17] = input[17]; - h[18] = input[18]; - - h[8] = 0x6A09E667; h[9] = 0xBB67AE85; - h[10] = 0x3C6EF372; h[11] = 0xA54FF53A; - h[12] = 0x510E522F; h[13] = 0x9B05688C; - h[14] = ~0x1F83D9AB; h[15] = 0x5BE0CD19; - - h[0] += h[4] + h[16]; - h[12] = SPH_ROTR32(h[12] ^ h[0], 16); - h[8] += h[12]; - h[4] = SPH_ROTR32(h[4] ^ h[8], 12); - h[0] += h[4] + h[17]; - h[12] = SPH_ROTR32(h[12] ^ h[0], 8); - h[8] += h[12]; - h[4] = SPH_ROTR32(h[4] ^ h[8], 7); - - h[1] += h[5] + h[18]; - h[13] = SPH_ROTR32(h[13] ^ h[1], 16); - h[9] += h[13]; - h[5] = ROTR32(h[5] ^ h[9], 12); - - h[2] += h[6]; - h[14] = SPH_ROTR32(h[14] ^ h[2], 16); - h[10] += h[14]; - h[6] = SPH_ROTR32(h[6] ^ h[10], 12); - h[2] += h[6]; - h[14] = SPH_ROTR32(h[14] ^ h[2], 8); - h[10] += h[14]; - h[6] = SPH_ROTR32(h[6] ^ h[10], 7); - - h[19] = h[7]; //constant h[7] for nonce check - - h[3] += h[7]; - h[15] = SPH_ROTR32(h[15] ^ h[3], 16); - h[11] += h[15]; - h[7] = SPH_ROTR32(h[7] ^ h[11], 12); - h[3] += h[7]; - h[15] = SPH_ROTR32(h[15] ^ h[3], 8); - h[11] += h[15]; - h[7] = SPH_ROTR32(h[7] ^ h[11], 7); - - h[1] += h[5]; - h[3] += h[4]; - h[14] = SPH_ROTR32(h[14] ^ h[3], 16); - - h[2] += h[7]; - if (ptarget7 == 0){ - h[19] = SPH_ROTL32(h[19], 7); //align the rotation with v[7] v[15]; - } - cudaMemcpyToSymbol(midstate, h, 20 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); -} - -static bool init[MAX_GPUS] = { 0 }; -static __thread uint32_t throughput = 0; - -extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ptarget[7] = 0x00ff; - - static __thread bool gtx750ti; - if (!init[thr_id]) - { - int dev_id = device_map[thr_id]; - cudaSetDevice(dev_id); - CUDA_LOG_ERROR(); - - int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; - if (device_sm[device_map[thr_id]] == 500) intensity = 15; - throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - - cudaDeviceProp props; - cudaGetDeviceProperties(&props, dev_id); - - if (strstr(props.name, "750 Ti")) gtx750ti = true; - else gtx750ti = false; - - gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - - //blake256_cpu_init(thr_id, throughput); - //keccak256_sm3_init(thr_id, throughput); - //skein256_cpu_init(thr_id, throughput); - //groestl256_cpu_init(thr_id, throughput); - - //cuda_get_arch(thr_id); - if (device_sm[dev_id] >= 500) - { - size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; - CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); - lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); - } - - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); - - init[thr_id] = true; - } - - uint32_t _ALIGN(128) endiandata[20]; - for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], pdata[k]); - - //blake256_cpu_setBlock_80(pdata); - //groestl256_setTarget(ptarget); - - do { - int order = 0; - - //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - //blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); - //skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - *hashes_done = pdata[19] - first_nonce + throughput; - - //work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - work->nonces[0] = lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); - if (work->nonces[0] != UINT32_MAX) - { - const uint32_t Htarg = ptarget[7]; - uint32_t _ALIGN(64) vhash[8]; - - be32enc(&endiandata[19], work->nonces[0]); - allium_hash(vhash, endiandata); - - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { - work->valid_nonces = 1; - work_set_target_ratio(work, vhash); - //work->nonces[1] = groestl256_getSecNonce(thr_id, 1); - if (work->nonces[1] != UINT32_MAX) { - be32enc(&endiandata[19], work->nonces[1]); - allium_hash(vhash, endiandata); - bn_set_target_ratio(work, vhash, 1); - work->valid_nonces++; - pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; - } - else { - pdata[19] = work->nonces[0] + 1; // cursor - } - return work->valid_nonces; - } - else if (vhash[7] > Htarg) { - gpu_increment_reject(thr_id); - if (!opt_quiet) - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); - pdata[19] = work->nonces[0] + 1; - continue; - } - } - - if ((uint64_t)throughput + pdata[19] >= max_nonce) { - pdata[19] = max_nonce; - break; - } - pdata[19] += throughput; - - } while (!work_restart[thr_id].restart); - - *hashes_done = pdata[19] - first_nonce; - return 0; -} - -// cleanup -extern "C" void free_allium(int thr_id) -{ - if (!init[thr_id]) - return; - - cudaThreadSynchronize(); - - cudaFree(d_hash[thr_id]); - cudaFree(d_matrix[thr_id]); - - //keccak256_sm3_free(thr_id); - groestl256_cpu_free(thr_id); - - init[thr_id] = false; - - cudaDeviceSynchronize(); -}