// Blake2-B CUDA Implementation // tpruvot@github July 2016 // permission granted to use under MIT license // modified for use in Zcash by John Tromp September 2016 /** * uint2 direct ops by c++ operator definitions */ static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } // uint2 ROR/ROL methods __device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) { uint2 result; #if __CUDA_ARCH__ > 300 if (offset < 32) { asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); } else /* if (offset < 64) */ { /* offset SHOULD BE < 64 ! */ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); } #else if (!offset) result = a; else if (offset < 32) { result.y = ((a.y >> offset) | (a.x << (32 - offset))); result.x = ((a.x >> offset) | (a.y << (32 - offset))); } else if (offset == 32) { result.y = a.x; result.x = a.y; } else { result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); } #endif return result; } __device__ __forceinline__ uint2 SWAPUINT2(uint2 value) { return make_uint2(value.y, value.x); } #ifdef __CUDA_ARCH__ __device__ __inline__ uint2 ROR24(const uint2 a) { uint2 result; result.x = __byte_perm(a.y, a.x, 0x2107); result.y = __byte_perm(a.y, a.x, 0x6543); return result; } __device__ __inline__ uint2 ROR16(const uint2 a) { uint2 result; result.x = __byte_perm(a.y, a.x, 0x1076); result.y = __byte_perm(a.y, a.x, 0x5432); return result; } #else #define ROR24(u) ROR2(u,24) #define ROR16(u) ROR2(u,16) #endif typedef uint64_t u64; static __constant__ const int8_t blake2b_sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; __device__ __forceinline__ static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) { a = a + b + m[ blake2b_sigma[r][2*i] ]; ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); c = c + d; ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] ); a = a + b + m[ blake2b_sigma[r][2*i+1] ]; ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); c = c + d; ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); } #define ROUND(r) \ G(r, 0, v[0], v[4], v[ 8], v[12], m); \ G(r, 1, v[1], v[5], v[ 9], v[13], m); \ G(r, 2, v[2], v[6], v[10], v[14], m); \ G(r, 3, v[3], v[7], v[11], v[15], m); \ G(r, 4, v[0], v[5], v[10], v[15], m); \ G(r, 5, v[1], v[6], v[11], v[12], m); \ G(r, 6, v[2], v[7], v[ 8], v[13], m); \ G(r, 7, v[3], v[4], v[ 9], v[14], m); __device__ void blake2b_gpu_hash(blake2b_state *state, u32 idx, uchar *hash, u32 outlen) { const u32 leb = idx; // CUDA is little endian, so no need for htole32(idx) memcpy(state->buf + state->buflen, &leb, sizeof(u32)); state->buflen += sizeof(u32); state->counter += state->buflen; memset(state->buf + state->buflen, 0, BLAKE2B_BLOCKBYTES - state->buflen); u64 *d_data = (u64 *)state->buf; u64 m[16]; m[0] = d_data[0]; m[1] = d_data[1]; m[2] = d_data[2]; m[3] = d_data[3]; m[4] = d_data[4]; m[5] = d_data[5]; m[6] = d_data[6]; m[7] = d_data[7]; m[8] = d_data[8]; m[9] = d_data[9]; m[10] = d_data[10]; m[11] = d_data[11]; m[12] = d_data[12]; m[13] = d_data[13]; m[14] = d_data[14]; m[15] = d_data[15]; u64 v[16]; v[0] = state->h[0]; v[1] = state->h[1]; v[2] = state->h[2]; v[3] = state->h[3]; v[4] = state->h[4]; v[5] = state->h[5]; v[6] = state->h[6]; v[7] = state->h[7]; v[8] = 0x6a09e667f3bcc908; v[9] = 0xbb67ae8584caa73b; v[10] = 0x3c6ef372fe94f82b; v[11] = 0xa54ff53a5f1d36f1; v[12] = 0x510e527fade682d1 ^ state->counter; v[13] = 0x9b05688c2b3e6c1f; v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff; v[15] = 0x5be0cd19137e2179; ROUND( 0 ); ROUND( 1 ); ROUND( 2 ); ROUND( 3 ); ROUND( 4 ); ROUND( 5 ); ROUND( 6 ); ROUND( 7 ); ROUND( 8 ); ROUND( 9 ); ROUND( 10 ); ROUND( 11 ); state->h[0] ^= v[0] ^ v[ 8]; state->h[1] ^= v[1] ^ v[ 9]; state->h[2] ^= v[2] ^ v[10]; state->h[3] ^= v[3] ^ v[11]; state->h[4] ^= v[4] ^ v[12]; state->h[5] ^= v[5] ^ v[13]; state->h[6] ^= v[6] ^ v[14]; state->h[7] ^= v[7] ^ v[15]; memcpy(hash, (uchar *)state->h, outlen); }