2016-10-14 11:28:34 -07:00
|
|
|
// Blake2-B CUDA Implementation
|
|
|
|
// tpruvot@github July 2016
|
2016-10-16 15:49:03 -07:00
|
|
|
// permission granted to use under MIT license
|
|
|
|
// modified for use in Zcash by John Tromp September 2016
|
2016-10-14 11:28:34 -07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* uint2 direct ops by c++ operator definitions
|
|
|
|
*/
|
|
|
|
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
|
|
|
|
return make_uint2(a.x ^ b.x, a.y ^ b.y);
|
|
|
|
}
|
|
|
|
// uint2 ROR/ROL methods
|
|
|
|
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
|
|
|
|
uint2 result;
|
|
|
|
#if __CUDA_ARCH__ > 300
|
|
|
|
if (offset < 32) {
|
|
|
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
|
|
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
|
|
|
} else /* if (offset < 64) */ {
|
|
|
|
/* offset SHOULD BE < 64 ! */
|
|
|
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
|
|
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
if (!offset)
|
|
|
|
result = a;
|
|
|
|
else if (offset < 32) {
|
|
|
|
result.y = ((a.y >> offset) | (a.x << (32 - offset)));
|
|
|
|
result.x = ((a.x >> offset) | (a.y << (32 - offset)));
|
|
|
|
} else if (offset == 32) {
|
|
|
|
result.y = a.x;
|
|
|
|
result.x = a.y;
|
|
|
|
} else {
|
|
|
|
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
|
|
|
|
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
|
|
|
|
return make_uint2(value.y, value.x);
|
|
|
|
}
|
|
|
|
#ifdef __CUDA_ARCH__
|
|
|
|
__device__ __inline__ uint2 ROR24(const uint2 a) {
|
|
|
|
uint2 result;
|
|
|
|
result.x = __byte_perm(a.y, a.x, 0x2107);
|
|
|
|
result.y = __byte_perm(a.y, a.x, 0x6543);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
__device__ __inline__ uint2 ROR16(const uint2 a) {
|
|
|
|
uint2 result;
|
|
|
|
result.x = __byte_perm(a.y, a.x, 0x1076);
|
|
|
|
result.y = __byte_perm(a.y, a.x, 0x5432);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
#define ROR24(u) ROR2(u,24)
|
|
|
|
#define ROR16(u) ROR2(u,16)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
typedef uint64_t u64;
|
|
|
|
|
|
|
|
static __constant__ const int8_t blake2b_sigma[12][16] = {
|
|
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
|
|
|
|
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
|
|
|
|
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
|
|
|
|
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
|
|
|
|
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
|
|
|
|
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
|
|
|
|
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
|
|
|
|
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
|
|
|
|
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
|
|
|
|
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
|
|
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
|
|
|
|
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
|
|
|
};
|
|
|
|
|
|
|
|
__device__ __forceinline__
|
|
|
|
static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
|
|
|
|
a = a + b + m[ blake2b_sigma[r][2*i] ];
|
|
|
|
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
|
|
|
|
c = c + d;
|
|
|
|
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
|
|
|
|
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
|
|
|
|
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
|
|
|
|
c = c + d;
|
|
|
|
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define ROUND(r) \
|
|
|
|
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
|
|
|
|
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
|
|
|
|
G(r, 2, v[2], v[6], v[10], v[14], m); \
|
|
|
|
G(r, 3, v[3], v[7], v[11], v[15], m); \
|
|
|
|
G(r, 4, v[0], v[5], v[10], v[15], m); \
|
|
|
|
G(r, 5, v[1], v[6], v[11], v[12], m); \
|
|
|
|
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
|
|
|
|
G(r, 7, v[3], v[4], v[ 9], v[14], m);
|
|
|
|
|
|
|
|
__device__ void blake2b_gpu_hash(blake2b_state *state, u32 idx, uchar *hash, u32 outlen) {
|
|
|
|
const u32 leb = htole32(idx);
|
|
|
|
memcpy(state->buf + state->buflen, &leb, sizeof(u32));
|
|
|
|
state->buflen += sizeof(u32);
|
|
|
|
state->counter += state->buflen;
|
|
|
|
memset(state->buf + state->buflen, 0, BLAKE2B_BLOCKBYTES - state->buflen);
|
|
|
|
|
|
|
|
u64 *d_data = (u64 *)state->buf;
|
|
|
|
u64 m[16];
|
|
|
|
|
|
|
|
m[0] = d_data[0];
|
|
|
|
m[1] = d_data[1];
|
|
|
|
m[2] = d_data[2];
|
|
|
|
m[3] = d_data[3];
|
|
|
|
m[4] = d_data[4];
|
|
|
|
m[5] = d_data[5];
|
|
|
|
m[6] = d_data[6];
|
|
|
|
m[7] = d_data[7];
|
|
|
|
m[8] = d_data[8];
|
|
|
|
m[9] = d_data[9];
|
|
|
|
m[10] = d_data[10];
|
|
|
|
m[11] = d_data[11];
|
|
|
|
m[12] = d_data[12];
|
|
|
|
m[13] = d_data[13];
|
|
|
|
m[14] = d_data[14];
|
|
|
|
m[15] = d_data[15];
|
|
|
|
|
|
|
|
u64 v[16];
|
|
|
|
|
|
|
|
v[0] = state->h[0];
|
|
|
|
v[1] = state->h[1];
|
|
|
|
v[2] = state->h[2];
|
|
|
|
v[3] = state->h[3];
|
|
|
|
v[4] = state->h[4];
|
|
|
|
v[5] = state->h[5];
|
|
|
|
v[6] = state->h[6];
|
|
|
|
v[7] = state->h[7];
|
|
|
|
v[8] = 0x6a09e667f3bcc908;
|
|
|
|
v[9] = 0xbb67ae8584caa73b;
|
|
|
|
v[10] = 0x3c6ef372fe94f82b;
|
|
|
|
v[11] = 0xa54ff53a5f1d36f1;
|
|
|
|
v[12] = 0x510e527fade682d1 ^ state->counter;
|
|
|
|
v[13] = 0x9b05688c2b3e6c1f;
|
|
|
|
v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
|
|
|
|
v[15] = 0x5be0cd19137e2179;
|
|
|
|
|
|
|
|
ROUND( 0 );
|
|
|
|
ROUND( 1 );
|
|
|
|
ROUND( 2 );
|
|
|
|
ROUND( 3 );
|
|
|
|
ROUND( 4 );
|
|
|
|
ROUND( 5 );
|
|
|
|
ROUND( 6 );
|
|
|
|
ROUND( 7 );
|
|
|
|
ROUND( 8 );
|
|
|
|
ROUND( 9 );
|
|
|
|
ROUND( 10 );
|
|
|
|
ROUND( 11 );
|
|
|
|
|
|
|
|
state->h[0] ^= v[0] ^ v[ 8];
|
|
|
|
state->h[1] ^= v[1] ^ v[ 9];
|
|
|
|
state->h[2] ^= v[2] ^ v[10];
|
|
|
|
state->h[3] ^= v[3] ^ v[11];
|
|
|
|
state->h[4] ^= v[4] ^ v[12];
|
|
|
|
state->h[5] ^= v[5] ^ v[13];
|
|
|
|
state->h[6] ^= v[6] ^ v[14];
|
|
|
|
state->h[7] ^= v[7] ^ v[15];
|
|
|
|
memcpy(hash, (uchar *)state->h, outlen);
|
|
|
|
}
|