// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016

/**
 * uint2 direct ops by c++ operator definitions
 */
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
  return make_uint2(a.x ^ b.x, a.y ^ b.y);
}
// uint2 ROR/ROL methods
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
  uint2 result;
#if __CUDA_ARCH__ > 300
  if (offset < 32) {
          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
  } else /* if (offset < 64) */ {
          /* offset SHOULD BE < 64 ! */
          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
  }
#else
  if (!offset)
          result = a;
  else if (offset < 32) {
          result.y = ((a.y >> offset) | (a.x << (32 - offset)));
          result.x = ((a.x >> offset) | (a.y << (32 - offset)));
  } else if (offset == 32) {
          result.y = a.x;
          result.x = a.y;
  } else {
          result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
          result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
  }
#endif
  return result;
}
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
  return make_uint2(value.y, value.x);
}
#ifdef __CUDA_ARCH__
__device__ __inline__ uint2 ROR24(const uint2 a) {
  uint2 result;
  result.x = __byte_perm(a.y, a.x, 0x2107);
  result.y = __byte_perm(a.y, a.x, 0x6543);
  return result;
}
__device__ __inline__ uint2 ROR16(const uint2 a) {
  uint2 result;
  result.x = __byte_perm(a.y, a.x, 0x1076);
  result.y = __byte_perm(a.y, a.x, 0x5432);
  return result;
}
#else
#define ROR24(u) ROR2(u,24)
#define ROR16(u) ROR2(u,16)
#endif

typedef uint64_t u64;

static __constant__ const int8_t blake2b_sigma[12][16] = {
  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } ,
  { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } ,
  { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } ,
  { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } ,
  { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } ,
  { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } ,
  { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } ,
  { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } ,
  { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } ,
  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  }
};

__device__ __forceinline__
static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
  a = a + b + m[ blake2b_sigma[r][2*i] ];
  ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
  c = c + d;
  ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
  a = a + b + m[ blake2b_sigma[r][2*i+1] ];
  ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
  c = c + d;
  ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}

#define ROUND(r) \
  G(r, 0, v[0], v[4], v[ 8], v[12], m); \
  G(r, 1, v[1], v[5], v[ 9], v[13], m); \
  G(r, 2, v[2], v[6], v[10], v[14], m); \
  G(r, 3, v[3], v[7], v[11], v[15], m); \
  G(r, 4, v[0], v[5], v[10], v[15], m); \
  G(r, 5, v[1], v[6], v[11], v[12], m); \
  G(r, 6, v[2], v[7], v[ 8], v[13], m); \
  G(r, 7, v[3], v[4], v[ 9], v[14], m);

__device__ void blake2b_gpu_hash(blake2b_state *state, u32 idx, uchar *hash, u32 outlen) {
  const u32 leb = idx; // CUDA is little endian, so no need for htole32(idx)
  memcpy(state->buf + state->buflen, &leb, sizeof(u32));
  state->buflen += sizeof(u32);
  state->counter += state->buflen;
  memset(state->buf + state->buflen, 0, BLAKE2B_BLOCKBYTES - state->buflen);

  u64 *d_data = (u64 *)state->buf;
  u64 m[16];

  m[0] = d_data[0];
  m[1] = d_data[1];
  m[2] = d_data[2];
  m[3] = d_data[3];
  m[4] = d_data[4];
  m[5] = d_data[5];
  m[6] = d_data[6];
  m[7] = d_data[7];
  m[8] = d_data[8];
  m[9] = d_data[9];
  m[10] = d_data[10];
  m[11] = d_data[11];
  m[12] = d_data[12];
  m[13] = d_data[13];
  m[14] = d_data[14];
  m[15] = d_data[15];

  u64 v[16];

  v[0] = state->h[0];
  v[1] = state->h[1];
  v[2] = state->h[2];
  v[3] = state->h[3];
  v[4] = state->h[4];
  v[5] = state->h[5];
  v[6] = state->h[6];
  v[7] = state->h[7];
  v[8] = 0x6a09e667f3bcc908;
  v[9] = 0xbb67ae8584caa73b;
  v[10] =  0x3c6ef372fe94f82b;
  v[11] = 0xa54ff53a5f1d36f1;
  v[12] = 0x510e527fade682d1 ^ state->counter;
  v[13] = 0x9b05688c2b3e6c1f;
  v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
  v[15] = 0x5be0cd19137e2179;

  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );
  
  state->h[0] ^= v[0] ^ v[ 8];
  state->h[1] ^= v[1] ^ v[ 9];
  state->h[2] ^= v[2] ^ v[10];
  state->h[3] ^= v[3] ^ v[11];
  state->h[4] ^= v[4] ^ v[12];
  state->h[5] ^= v[5] ^ v[13];
  state->h[6] ^= v[6] ^ v[14];
  state->h[7] ^= v[7] ^ v[15];
  memcpy(hash, (uchar *)state->h, outlen);
}