drop OpenCL solvers

This commit is contained in:
Stanko Krstić 2017-01-12 10:52:17 +01:00
parent 2a6eb0516c
commit 340064b9c6
43 changed files with 2 additions and 21826 deletions

Binary file not shown.

View File

@ -1,150 +0,0 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
/**
* uint2 direct ops by c++ operator definitions
*/
// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
// return make_uint2(a.x ^ b.x, a.y ^ b.y);
// }
// uint2 ROR/ROL methods
// Rotate the 64-bit value held in a uint2 right by `offset` bits.
// The .x lane acts as the less-significant half and .y as the more-significant
// half. Offsets of 0 and 32 are answered without shifting, so a 32-bit lane is
// never shifted by 0-complement (32) in those cases.
uint2 ROR2(const uint2 a, const int offset) {
  uint2 src = a;
  int shift = offset;
  if (!(offset < 32)) {
    // A rotation by 32 or more starts from the two halves exchanged.
    src.x = a.y;
    src.y = a.x;
    shift = offset - 32;
  }
  if (shift == 0)
    return src;
  uint2 rotated;
  rotated.y = ((src.y >> shift) | (src.x << (32 - shift)));
  rotated.x = ((src.x >> shift) | (src.y << (32 - shift)));
  return rotated;
}
// Return `value` with its .x and .y lanes exchanged.
uint2 SWAPUINT2(uint2 value) {
  uint2 swapped = value;
  swapped.x = value.y;
  swapped.y = value.x;
  return swapped;
}
// Fixed-distance right rotations, expressed through ROR2.
#define ROR16(u) ROR2((u), 16)
#define ROR24(u) ROR2((u), 24)
// BLAKE2b message-word permutation table: 12 rows (one per round) of 16
// indices into the message array. G() reads entries [r][2*i] and [r][2*i+1]
// to pick the two message words it mixes in. Rows 10 and 11 repeat rows 0
// and 1.
__constant int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// One mixing step on the four 64-bit words *a, *b, *c, *d.
//   r, i : round number and step number; they select the two message words
//          m[blake2b_sigma[r][2*i]] and m[blake2b_sigma[r][2*i+1]].
//   m    : the 16 message words of the block being compressed.
// The right rotations by 32, 24, 16 and 63 bits are carried out on uint2
// views of the 64-bit words via SWAPUINT2 / ROR24 / ROR16 / ROR2.
void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) {
  uint2 *b2 = (uint2*)b;
  uint2 *d2 = (uint2*)d;
  const int32_t even = 2*i;
  const int32_t odd = 2*i+1;

  // First half: mix in the even-indexed message word, rotate by 32 then 24.
  *a += *b + m[ blake2b_sigma[r][even] ];
  *d2 = SWAPUINT2( *d2 ^ *(uint2*)a );
  *c += *d;
  *b2 = ROR24( *b2 ^ *(uint2*)c );

  // Second half: mix in the odd-indexed message word, rotate by 16 then 63.
  *a += *b + m[ blake2b_sigma[r][odd] ];
  *d2 = ROR16( *d2 ^ *(uint2*)a );
  *c += *d;
  *b2 = ROR2( *b2 ^ *(uint2*)c, 63U);
}
// Apply round `r` to the caller's local work vector `v` using the caller's
// message words `m` (both names are taken from the expansion site): four G
// steps over the columns of v followed by four over its diagonals.
// The body is wrapped in do { ... } while (0) so the macro expands to exactly
// one statement; use it as `ROUND(r);`. Without the wrapper, an unbraced
// `if`/`for` would control only the first G call.
#define ROUND(r) \
  do { \
    G((r), 0, &v[0], &v[4], &v[ 8], &v[12], m); \
    G((r), 1, &v[1], &v[5], &v[ 9], &v[13], m); \
    G((r), 2, &v[2], &v[6], &v[10], &v[14], m); \
    G((r), 3, &v[3], &v[7], &v[11], &v[15], m); \
    G((r), 4, &v[0], &v[5], &v[10], &v[15], m); \
    G((r), 5, &v[1], &v[6], &v[11], &v[12], m); \
    G((r), 6, &v[2], &v[7], &v[ 8], &v[13], m); \
    G((r), 7, &v[3], &v[4], &v[ 9], &v[14], m); \
  } while (0)
// Complete a BLAKE2b hash from a partially filled state and write the digest.
//   state  : hash state; modified in place (buf, buflen, counter and h all
//            change), so the caller needs a fresh copy for every call.
//   idx    : 32-bit value appended to the buffered input before hashing.
//   hash   : receives the first `outlen` bytes of the resulting state->h.
//   outlen : number of digest bytes to copy out.
// The buffered block is compressed as the last block: v[14] is XORed with
// all-ones, and only one 128-byte block is processed here.
void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) {
  // Append idx to the buffered bytes and account for the new length.
  const uint32_t leb = idx;
  *(uint32_t*)(state->buf + state->buflen) = leb;
  state->buflen += 4;
  state->counter += state->buflen;

  // Zero-fill the rest of the block.
  for (unsigned k = 0; k < BLAKE2B_BLOCKBYTES - state->buflen; k++)
    state->buf[k+state->buflen] = 0;

  // Message words: the block viewed as sixteen 64-bit values.
  uint64_t *words = (uint64_t *)state->buf;
  uint64_t m[16];
  for (int w = 0; w < 16; w++)
    m[w] = words[w];

  // Work vector: chaining value in the lower half, constants in the upper
  // half with the byte counter and the last-block inversion folded in.
  uint64_t v[16];
  for (int w = 0; w < 8; w++)
    v[w] = state->h[w];
  v[8] = 0x6a09e667f3bcc908;
  v[9] = 0xbb67ae8584caa73b;
  v[10] = 0x3c6ef372fe94f82b;
  v[11] = 0xa54ff53a5f1d36f1;
  v[12] = 0x510e527fade682d1 ^ state->counter;
  v[13] = 0x9b05688c2b3e6c1f;
  v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
  v[15] = 0x5be0cd19137e2179;

  // Twelve rounds, kept as explicit expansions with constant round numbers.
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );

  // Fold both halves of the work vector back into the chaining value.
  for (int w = 0; w < 8; w++)
    state->h[w] ^= v[w] ^ v[w + 8];

  // Emit the leading outlen bytes of the chaining value.
  for (unsigned k = 0; k < outlen; k++)
    hash[k] = ((uint8_t*)state->h)[k];
}

View File

@ -1,156 +0,0 @@
// Host/device split. When __OPENCL_HOST__ is defined, __global is defined
// away to nothing and the blake2b declarations come from ../blake2.h.
// Otherwise the fixed-width integer names, ALIGN, the blake2b size constants
// and the blake2b state are declared here.
#if defined(__OPENCL_HOST__)
#define __global
#include "../blake2.h"
#else
// Fixed-width integer names mapped onto the OpenCL C scalar types.
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
typedef long int64_t;
typedef ulong uint64_t;
// ALIGN(x): compiler-specific spelling of "align to x bytes".
#if defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__ ((__aligned__(x)))
#endif
// BLAKE2b size constants, all in bytes.
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
// BLAKE2b state, declared with 1-byte packing and a 64-byte alignment request:
// eight 64-bit chaining words, one block-sized input buffer, a 16-bit counter,
// and 8-bit buflen / lastblock fields.
// NOTE(review): the layout presumably has to match the host-side definition
// in ../blake2.h, which is not visible here -- confirm before changing fields.
#pragma pack(push, 1)
ALIGN( 64 ) typedef struct __blake2b_state {
uint64_t h[8];
uint8_t buf[BLAKE2B_BLOCKBYTES];
uint16_t counter;
uint8_t buflen;
uint8_t lastblock;
} blake2b_state;
#pragma pack(pop)
#endif
// Derived solver parameters. WN, WK and RESTBITS are not defined in this
// section; they must be defined before any of these macros is expanded.
// WN split evenly over WK+1 parts, in bits
#define COLLISION_BIT_LENGTH (WN / (WK+1))
// COLLISION_BIT_LENGTH rounded up to whole bytes
#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
// two collision byte lengths plus one uint32_t for each of 2^WK entries
#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
// number of digits the WN-bit hash is split into
#define NDIGITS (WK+1)
// bits per digit
#define DIGITBITS (WN/(NDIGITS))
// number of 32-bit entries in a proof (2^WK)
#define PROOFSIZE (1u<<WK)
// (COLLISION_BIT_LENGTH+1) bits per proof entry, expressed in bytes
// (the *4 / sizeof(uint32_t) factors cancel for a 4-byte uint32_t)
#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
// number of distinct digit values (2^DIGITBITS)
#define BASE (1u<<DIGITBITS)
// twice BASE
#define NHASHES (2u*BASE)
// number of whole WN-bit hashes that fit in 512 bits
#define HASHESPERBLAKE (512/WN)
// bytes occupied by HASHESPERBLAKE hashes of WN bits each
#define HASHOUT (HASHESPERBLAKE*WN/8)
// 2_log of number of buckets
#define BUCKBITS (DIGITBITS-RESTBITS)
// number of buckets
#define NBUCKETS (1<<BUCKBITS)
// 2_log of number of slots per bucket
#define SLOTBITS (RESTBITS+1+1)
// number of slots per bucket
#define NSLOTS (1u<<SLOTBITS)
// number of per-xhash slots
#define XFULL 16
// SLOTBITS mask
#define SLOTMASK (NSLOTS-1)
// number of possible values of xhash (rest of n) bits
#define NRESTS (1u<<RESTBITS)
// number of blocks of hashes extracted from single 512 bit blake2b output
#define NBLOCKS ((NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE)
// nothing larger found in 100000 runs
#define MAXSOLS 8
// Number of 32-bit words needed to hold `bits` bits, rounded up.
// The argument is parenthesized so that arguments built from operators with
// lower precedence than `+` (for example `a | b` or `c ? x : y`) are rounded
// as a whole instead of having 31 bound to their last operand.
#define WORDS(bits) (((bits) + 31) / 32)
// Hash words stored per slot0 entry: WN bits minus one digit, plus RESTBITS.
#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
// Hash words stored per slot1 entry: WN bits minus two digits, plus RESTBITS.
#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
// A proof is an array of PROOFSIZE 32-bit values.
typedef uint32_t proof[PROOFSIZE];
// tree = | xhash(RESTBITS) | slotid1(SLOTBITS) | slotid0(SLOTBITS) | bucketid(BUCKBITS) |
// index = | bucketid(BUCKBITS) | slotid0(SLOTBITS) |
typedef uint32_t tree;
// One 32-bit unit of hash data, accessible as a whole word or as four bytes.
typedef union hashunit {
uint32_t word;
uint8_t bytes[4];
} hashunit;
// Slot variant holding a tree attribute followed by HASHWORDS0 hash units.
typedef struct slot0 {
tree attr;
hashunit hash[HASHWORDS0];
} slot0;
// Slot variant holding a tree attribute followed by HASHWORDS1 hash units.
typedef struct slot1 {
tree attr;
hashunit hash[HASHWORDS1];
} slot1;
// a bucket is NSLOTS treenodes
typedef slot0 bucket0[NSLOTS];
typedef slot1 bucket1[NSLOTS];
// the N-bit hash consists of K+1 n-bit "digits"
// each of which corresponds to a layer of NBUCKETS buckets
typedef bucket0 digit0[NBUCKETS];
typedef bucket1 digit1[NBUCKETS];
// manages hash and tree data
// trees0 holds (WK+1)/2 pointers to bucket0 arrays and trees1 holds WK/2
// pointers to bucket1 arrays; both point into __global memory on the device
// (__global expands to nothing in host builds).
typedef struct htalloc {
__global bucket0 *trees0[(WK+1)/2];
__global bucket1 *trees1[WK/2];
} htalloc;
// One 32-bit value per bucket.
typedef uint32_t bsizes[NBUCKETS];
// An htalloc plus five 32-bit layout values.
// NOTE(review): judging by the names, prev*/next* describe the hash-unit
// counts and byte offsets of the previous and next round, and dunits their
// difference -- confirm against the code that fills this struct in.
typedef struct htlayout {
htalloc hta;
uint32_t prevhashunits;
uint32_t nexthashunits;
uint32_t dunits;
uint32_t prevbo;
uint32_t nextbo;
} htlayout;
// xslot is 8 bits wide when RESTBITS <= 6 and 16 bits wide otherwise.
// NOTE(review): presumably sized to hold a slot number (SLOTBITS is
// RESTBITS+2, i.e. at most 8 bits when RESTBITS <= 6) -- confirm.
#if RESTBITS <= 6
typedef uint8_t xslot;
#else
typedef uint16_t xslot;
#endif
// Per-bucket collision bookkeeping, in one of two layouts:
//  - XBITMAP defined: one 64-bit map per possible xhash value plus a single
//    working map; rejected at compile time when NSLOTS exceeds 64.
//  - otherwise: a count per xhash value, a table of up to XFULL slot entries
//    per xhash value, a cursor pointer, and two 32-bit counters.
// s0 is present in both layouts.
typedef struct collisiondata {
#ifdef XBITMAP
#if NSLOTS > 64
#error cant use XBITMAP with more than 64 slots
#endif
uint64_t xhashmap[NRESTS];
uint64_t xmap;
#else
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
uint32_t n0;
uint32_t n1;
#endif
uint32_t s0;
} collisiondata;
// Top-level solver state: a blake2b state, the bucket-array pointers (hta),
// a pointer to per-bucket 32-bit values (nslots), a pointer to proof storage
// (sols), and two 32-bit counters.
// NOTE(review): nsols / nthreads presumably hold the number of solutions
// found and the number of worker threads -- confirm against the users.
typedef struct equi {
blake2b_state blake_ctx;
htalloc hta;
__global bsizes *nslots;
__global proof *sols;
uint32_t nsols;
uint32_t nthreads;
} equi;

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,150 +0,0 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
/**
* uint2 direct ops by c++ operator definitions
*/
// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
// return make_uint2(a.x ^ b.x, a.y ^ b.y);
// }
// uint2 ROR/ROL methods
// Rotate the 64-bit value held in a uint2 right by `offset` bits.
// The .x lane acts as the less-significant half and .y as the more-significant
// half. Offsets of 0 and 32 are answered without shifting, so a 32-bit lane is
// never shifted by 0-complement (32) in those cases.
uint2 ROR2(const uint2 a, const int offset) {
  uint2 src = a;
  int shift = offset;
  if (!(offset < 32)) {
    // A rotation by 32 or more starts from the two halves exchanged.
    src.x = a.y;
    src.y = a.x;
    shift = offset - 32;
  }
  if (shift == 0)
    return src;
  uint2 rotated;
  rotated.y = ((src.y >> shift) | (src.x << (32 - shift)));
  rotated.x = ((src.x >> shift) | (src.y << (32 - shift)));
  return rotated;
}
// Return `value` with its .x and .y lanes exchanged.
uint2 SWAPUINT2(uint2 value) {
  uint2 swapped = value;
  swapped.x = value.y;
  swapped.y = value.x;
  return swapped;
}
// Fixed-distance right rotations, expressed through ROR2.
#define ROR16(u) ROR2((u), 16)
#define ROR24(u) ROR2((u), 24)
// BLAKE2b message-word permutation table: 12 rows (one per round) of 16
// indices into the message array. G() reads entries [r][2*i] and [r][2*i+1]
// to pick the two message words it mixes in. Rows 10 and 11 repeat rows 0
// and 1.
__constant int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// One mixing step on the four 64-bit words *a, *b, *c, *d.
//   r, i : round number and step number; they select the two message words
//          m[blake2b_sigma[r][2*i]] and m[blake2b_sigma[r][2*i+1]].
//   m    : the 16 message words of the block being compressed.
// The right rotations by 32, 24, 16 and 63 bits are carried out on uint2
// views of the 64-bit words via SWAPUINT2 / ROR24 / ROR16 / ROR2.
void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) {
  uint2 *b2 = (uint2*)b;
  uint2 *d2 = (uint2*)d;
  const int32_t even = 2*i;
  const int32_t odd = 2*i+1;

  // First half: mix in the even-indexed message word, rotate by 32 then 24.
  *a += *b + m[ blake2b_sigma[r][even] ];
  *d2 = SWAPUINT2( *d2 ^ *(uint2*)a );
  *c += *d;
  *b2 = ROR24( *b2 ^ *(uint2*)c );

  // Second half: mix in the odd-indexed message word, rotate by 16 then 63.
  *a += *b + m[ blake2b_sigma[r][odd] ];
  *d2 = ROR16( *d2 ^ *(uint2*)a );
  *c += *d;
  *b2 = ROR2( *b2 ^ *(uint2*)c, 63U);
}
// Apply round `r` to the caller's local work vector `v` using the caller's
// message words `m` (both names are taken from the expansion site): four G
// steps over the columns of v followed by four over its diagonals.
// The body is wrapped in do { ... } while (0) so the macro expands to exactly
// one statement; use it as `ROUND(r);`. Without the wrapper, an unbraced
// `if`/`for` would control only the first G call.
#define ROUND(r) \
  do { \
    G((r), 0, &v[0], &v[4], &v[ 8], &v[12], m); \
    G((r), 1, &v[1], &v[5], &v[ 9], &v[13], m); \
    G((r), 2, &v[2], &v[6], &v[10], &v[14], m); \
    G((r), 3, &v[3], &v[7], &v[11], &v[15], m); \
    G((r), 4, &v[0], &v[5], &v[10], &v[15], m); \
    G((r), 5, &v[1], &v[6], &v[11], &v[12], m); \
    G((r), 6, &v[2], &v[7], &v[ 8], &v[13], m); \
    G((r), 7, &v[3], &v[4], &v[ 9], &v[14], m); \
  } while (0)
// Complete a BLAKE2b hash from a partially filled state and write the digest.
//   state  : hash state; modified in place (buf, buflen, counter and h all
//            change), so the caller needs a fresh copy for every call.
//   idx    : 32-bit value appended to the buffered input before hashing.
//   hash   : receives the first `outlen` bytes of the resulting state->h.
//   outlen : number of digest bytes to copy out.
// The buffered block is compressed as the last block: v[14] is XORed with
// all-ones, and only one 128-byte block is processed here.
void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) {
  // Append idx to the buffered bytes and account for the new length.
  const uint32_t leb = idx;
  *(uint32_t*)(state->buf + state->buflen) = leb;
  state->buflen += 4;
  state->counter += state->buflen;

  // Zero-fill the rest of the block.
  for (unsigned k = 0; k < BLAKE2B_BLOCKBYTES - state->buflen; k++)
    state->buf[k+state->buflen] = 0;

  // Message words: the block viewed as sixteen 64-bit values.
  uint64_t *words = (uint64_t *)state->buf;
  uint64_t m[16];
  for (int w = 0; w < 16; w++)
    m[w] = words[w];

  // Work vector: chaining value in the lower half, constants in the upper
  // half with the byte counter and the last-block inversion folded in.
  uint64_t v[16];
  for (int w = 0; w < 8; w++)
    v[w] = state->h[w];
  v[8] = 0x6a09e667f3bcc908;
  v[9] = 0xbb67ae8584caa73b;
  v[10] = 0x3c6ef372fe94f82b;
  v[11] = 0xa54ff53a5f1d36f1;
  v[12] = 0x510e527fade682d1 ^ state->counter;
  v[13] = 0x9b05688c2b3e6c1f;
  v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
  v[15] = 0x5be0cd19137e2179;

  // Twelve rounds, kept as explicit expansions with constant round numbers.
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );

  // Fold both halves of the work vector back into the chaining value.
  for (int w = 0; w < 8; w++)
    state->h[w] ^= v[w] ^ v[w + 8];

  // Emit the leading outlen bytes of the chaining value.
  for (unsigned k = 0; k < outlen; k++)
    hash[k] = ((uint8_t*)state->h)[k];
}

View File

@ -1,156 +0,0 @@
// Host/device split. When __OPENCL_HOST__ is defined, __global is defined
// away to nothing and the blake2b declarations come from ../blake2.h.
// Otherwise the fixed-width integer names, ALIGN, the blake2b size constants
// and the blake2b state are declared here.
#if defined(__OPENCL_HOST__)
#define __global
#include "../blake2.h"
#else
// Fixed-width integer names mapped onto the OpenCL C scalar types.
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
typedef long int64_t;
typedef ulong uint64_t;
// ALIGN(x): compiler-specific spelling of "align to x bytes".
#if defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__ ((__aligned__(x)))
#endif
// BLAKE2b size constants, all in bytes.
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
// BLAKE2b state, declared with 1-byte packing and a 64-byte alignment request:
// eight 64-bit chaining words, one block-sized input buffer, a 16-bit counter,
// and 8-bit buflen / lastblock fields.
// NOTE(review): the layout presumably has to match the host-side definition
// in ../blake2.h, which is not visible here -- confirm before changing fields.
#pragma pack(push, 1)
ALIGN( 64 ) typedef struct __blake2b_state {
uint64_t h[8];
uint8_t buf[BLAKE2B_BLOCKBYTES];
uint16_t counter;
uint8_t buflen;
uint8_t lastblock;
} blake2b_state;
#pragma pack(pop)
#endif
// Derived solver parameters. WN, WK and RESTBITS are not defined in this
// section; they must be defined before any of these macros is expanded.
// WN split evenly over WK+1 parts, in bits
#define COLLISION_BIT_LENGTH (WN / (WK+1))
// COLLISION_BIT_LENGTH rounded up to whole bytes
#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
// two collision byte lengths plus one uint32_t for each of 2^WK entries
#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
// number of digits the WN-bit hash is split into
#define NDIGITS (WK+1)
// bits per digit
#define DIGITBITS (WN/(NDIGITS))
// number of 32-bit entries in a proof (2^WK)
#define PROOFSIZE (1u<<WK)
// (COLLISION_BIT_LENGTH+1) bits per proof entry, expressed in bytes
// (the *4 / sizeof(uint32_t) factors cancel for a 4-byte uint32_t)
#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
// number of distinct digit values (2^DIGITBITS)
#define BASE (1u<<DIGITBITS)
// twice BASE
#define NHASHES (2u*BASE)
// number of whole WN-bit hashes that fit in 512 bits
#define HASHESPERBLAKE (512/WN)
// bytes occupied by HASHESPERBLAKE hashes of WN bits each
#define HASHOUT (HASHESPERBLAKE*WN/8)
// 2_log of number of buckets
#define BUCKBITS (DIGITBITS-RESTBITS)
// number of buckets
#define NBUCKETS (1<<BUCKBITS)
// 2_log of number of slots per bucket
#define SLOTBITS (RESTBITS+1+1)
// number of slots per bucket
#define NSLOTS (1u<<SLOTBITS)
// number of per-xhash slots
#define XFULL 16
// SLOTBITS mask
#define SLOTMASK (NSLOTS-1)
// number of possible values of xhash (rest of n) bits
#define NRESTS (1u<<RESTBITS)
// number of blocks of hashes extracted from single 512 bit blake2b output
#define NBLOCKS ((NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE)
// nothing larger found in 100000 runs
#define MAXSOLS 8
// Number of 32-bit words needed to hold `bits` bits, rounded up.
// The argument is parenthesized so that arguments built from operators with
// lower precedence than `+` (for example `a | b` or `c ? x : y`) are rounded
// as a whole instead of having 31 bound to their last operand.
#define WORDS(bits) (((bits) + 31) / 32)
// Hash words stored per slot0 entry: WN bits minus one digit, plus RESTBITS.
#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
// Hash words stored per slot1 entry: WN bits minus two digits, plus RESTBITS.
#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
// A proof is an array of PROOFSIZE 32-bit values.
typedef uint32_t proof[PROOFSIZE];
// tree = | xhash(RESTBITS) | slotid1(SLOTBITS) | slotid0(SLOTBITS) | bucketid(BUCKBITS) |
// index = | bucketid(BUCKBITS) | slotid0(SLOTBITS) |
typedef uint32_t tree;
// One 32-bit unit of hash data, accessible as a whole word or as four bytes.
typedef union hashunit {
uint32_t word;
uint8_t bytes[4];
} hashunit;
// Slot variant holding a tree attribute followed by HASHWORDS0 hash units.
typedef struct slot0 {
tree attr;
hashunit hash[HASHWORDS0];
} slot0;
// Slot variant holding a tree attribute followed by HASHWORDS1 hash units.
typedef struct slot1 {
tree attr;
hashunit hash[HASHWORDS1];
} slot1;
// a bucket is NSLOTS treenodes
typedef slot0 bucket0[NSLOTS];
typedef slot1 bucket1[NSLOTS];
// the N-bit hash consists of K+1 n-bit "digits"
// each of which corresponds to a layer of NBUCKETS buckets
typedef bucket0 digit0[NBUCKETS];
typedef bucket1 digit1[NBUCKETS];
// manages hash and tree data
// trees0 holds (WK+1)/2 pointers to bucket0 arrays and trees1 holds WK/2
// pointers to bucket1 arrays; both point into __global memory on the device
// (__global expands to nothing in host builds).
typedef struct htalloc {
__global bucket0 *trees0[(WK+1)/2];
__global bucket1 *trees1[WK/2];
} htalloc;
// One 32-bit value per bucket.
typedef uint32_t bsizes[NBUCKETS];
// An htalloc plus five 32-bit layout values.
// NOTE(review): judging by the names, prev*/next* describe the hash-unit
// counts and byte offsets of the previous and next round, and dunits their
// difference -- confirm against the code that fills this struct in.
typedef struct htlayout {
htalloc hta;
uint32_t prevhashunits;
uint32_t nexthashunits;
uint32_t dunits;
uint32_t prevbo;
uint32_t nextbo;
} htlayout;
// xslot is 8 bits wide when RESTBITS <= 6 and 16 bits wide otherwise.
// NOTE(review): presumably sized to hold a slot number (SLOTBITS is
// RESTBITS+2, i.e. at most 8 bits when RESTBITS <= 6) -- confirm.
#if RESTBITS <= 6
typedef uint8_t xslot;
#else
typedef uint16_t xslot;
#endif
// Per-bucket collision bookkeeping, in one of two layouts:
//  - XBITMAP defined: one 64-bit map per possible xhash value plus a single
//    working map; rejected at compile time when NSLOTS exceeds 64.
//  - otherwise: a count per xhash value, a table of up to XFULL slot entries
//    per xhash value, a cursor pointer, and two 32-bit counters.
// s0 is present in both layouts.
typedef struct collisiondata {
#ifdef XBITMAP
#if NSLOTS > 64
#error cant use XBITMAP with more than 64 slots
#endif
uint64_t xhashmap[NRESTS];
uint64_t xmap;
#else
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
uint32_t n0;
uint32_t n1;
#endif
uint32_t s0;
} collisiondata;
// Top-level solver state: a blake2b state, the bucket-array pointers (hta),
// a pointer to per-bucket 32-bit values (nslots), a pointer to proof storage
// (sols), and two 32-bit counters.
// NOTE(review): nsols / nthreads presumably hold the number of solutions
// found and the number of worker threads -- confirm against the users.
typedef struct equi {
blake2b_state blake_ctx;
htalloc hta;
__global bsizes *nslots;
__global proof *sols;
uint32_t nsols;
uint32_t nthreads;
} equi;

File diff suppressed because it is too large Load Diff

View File

@ -1,555 +0,0 @@
# 1 "input.cl"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/usr/include/stdc-predef.h" 1 3 4
# 1 "<command-line>" 2
# 1 "input.cl"
# 1 "param.h" 1
# 60 "param.h"
// Solution buffer: two counters followed by room for 2000 entries, each with
// a one-byte `valid` flag and (1 << 9) = 512 uint values.
// NOTE(review): `nr` presumably counts entries written and `likely_invalids`
// counts candidates rejected early -- confirm against the kernels that fill
// this struct.
typedef struct sols_s
{
uint nr;
uint likely_invalids;
uchar valid[2000];
uint values[2000][(1 << 9)];
} sols_t;
# 2 "input.cl" 2
# 36 "input.cl"
// Eight 64-bit BLAKE2b initialization constants; kernel_round0 loads them
// into v[8]..v[15] of its work vector.
__constant ulong blake_iv[] =
{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};
// Clear the 32-bit word at the start of one hash-table row. Each work-item
// handles the row whose number equals its global id; a row spans
// slots_per_row slots of 32 bytes.
__kernel
void kernel_init_ht(__global char *ht)
{
    const uint slots_per_row = ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9);
    const uint row = get_global_id(0);
    __global uint *row_counter = (__global uint *)(ht + row * slots_per_row * 32);
    *row_counter = 0;
}
# 80 "input.cl"
// Store one entry in the hash table `ht`.
//   round : round number; selects how the row is derived from xi0 and how
//           many bytes of the shifted xi words are written.
//   i     : 32-bit value written in the 4 bytes just before the entry data.
//   xi0-3 : 64-bit input words. The row number is taken from the low 24 bits
//           of xi0, then xi0..xi2 are shifted right by 16 bits (pulling bits
//           in from the next word) before being stored.
// The 32-bit word at the start of a row is incremented atomically and used as
// the slot number; it is incremented even when the row is already full.
// Returns 1 when the row is full (nothing is written), 0 otherwise.
uint ht_store(uint round, __global char *ht, uint i,
ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
    const uint slots_per_row = ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9);
    uint row;
# 111 "input.cl"
    if (round % 2)
        row = ((xi0 & 0xf0000) >> 0) |
            ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
            ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
    else
        row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);

    // Drop the low 16 bits of the 256-bit value xi3:xi2:xi1:xi0.
    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));

    // Claim a slot in the row; give up if the row has no room left.
    __global char *row_base = ht + row * slots_per_row * 32;
    const uint cnt = atomic_inc((__global uint *)row_base);
    if (cnt >= slots_per_row)
        return 1;

    // Entry data starts 8 bytes into the 32-byte slot, plus 4 more bytes for
    // every two rounds; `i` sits immediately before it.
    __global char *slot = row_base + (cnt * 32 + (8 + ((round) / 2) * 4));
    *(__global uint *)(slot - 4) = i;

    switch (round)
    {
    case 0:
    case 1:
        *(__global ulong *)(slot + 0) = xi0;
        *(__global ulong *)(slot + 8) = xi1;
        *(__global ulong *)(slot + 16) = xi2;
        break;
    case 2:
        *(__global ulong *)(slot + 0) = xi0;
        *(__global ulong *)(slot + 8) = xi1;
        *(__global uint *)(slot + 16) = xi2;
        break;
    case 3:
    case 4:
        *(__global ulong *)(slot + 0) = xi0;
        *(__global ulong *)(slot + 8) = xi1;
        break;
    case 5:
        *(__global ulong *)(slot + 0) = xi0;
        *(__global uint *)(slot + 8) = xi1;
        break;
    case 6:
    case 7:
        *(__global ulong *)(slot + 0) = xi0;
        break;
    case 8:
        *(__global uint *)(slot + 0) = xi0;
        break;
    default:
        // Later rounds store only `i`.
        break;
    }
    return 0;
}
# 188 "input.cl"
/*
 * One BLAKE2b G mixing step applied to the working vector v[] of the
 * enclosing function. (a, b, c, d) are indices into v[]; x and y are the two
 * message words added in the first and second half of the step.
 */
#define BLAKE_G(a, b, c, d, x, y) \
    do { \
        v[a] = (v[a] + v[b] + (x)); \
        v[d] = rotate((v[d] ^ v[a]), (ulong)64 - 32); \
        v[c] = (v[c] + v[d]); \
        v[b] = rotate((v[b] ^ v[c]), (ulong)64 - 24); \
        v[a] = (v[a] + v[b] + (y)); \
        v[d] = rotate((v[d] ^ v[a]), (ulong)64 - 16); \
        v[c] = (v[c] + v[d]); \
        v[b] = rotate((v[b] ^ v[c]), (ulong)64 - 63); \
    } while (0)

/*
 * Round 0: hash every input index with BLAKE2b and store both halves of each
 * hash in the hash table.
 *
 * The 2^20 inputs are split evenly over the work-items. For each input the
 * 12 compression rounds are run on top of the caller-provided mid-state
 * "blake_state"; the only non-zero message word is word1 = input << 32.
 * The first 50 bytes of the result are split into two 25-byte halves that
 * are stored with references input * 2 and input * 2 + 1.
 *
 *   blake_state  eight 64-bit words of BLAKE2b chaining state
 *   ht           destination hash table
 *   debug        not used by this kernel
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
        __global uint *debug)
{
    uint tid = get_global_id(0);
    ulong v[16];
    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
    uint input_end = (tid + 1) * inputs_per_thread;
    uint input;
    /* Count of entries rejected by ht_store(); not reported anywhere here. */
    uint dropped = 0;

    for (input = tid * inputs_per_thread; input < input_end; input++)
    {
        ulong word1 = (ulong)input << 32;
        ulong h[7];

        /* Working vector: chaining state followed by the IV. */
        v[0] = blake_state[0];
        v[1] = blake_state[1];
        v[2] = blake_state[2];
        v[3] = blake_state[3];
        v[4] = blake_state[4];
        v[5] = blake_state[5];
        v[6] = blake_state[6];
        v[7] = blake_state[7];
        v[8] = blake_iv[0];
        v[9] = blake_iv[1];
        v[10] = blake_iv[2];
        v[11] = blake_iv[3];
        v[12] = blake_iv[4];
        v[13] = blake_iv[5];
        v[14] = blake_iv[6];
        v[15] = blake_iv[7];
        /* Byte counter (140 + 4) and the all-ones final-block marker. */
        v[12] ^= 140 + 4 ;
        v[14] ^= -1;

        /*
         * Twelve rounds of four column steps followed by four diagonal
         * steps. Every message word except word1 is zero, so each round
         * passes word1 in exactly one argument slot.
         */

        /* round 0 */
        BLAKE_G(0, 4, 8, 12, 0, word1);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 1 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, word1, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 2 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, word1);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 3 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, word1);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 4 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, word1);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 5 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, word1, 0);

        /* round 6 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, word1, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 7 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, word1);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 8 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, word1, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 9 */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, word1, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 10 (same schedule as round 0) */
        BLAKE_G(0, 4, 8, 12, 0, word1);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, 0, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* round 11 (same schedule as round 1) */
        BLAKE_G(0, 4, 8, 12, 0, 0);
        BLAKE_G(1, 5, 9, 13, 0, 0);
        BLAKE_G(2, 6, 10, 14, 0, 0);
        BLAKE_G(3, 7, 11, 15, 0, 0);
        BLAKE_G(0, 5, 10, 15, word1, 0);
        BLAKE_G(1, 6, 11, 12, 0, 0);
        BLAKE_G(2, 7, 8, 13, 0, 0);
        BLAKE_G(3, 4, 9, 14, 0, 0);

        /* Finalise: only 6 full words plus 2 bytes (50 bytes) are needed. */
        h[0] = blake_state[0] ^ v[0] ^ v[8];
        h[1] = blake_state[1] ^ v[1] ^ v[9];
        h[2] = blake_state[2] ^ v[2] ^ v[10];
        h[3] = blake_state[3] ^ v[3] ^ v[11];
        h[4] = blake_state[4] ^ v[4] ^ v[12];
        h[5] = blake_state[5] ^ v[5] ^ v[13];
        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;

        /* First half: hash bytes 0..24. */
        dropped += ht_store(0, ht, input * 2,
                h[0],
                h[1],
                h[2],
                h[3]);
        /* Second half: starts at byte 25, i.e. one byte into h[3]. */
        dropped += ht_store(0, ht, input * 2 + 1,
                (h[3] >> 8) | (h[4] << (64 - 8)),
                (h[4] >> 8) | (h[5] << (64 - 8)),
                (h[5] >> 8) | (h[6] << (64 - 8)),
                (h[6] >> 8));
    }
}

#undef BLAKE_G
# 415 "input.cl"
/*
 * XOR the Xi of two colliding slots of the previous round and store the
 * result in the destination hash table.
 *
 *   round    round being computed (1..8); selects how many Xi bytes are
 *            read from each slot and whether one padding byte is shifted out
 *   ht_dst   hash table the result is stored in
 *   row      row of the source table both slots belong to
 *   slot_a,
 *   slot_b   slot numbers inside that row (only their low 6 bits are kept)
 *   a, b     pointers to the Xi of the two slots
 *
 * The stored reference packs the origin as
 * (row << 12) | (slot_b << 6) | slot_a.
 *
 * Returns 1 if ht_store() had to drop the entry because the destination row
 * was full; returns 0 if the entry was stored, if the two low result words
 * were both zero (such pairs are discarded), or if "round" is outside 1..8.
 */
uint xor_and_store(uint round, __global char *ht_dst, uint row,
        uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
    /* Zero-initialised so no path can ever read an indeterminate value. */
    ulong xi0 = 0, xi1 = 0, xi2 = 0;

    switch (round)
    {
    case 1:
    case 2:
        /* 24 bytes of Xi */
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = a[2] ^ b[2];
        if (round == 2)
        {
            /* shift out one byte across the three words */
            xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
            xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
            xi2 = (xi2 >> 8);
        }
        break;
    case 3:
        /* 20 bytes of Xi: two 64-bit words and one 32-bit word */
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = *(__global uint *)(a + 2) ^ *(__global uint *)(b + 2);
        break;
    case 4:
    case 5:
        /* 16 bytes of Xi */
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        if (round == 4)
        {
            xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
            xi1 = (xi1 >> 8);
        }
        break;
    case 6:
        /* 12 bytes of Xi; this round always shifts out one byte */
        xi0 = a[0] ^ b[0];
        xi1 = *(__global uint *)(a + 1) ^ *(__global uint *)(b + 1);
        xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
        xi1 = (xi1 >> 8);
        break;
    case 7:
    case 8:
        /* 8 bytes of Xi */
        xi0 = a[0] ^ b[0];
        if (round == 8)
            xi0 = (xi0 >> 8);
        break;
    default:
        /* Unknown round: nothing to XOR, nothing to store. */
        return 0;
    }

    /* Pairs whose two low words XOR to zero are discarded. */
    if (!xi0 && !xi1)
        return 0;

    return ht_store(round, ht_dst,
            ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
            xi0, xi1, xi2, 0);
}
/*
 * Process one row of the source hash table for rounds 1..8.
 *
 * Each work-item handles the row whose index equals its global id: it reads
 * the row's slot counter, pairs up the occupied slots, and passes every pair
 * to xor_and_store(), which writes the XOR into "ht_dst". Except in round 8,
 * the source row's counter is reset to 0 afterwards.
 *
 *   round   round being computed
 *   ht_src  hash table filled by the previous round
 *   ht_dst  hash table to fill
 *   debug   not used by this function
 *
 * At most SLOTS_PER_ROW * 3 pairs are remembered per row; further pairs are
 * only counted as dropped. The drop counters are not reported anywhere in
 * this function.
 */
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    enum {
        SLOTS_PER_ROW = (1 << (((200 / (9 + 1)) + 1) - 20)) * 9,
        SLOT_BYTES = 32,
        MAX_PAIRS = SLOTS_PER_ROW * 3
    };
    const uint row = get_global_id(0);
    const uint xi_offset = (8 + ((round - 1) / 2) * 4);
    /* With mask == 0 every two slots of a row compare equal below. */
    const uchar mask = 0;
    __global char *row_base = ht_src + row * SLOTS_PER_ROW * SLOT_BYTES;
    uchar first_words[SLOTS_PER_ROW];
    /* Each pair is packed as (higher slot << 8) | lower slot. */
    ushort pairs[MAX_PAIRS];
    uint nr_slots;
    uint nr_pairs = 0;
    uint dropped_pairs = 0;
    uint dropped_stores = 0;
    uint lo, hi, n;

    /* The counter may exceed the capacity if stores were dropped. */
    nr_slots = min(*(__global uint *)row_base, (uint)SLOTS_PER_ROW);

    /* Cache the first Xi byte of every occupied slot. */
    for (lo = 0; lo < nr_slots; lo++)
        first_words[lo] =
            *(__global uchar *)(row_base + xi_offset + lo * SLOT_BYTES);

    /* Enumerate all (lo, hi) pairs with lo < hi. */
    for (lo = 0; lo < nr_slots; lo++)
        for (hi = lo + 1; hi < nr_slots; hi++)
        {
            if ((first_words[lo] & mask) != (first_words[hi] & mask))
                continue;
            if (nr_pairs < MAX_PAIRS)
                pairs[nr_pairs++] =
                    ((ushort)hi << 8) | ((ushort)lo & 0xff);
            else
                dropped_pairs++;
        }

    /* XOR every remembered pair and store it in the destination table. */
    for (n = 0; n < nr_pairs; n++)
    {
        __global ulong *a, *b;

        lo = pairs[n] & 0xff;
        hi = pairs[n] >> 8;
        a = (__global ulong *)(row_base + lo * SLOT_BYTES + xi_offset);
        b = (__global ulong *)(row_base + hi * SLOT_BYTES + xi_offset);
        dropped_stores += xor_and_store(round, ht_dst, row, lo, hi, a, b);
    }

    /* Empty the source row, except after the final round. */
    if (round < 8)
        *(__global uint *)row_base = 0;
}
# 585 "input.cl"
/*
 * Kernel entry points for rounds 1 to 7. Each one simply runs
 * equihash_round() with its round number baked in as a constant.
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round1(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(1, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round2(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(2, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round3(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(3, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round4(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(4, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round5(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(5, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round6(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(6, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round7(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(7, ht_src, ht_dst, debug);
}
/*
 * Kernel entry point for round 8. Runs equihash_round() like the other
 * rounds; in addition, the work-item with global id 0 resets the counters
 * of the solution buffer.
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
        __global uint *debug, __global sols_t *sols)
{
    const uint tid = get_global_id(0);

    equihash_round(8, ht_src, ht_dst, debug);

    if (tid != 0)
        return;
    sols->likely_invalids = 0;
    sols->nr = 0;
}
/*
 * Read the 32-bit word stored 4 bytes before byte offset `xi_offset` inside
 * slot `slot` of row `row` in hash table `ht`.
 *
 * Slots are 32 bytes apart; rows are ((1 << 1) * 9) slots apart.
 */
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
    /* Step the pointer one term at a time, as the single expression did. */
    __global char *entry =
        ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;

    entry = entry + slot * 32;
    entry = entry + xi_offset;
    return *(__global uint *)(entry - 4);
}
/*
 * Expand, in place, each of the `nr_inputs` references at the start of `ins`
 * into two references read from hash table htabs[round % 2].
 *
 * Entry k is decoded as row = ins[k] >> 12, one slot = (ins[k] >> 6) & 0x3f
 * and the other slot = ins[k] & 0x3f; the two looked-up words are written to
 * ins[2k + 1] and ins[2k]. Entries are processed from the last one down to
 * entry 0 so that not-yet-expanded entries are not overwritten.
 */
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
                 uint round)
{
    __global char *table = htabs[round % 2];
    const uint xi_offset = (8 + ((round) / 2) * 4);
    uint src = nr_inputs - 1;
    uint dst = nr_inputs * 2 - 1;

    for (;;)
    {
        /* ins[src] is re-read for the second lookup on purpose: the write
         * order below is kept exactly as-is. */
        ins[dst] = expand_ref(table, xi_offset,
                              (ins[src] >> 12), ((ins[src] >> 6) & 0x3f));
        ins[dst - 1] = expand_ref(table, xi_offset,
                                  (ins[src] >> 12), (ins[src] & 0x3f));
        if (src == 0)
            break;
        src -= 1;
        dst -= 2;
    }
}
/*
 * Record a candidate solution built from the reference pair (ref0, ref1).
 *
 * An entry index is claimed by atomically incrementing sols->nr; nothing is
 * stored when that index is 2000 or more. Otherwise the entry is flagged
 * not-valid, seeded with the two references, expanded by expand_refs() for
 * rounds 7 down to 0 (doubling the number of values each time), and finally
 * flagged valid.
 */
void potential_sol(__global char **htabs, __global sols_t *sols,
                   uint ref0, uint ref1)
{
    const uint entry = atomic_inc(&sols->nr);

    if (entry >= 2000)
        return;

    sols->valid[entry] = 0;
    sols->values[entry][0] = ref0;
    sols->values[entry][1] = ref1;

    uint count = 2;
    uint round = 9 - 1;

    while (round != 0)
    {
        round -= 1;
        expand_refs(&(sols->values[entry][0]), count, htabs, round);
        count *= 2;
    }

    sols->valid[entry] = 1;
}
/*
 * Scan one row of the last-filled hash table for candidate solutions.
 *
 * The work-item's global id selects the row of htabs[(9 - 1) % 2]. Every
 * pair of used slots whose 32-bit words at `xi_offset` agree in their low
 * 24 bits is a match; the 32-bit references stored 4 bytes before those
 * words are packed into a local list of at most 5 pairs. Matches beyond
 * that capacity only increment sols->likely_invalids. Each stored pair is
 * then handed to potential_sol().
 */
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
    __global char *htabs[2] = { ht0, ht1 };
    const uint row = get_global_id(0);
    const uint last_table = (9 - 1) % 2;
    const uint xi_offset = (8 + ((9 - 1) / 2) * 4);
    const uint mask = 0xffffff;
    ulong pending[5];
    uint nr_pending = 0;

    __global char *row_base =
        htabs[last_table] + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;

    /* The first word of the row is the slot counter; clamp it to capacity. */
    uint used = *(__global uint *)row_base;
    used = min(used, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));

    __global char *first = row_base + xi_offset;

    for (uint i = 0; i < used; i++)
    {
        __global char *slot_a = first + i * 32;

        for (uint j = i + 1; j < used; j++)
        {
            __global char *slot_b = first + j * 32;

            if (((*(__global uint *)slot_a) & mask) !=
                ((*(__global uint *)slot_b) & mask))
                continue;

            const uint ref_a = *(__global uint *)(slot_a - 4);
            const uint ref_b = *(__global uint *)(slot_b - 4);

            if (nr_pending < sizeof (pending) / sizeof (*pending))
                pending[nr_pending++] = ((ulong)ref_a << 32) | ref_b;
            else
                atomic_inc(&sols->likely_invalids);
        }
    }

    for (uint k = 0; k < nr_pending; k++)
        potential_sol(htabs, sols, pending[k] >> 32,
                      pending[k] & 0xffffffff);
}

Binary file not shown.

Binary file not shown.

View File

@ -1,526 +0,0 @@
# 1 "input.cl"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/usr/include/stdc-predef.h" 1 3 4
# 1 "<command-line>" 2
# 1 "input.cl"
# 1 "param.h" 1
# 60 "param.h"
/*
 * Buffer of candidate solutions.
 *
 * The second counter is named `likely_invalids` (it was misspelled
 * `likely_invalidss`), matching the `sols->likely_invalids` accesses made by
 * the kernels that use this type.
 */
typedef struct sols_s
{
    uint nr;                      /* counter of solution entries */
    uint likely_invalids;         /* counter of likely-invalid candidates */
    uchar valid[2000];            /* per-entry validity flag */
    uint values[2000][(1 << 9)];  /* per-entry array of (1 << 9) values */
} sols_t;
# 2 "input.cl" 2
# 35 "input.cl"
/* Eight 64-bit initialization-vector words; kernel_round0 loads them into
 * v[8] .. v[15] before mixing. */
__constant ulong blake_iv[] =
{
    0x6a09e667f3bcc908,
    0xbb67ae8584caa73b,
    0x3c6ef372fe94f82b,
    0xa54ff53a5f1d36f1,
    0x510e527fade682d1,
    0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b,
    0x5be0cd19137e2179,
};
/*
 * Clear one hash-table row: the work-item's global id selects the row, and
 * the 32-bit word at the start of that row is set to 0. (ht_store() uses
 * that word as the row's slot counter.)
 */
__kernel
void kernel_init_ht(__global char *ht)
{
    const uint row = get_global_id(0);
    __global char *row_start =
        ht + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32;

    *(__global uint *)row_start = 0;
}
# 79 "input.cl"
/*
 * Store one reference `i` and its Xi words into hash table `ht`.
 *
 * The destination row is derived from the low bits of xi0: for even rounds
 * it is xi0 & 0xffff; for odd rounds it is assembled from four nibbles of
 * xi0 (bits 8-11, 20-23, 0-3 and 12-15, from most to least significant).
 * The Xi words are then shifted right by 16 bits as one long value.
 *
 * A slot is claimed by atomically incrementing the 32-bit counter at the
 * start of the row. `i` is written 4 bytes before the Xi data, which starts
 * at byte offset 8 + (round / 2) * 4 inside the 32-byte slot. The number of
 * Xi bytes written shrinks with the round: 24, 24, 20, 16, 16, 12, 8, 8, 4
 * for rounds 0 .. 8; other round values store no Xi data.
 *
 * Returns 1 when the claimed slot index is beyond the row capacity (nothing
 * is stored, the counter stays incremented), 0 otherwise.
 */
uint ht_store(uint round, __global char *ht, uint i,
              ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
    uint row;

    if (round % 2)
        row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
              ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
    else
        row = (xi0 & 0xffff);

    /* Drop the low 16 bits; each word pulls 16 bits from the next one. */
    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));

    __global char *p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32;
    const uint cnt = atomic_inc((__global uint *)p);

    if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3))
        return 1;

    p += cnt * 32 + (8 + ((round) / 2) * 4);
    *(__global uint *)(p - 4) = i;

    switch (round)
    {
    case 0:
    case 1:
        *(__global ulong *)(p + 0) = xi0;
        *(__global ulong *)(p + 8) = xi1;
        *(__global ulong *)(p + 16) = xi2;
        break;
    case 2:
        *(__global ulong *)(p + 0) = xi0;
        *(__global ulong *)(p + 8) = xi1;
        *(__global uint *)(p + 16) = xi2;
        break;
    case 3:
    case 4:
        *(__global ulong *)(p + 0) = xi0;
        *(__global ulong *)(p + 8) = xi1;
        break;
    case 5:
        *(__global ulong *)(p + 0) = xi0;
        *(__global uint *)(p + 8) = xi1;
        break;
    case 6:
    case 7:
        *(__global ulong *)(p + 0) = xi0;
        break;
    case 8:
        *(__global uint *)(p + 0) = xi0;
        break;
    default:
        break;
    }
    return 0;
}
# 187 "input.cl"
/*
 * One mixing step on four state words with two message words: two
 * add / xor / rotate passes using rotate amounts 64 - 32, 64 - 24, 64 - 16
 * and 64 - 63.
 */
#define BLAKE_G(va, vb, vc, vd, x, y) \
    do { \
        va = (va + vb + (x)); \
        vd = rotate((vd ^ va), (ulong)64 - 32); \
        vc = (vc + vd); \
        vb = rotate((vb ^ vc), (ulong)64 - 24); \
        va = (va + vb + (y)); \
        vd = rotate((vd ^ va), (ulong)64 - 16); \
        vc = (vc + vd); \
        vb = rotate((vb ^ vc), (ulong)64 - 63); \
    } while (0)

/*
 * One full round over the local state array `v`: four column steps followed
 * by four diagonal steps. The sixteen arguments are the message words in the
 * order the eight steps consume them (two per step).
 */
#define BLAKE_ROUND(m0, m1, m2, m3, m4, m5, m6, m7, \
                    m8, m9, m10, m11, m12, m13, m14, m15) \
    do { \
        BLAKE_G(v[0], v[4], v[8],  v[12], m0,  m1); \
        BLAKE_G(v[1], v[5], v[9],  v[13], m2,  m3); \
        BLAKE_G(v[2], v[6], v[10], v[14], m4,  m5); \
        BLAKE_G(v[3], v[7], v[11], v[15], m6,  m7); \
        BLAKE_G(v[0], v[5], v[10], v[15], m8,  m9); \
        BLAKE_G(v[1], v[6], v[11], v[12], m10, m11); \
        BLAKE_G(v[2], v[7], v[8],  v[13], m12, m13); \
        BLAKE_G(v[3], v[4], v[9],  v[14], m14, m15); \
    } while (0)

/*
 * Round 0: hash a range of input indices and store the results in `ht`.
 *
 * The (1 << 20) input indices are split evenly over the global work size;
 * each work-item handles a contiguous range. For every index the state is
 * seeded from blake_state[0..7] and blake_iv[0..7], twelve mixing rounds are
 * applied in which the only non-zero message word is `word1` (the index in
 * the upper 32 bits), and the result is folded with blake_state into h[].
 * Two ht_store() calls per index then store two values taken from h[], with
 * reference numbers input * 2 and input * 2 + 1.
 *
 * `debug` is not referenced, and the accumulated `dropped` count is not
 * reported anywhere in this function.
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
                   __global uint *debug)
{
    const uint tid = get_global_id(0);
    const uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
    const uint input_end = (tid + 1) * inputs_per_thread;
    ulong v[16];
    uint dropped = 0;

    for (uint input = tid * inputs_per_thread; input < input_end; input++)
    {
        const ulong word1 = (ulong)input << 32;

        /* State: words 0-7 from the caller-supplied state, 8-15 from the IV. */
        v[0] = blake_state[0];  v[8]  = blake_iv[0];
        v[1] = blake_state[1];  v[9]  = blake_iv[1];
        v[2] = blake_state[2];  v[10] = blake_iv[2];
        v[3] = blake_state[3];  v[11] = blake_iv[3];
        v[4] = blake_state[4];  v[12] = blake_iv[4];
        v[5] = blake_state[5];  v[13] = blake_iv[5];
        v[6] = blake_state[6];  v[14] = blake_iv[6];
        v[7] = blake_state[7];  v[15] = blake_iv[7];

        /* NOTE(review): 140 + 4 is presumably the hashed message length. */
        v[12] ^= 140 + 4;
        /* Inverts every bit of v[14]. */
        v[14] ^= -1;

        /* Twelve rounds; word1 moves to a different message position in
         * each round, every other message word is zero. */
        BLAKE_ROUND(0, word1, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, 0, 0,  word1, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, word1, 0, 0);
        BLAKE_ROUND(0, 0, 0, word1,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, 0, 0,  0, word1, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, word1, 0);
        BLAKE_ROUND(0, 0, word1, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, word1, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  word1, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, word1, 0,  0, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, word1, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
        BLAKE_ROUND(0, 0, 0, 0,  0, 0, 0, 0,  word1, 0, 0, 0,  0, 0, 0, 0);

        /* Fold the mixed state back; only 16 bits of the seventh word are
         * kept. */
        ulong h[7];
        h[0] = blake_state[0] ^ v[0] ^ v[8];
        h[1] = blake_state[1] ^ v[1] ^ v[9];
        h[2] = blake_state[2] ^ v[2] ^ v[10];
        h[3] = blake_state[3] ^ v[3] ^ v[11];
        h[4] = blake_state[4] ^ v[4] ^ v[12];
        h[5] = blake_state[5] ^ v[5] ^ v[13];
        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;

        /* First value: h[0..3] as they are. */
        dropped += ht_store(0, ht, input * 2, h[0], h[1], h[2], h[3]);

        /* Second value: the words starting 8 bits into h[3]. */
        dropped += ht_store(0, ht, input * 2 + 1,
                            (h[3] >> 8) | (h[4] << (64 - 8)),
                            (h[4] >> 8) | (h[5] << (64 - 8)),
                            (h[5] >> 8) | (h[6] << (64 - 8)),
                            (h[6] >> 8));
    }
}

#undef BLAKE_ROUND
#undef BLAKE_G
# 409 "input.cl"
/*
 * XOR the Xi data of two colliding slots and store the result in `ht_dst`.
 *
 * `a` and `b` point at the Xi data of the two slots. How much is read
 * depends on the round: three 64-bit words (rounds 1-2), two 64-bit words
 * plus one 32-bit word (round 3), two 64-bit words (rounds 4-5), one 64-bit
 * word plus one 32-bit word (round 6), one 64-bit word (rounds 7-8). For any
 * other round value the words are left unset, as no branch assigns them.
 *
 * Returns 0 without storing when both xi0 and xi1 are zero; otherwise
 * returns the result of ht_store(), called with a reference that packs
 * `row` (shifted left 16), the low 8 bits of slot_b (shifted left 8) and the
 * low 8 bits of slot_a.
 */
uint xor_and_store(uint round, __global char *ht_dst, uint row,
                   uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
    ulong xi0, xi1, xi2;

    switch (round)
    {
    case 1:
    case 2:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = a[2] ^ b[2];
        break;
    case 3:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = *(__global uint *)(a + 2) ^ *(__global uint *)(b + 2);
        break;
    case 4:
    case 5:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = 0;
        break;
    case 6:
        xi0 = a[0] ^ b[0];
        xi1 = *(__global uint *)(a + 1) ^ *(__global uint *)(b + 1);
        xi2 = 0;
        break;
    case 7:
    case 8:
        xi0 = a[0] ^ b[0];
        xi1 = 0;
        xi2 = 0;
        break;
    }

    if (xi0 == 0 && xi1 == 0)
        return 0;

    const uint ref = ((row << 16) | ((slot_b & 0xff) << 8) | (slot_a & 0xff));

    return ht_store(round, ht_dst, ref, xi0, xi1, xi2, 0);
}
/*
 * One collision-search round for a single hash-table row.
 *
 * Each work-item handles the row selected by get_global_id(0):
 *   1. read the slot counter stored at the start of the row and clamp it to
 *      the row capacity;
 *   2. fetch one byte per slot at xi_offset and keep only the nibble chosen
 *      by the round parity (0x0f on even rounds, 0xf0 on odd rounds);
 *   3. record every pair of slots (i < j) whose nibbles are equal;
 *   4. pass each recorded pair to xor_and_store(), which writes to ht_dst.
 *
 * round    round number; xor_and_store() handles 1..8
 * ht_src   hash table read by this round
 * ht_dst   hash table written by this round
 * debug    not referenced in this function
 */
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    uchar           keys[((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)];
    ushort          pairs[((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 2];
    const uint      max_pairs = sizeof (pairs) / sizeof (*pairs);
    const uint      row = get_global_id(0);
    const uint      even_round = !(round % 2);
    const uchar     nibble = even_round ? 0x0f : 0xf0;
    // on even rounds the Xi words start one byte further into the slot
    const uint      byte_adj = even_round ? 1 : 0;
    const uint      xi_offset = (8 + ((round - 1) / 2) * 4);
    __global char   *row_base =
        ht_src + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32;
    __global char   *cursor;
    __global ulong  *xa, *xb;
    uint            nr_slots;
    uint            nr_pairs = 0;
    // overflow tallies; nothing in this function reads them back
    uint            dropped_coll = 0;
    uint            dropped_stor = 0;
    uint            s, t, k;

    // The row begins with its slot counter; never trust it past capacity.
    nr_slots = min(*(__global uint *)row_base,
        (uint)((1 << (((200 / (9 + 1)) + 1) - 16)) * 3));

    // Gather the already-masked comparison key of every used slot.
    cursor = row_base + xi_offset;
    for (s = 0; s < nr_slots; s++)
    {
        keys[s] = *(__global uchar *)cursor & nibble;
        cursor += 32;
    }

    // Find all pairs with identical keys; slot indices are packed 8 bits each.
    for (s = 0; s < nr_slots; s++)
    {
        for (t = s + 1; t < nr_slots; t++)
        {
            if (keys[s] != keys[t])
                continue;
            if (nr_pairs < max_pairs)
                pairs[nr_pairs++] = (ushort)((t << 8) | (s & 0xff));
            else
                dropped_coll++;
        }
    }

    // XOR each colliding pair and store the result in the next table.
    for (k = 0; k < nr_pairs; k++)
    {
        s = pairs[k] & 0xff;
        t = pairs[k] >> 8;
        xa = (__global ulong *)(row_base + s * 32 + xi_offset + byte_adj);
        xb = (__global ulong *)(row_base + t * 32 + xi_offset + byte_adj);
        dropped_stor += xor_and_store(round, ht_dst, row, s, t, xa, xb);
    }
}
# 557 "input.cl"
/*
 * Kernel entry points for rounds 1 through 8. Each one requires a work-group
 * size of 64x1x1 and simply forwards its buffers to equihash_round() with the
 * round number supplied as a literal.
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round1(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(1, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round2(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(2, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round3(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(3, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round4(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(4, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round5(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(5, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round6(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(6, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round7(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(7, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
    __global uint *debug)
{
    equihash_round(8, ht_src, ht_dst, debug);
}
/*
 * Fetch the 32-bit reference word that sits 4 bytes before the Xi data of
 * one slot.
 *
 * ht         hash table to read from
 * xi_offset  byte offset of the Xi data inside a slot
 * row        row index
 * slot       slot index inside the row
 *
 * Returns the uint stored at (row start + slot * 32 + xi_offset - 4).
 */
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
    uint            row_bytes = row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32;
    uint            slot_bytes = slot * 32;
    __global char   *ref = ht + row_bytes + slot_bytes + xi_offset - 4;

    return *(__global uint *)ref;
}
/*
 * Expand, in place, an array of packed references into twice as many
 * references taken from the hash table of the given round.
 *
 * ins        array holding nr_inputs references on entry; it must have room
 *            for 2 * nr_inputs entries, which is what it holds on return
 * nr_inputs  number of references currently in ins; 0 is a no-op
 * htabs      the two hash tables; the one used is htabs[round % 2]
 * round      round whose table is consulted
 *
 * Each input is decoded as (row = bits 16 and up, slot B = bits 8..15,
 * slot A = bits 0..7). Input i is replaced by the reference stored in slot A
 * (written to ins[2*i]) and the one stored in slot B (written to ins[2*i+1]).
 * The array is walked from the last input down to the first so that no input
 * is overwritten before it has been read.
 */
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
    uint round)
{
    __global char   *ht = htabs[round % 2];
    uint            xi_offset = (8 + ((round) / 2) * 4);
    uint            i;

    // Counting down with "i-- > 0" also covers nr_inputs == 0 without
    // wrapping the unsigned index.
    for (i = nr_inputs; i-- > 0; )
    {
        // Read the input once: for i == 0 the second store overwrites it.
        uint    ref = ins[i];
        uint    row = ref >> 16;

        ins[2 * i + 1] = expand_ref(ht, xi_offset, row, (ref >> 8) & 0xff);
        ins[2 * i] = expand_ref(ht, xi_offset, row, ref & 0xff);
    }
}
/*
 * Record one candidate solution and expand its two starting references into
 * the full list of values.
 *
 * htabs  the two hash tables, handed on to expand_refs()
 * sols   output structure; an entry is claimed by atomically incrementing
 *        sols->nr
 * ref0   first reference of the colliding pair
 * ref1   second reference of the colliding pair
 *
 * If the claimed index is 2000 or more, nothing is written. Otherwise the
 * entry is marked invalid (valid = 0) while it is being filled, expanded for
 * rounds 7 down to 0 (doubling the value count each time, 2 -> 512), and then
 * marked valid (valid = 1).
 */
void potential_sol(__global char **htabs, __global sols_t *sols,
    uint ref0, uint ref1)
{
    uint            entry = atomic_inc(&sols->nr);
    uint            count;
    uint            round;
    __global uint   *vals;

    if (entry >= 2000)
        return ;

    sols->valid[entry] = 0;
    vals = &(sols->values[entry][0]);
    vals[0] = ref0;
    vals[1] = ref1;
    count = 2;

    round = 9 - 1;
    while (round > 0)
    {
        round--;
        expand_refs(vals, count, htabs, round);
        count *= 2;
    }

    sols->valid[entry] = 1;
}
/*
 * Scan one row of the last-round hash table for pairs of slots whose first
 * Xi word agrees in its low 24 bits, and submit each pair as a candidate
 * solution.
 *
 * ht0, ht1  the two hash tables; (9 - 1) % 2 == 0, so ht0 is the one scanned
 * sols      output structure filled through potential_sol()
 *
 * At most 5 pairs are kept per row; any further pair only increments
 * sols->likely_invalidss.
 *
 * NOTE(review): work-item 0 resets sols->nr and sols->likely_invalidss here,
 * followed only by mem_fence(). That looks like it orders this work-item's
 * own accesses rather than acting as a barrier across work-items, so other
 * work-items might increment the counters before the reset lands - confirm
 * against the host-side launch sequence.
 */
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
    __global char   *htabs[2] = { ht0, ht1 };
    const uint      tid = get_global_id(0);
    const uint      ht_i = (9 - 1) % 2;
    const uint      xi_offset = (8 + ((9 - 1) / 2) * 4);
    const uint      mask = 0xffffff;
    __global char   *row_base, *xi_base;
    __global char   *a, *b;
    ulong           found[5];
    uint            nr_found = 0;
    uint            nr_slots;
    uint            ref_i, ref_j;
    uint            i, j;

    if (tid == 0)
        sols->nr = sols->likely_invalidss = 0;
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    // The row starts with its slot counter; clamp it to the row capacity.
    row_base = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32;
    nr_slots = min(*(__global uint *)row_base,
        (uint)((1 << (((200 / (9 + 1)) + 1) - 16)) * 3));

    xi_base = row_base + xi_offset;
    for (i = 0; i < nr_slots; i++)
    {
        a = xi_base + i * 32;
        for (j = i + 1; j < nr_slots; j++)
        {
            b = xi_base + j * 32;
            if (((*(__global uint *)a) & mask) !=
                ((*(__global uint *)b) & mask))
                continue;

            // The reference word sits 4 bytes before the Xi data.
            ref_i = *(__global uint *)(a - 4);
            ref_j = *(__global uint *)(b - 4);
            if (nr_found < sizeof (found) / sizeof (*found))
                found[nr_found++] = ((ulong)ref_i << 32) | ref_j;
            else
                atomic_inc(&sols->likely_invalidss);
        }
    }

    // Unpack each stored pair (high half = ref_i, low half = ref_j).
    for (i = 0; i < nr_found; i++)
        potential_sol(htabs, sols, found[i] >> 32,
            found[i] & 0xffffffff);
}

View File

@ -1,531 +0,0 @@
# 1 "input.cl"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/usr/include/stdc-predef.h" 1 3 4
# 1 "<command-line>" 2
# 1 "input.cl"
# 1 "param.h" 1
# 60 "param.h"
/*
 * Solution buffer with room for up to 2000 entries of (1 << 9) = 512 values
 * each.
 * NOTE(review): the code that fills this structure is not part of this file's
 * visible section; the field notes below are inferred from the names and
 * from the sibling kernels elsewhere in this diff - confirm before relying
 * on them.
 */
typedef struct sols_s
{
// presumably the number of entries claimed so far; may exceed 2000 - verify
uint nr;
// presumably a count of candidates that were dropped - verify against writer
uint likely_invalidss;
// one flag byte per entry
uchar valid[2000];
// 512 uint values per entry
uint values[2000][(1 << 9)];
} sols_t;
# 2 "input.cl" 2
# 35 "input.cl"
/*
 * The eight 64-bit BLAKE2b initialization-vector words (IV[0]..IV[7]),
 * kept in constant memory.
 */
__constant ulong blake_iv[] =
{
    0x6a09e667f3bcc908,     // IV[0]
    0xbb67ae8584caa73b,     // IV[1]
    0x3c6ef372fe94f82b,     // IV[2]
    0xa54ff53a5f1d36f1,     // IV[3]
    0x510e527fade682d1,     // IV[4]
    0x9b05688c2b3e6c1f,     // IV[5]
    0x1f83d9abfb41bd6b,     // IV[6]
    0x5be0cd19137e2179,     // IV[7]
};
/*
 * Reset one hash-table row: each work-item writes 0 to the 32-bit word at
 * the start of the row selected by get_global_id(0). That word is the slot
 * counter which ht_store() increments.
 *
 * ht  hash table to reset
 */
__kernel
void kernel_init_ht(__global char *ht)
{
    const uint      row = get_global_id(0);
    // bytes per row: slots per row times 32 bytes per slot
    const uint      row_bytes = ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32;
    __global uint   *slot_counter = (__global uint *)(ht + row * row_bytes);

    *slot_counter = 0;
}
# 79 "input.cl"
/*
 * Append one entry to a hash-table row.
 *
 * round     round that produced the entry; selects how the row index is
 *           derived, where inside the slot the data goes, and how many Xi
 *           bytes are written
 * ht        hash table to write into
 * i         32-bit reference stored 4 bytes before the Xi data
 * xi0..xi3  Xi words; the low bits of xi0 provide the row index
 *
 * The row index is built from 19 bits of xi0 (a different bit selection for
 * even and odd rounds). The Xi words are then shifted right by 16 bits as one
 * long value before being stored. A slot is claimed by atomically
 * incrementing the counter at the start of the row.
 *
 * Returns 1 if the row already holds its maximum number of slots (nothing is
 * written, but the counter has still been incremented), 0 otherwise.
 */
uint ht_store(uint round, __global char *ht, uint i,
    ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
    const uint      slots_per_row = ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9);
    __global char   *row_base;
    __global char   *xi;
    uint            row;
    uint            slot;

    if (round % 2)
        row = ((xi0 & 0xe0000) >> 1) |
            ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
            ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
    else
        row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5);

    // Drop the low 16 bits of the Xi string, carrying bits between words.
    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));

    row_base = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32;
    slot = atomic_inc((__global uint *)row_base);
    if (slot >= slots_per_row)
        return 1;

    xi = row_base + slot * 32 + (8 + ((round) / 2) * 4);
    *(__global uint *)(xi - 4) = i;

    // Later rounds carry fewer Xi bytes, so fewer of them are written.
    switch (round)
    {
    case 0:
    case 1:
        *(__global ulong *)(xi + 0) = xi0;
        *(__global ulong *)(xi + 8) = xi1;
        *(__global ulong *)(xi + 16) = xi2;
        break;
    case 2:
        *(__global ulong *)(xi + 0) = xi0;
        *(__global ulong *)(xi + 8) = xi1;
        *(__global uint *)(xi + 16) = xi2;
        break;
    case 3:
    case 4:
        *(__global ulong *)(xi + 0) = xi0;
        *(__global ulong *)(xi + 8) = xi1;
        break;
    case 5:
        *(__global ulong *)(xi + 0) = xi0;
        *(__global uint *)(xi + 8) = xi1;
        break;
    case 6:
    case 7:
        *(__global ulong *)(xi + 0) = xi0;
        break;
    case 8:
        *(__global uint *)(xi + 0) = xi0;
        break;
    default:
        // other rounds store only the reference word
        break;
    }
    return 0;
}
# 187 "input.cl"
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
__global uint *debug)
{
uint tid = get_global_id(0);
ulong v[16];
uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
uint input = tid * inputs_per_thread;
uint input_end = (tid + 1) * inputs_per_thread;
uint dropped = 0;
while (input < input_end)
{
ulong word1 = (ulong)input << 32;
v[0] = blake_state[0];
v[1] = blake_state[1];
v[2] = blake_state[2];
v[3] = blake_state[3];
v[4] = blake_state[4];
v[5] = blake_state[5];
v[6] = blake_state[6];
v[7] = blake_state[7];
v[8] = blake_iv[0];
v[9] = blake_iv[1];
v[10] = blake_iv[2];
v[11] = blake_iv[3];
v[12] = blake_iv[4];
v[13] = blake_iv[5];
v[14] = blake_iv[6];
v[15] = blake_iv[7];
v[12] ^= 140 + 4 ;
v[14] ^= -1;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
ulong h[7];
h[0] = blake_state[0] ^ v[0] ^ v[8];
h[1] = blake_state[1] ^ v[1] ^ v[9];
h[2] = blake_state[2] ^ v[2] ^ v[10];
h[3] = blake_state[3] ^ v[3] ^ v[11];
h[4] = blake_state[4] ^ v[4] ^ v[12];
h[5] = blake_state[5] ^ v[5] ^ v[13];
h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
dropped += ht_store(0, ht, input * 2,
h[0],
h[1],
h[2],
h[3]);
dropped += ht_store(0, ht, input * 2 + 1,
(h[3] >> 8) | (h[4] << (64 - 8)),
(h[4] >> 8) | (h[5] << (64 - 8)),
(h[5] >> 8) | (h[6] << (64 - 8)),
(h[6] >> 8));
input++;
}
}
# 409 "input.cl"
/*
 * XOR the remaining Xi words of two colliding slots and store the result
 * in the destination hash table.
 *
 * round   - round being computed; selects how many words are XORed:
 *             1, 2 : three ulongs
 *             3    : two ulongs + one uint
 *             4, 5 : two ulongs
 *             6    : one ulong + one uint
 *             7, 8 : one ulong
 *           Any other value leaves all words zero, so nothing is stored.
 * ht_dst  - destination hash table, passed through to ht_store().
 * row     - row the two slots belong to (recorded in the stored reference).
 * slot_a,
 * slot_b  - slot indices of the two inputs; only the low 6 bits are kept.
 * a, b    - pointers to the first Xi word of each input.
 *
 * Returns 0 when the first two XORed words are both zero (the pair is
 * discarded without storing), otherwise whatever ht_store() returns.
 */
uint xor_and_store(uint round, __global char *ht_dst, uint row,
	uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
    /* Zero-initialised so an unsupported round never reads garbage. */
    ulong xi0 = 0;
    ulong xi1 = 0;
    ulong xi2 = 0;

    switch (round)
    {
    case 1:
    case 2:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = a[2] ^ b[2];
        break;
    case 3:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        /* only 32 bits of the third word are read in this round */
        xi2 = *(__global uint *)(a + 2) ^ *(__global uint *)(b + 2);
        break;
    case 4:
    case 5:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        break;
    case 6:
        xi0 = a[0] ^ b[0];
        /* only 32 bits of the second word are read in this round */
        xi1 = *(__global uint *)(a + 1) ^ *(__global uint *)(b + 1);
        break;
    case 7:
    case 8:
        xi0 = a[0] ^ b[0];
        break;
    default:
        break;
    }

    /* Both leading words zero: drop the pair instead of storing it. */
    if (!xi0 && !xi1)
        return 0;

    /* Reference layout: row in bits 13 and up, slot_b in bits 6..11,
       slot_a in bits 0..5. */
    return ht_store(round, ht_dst,
                    ((row << 13) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
                    xi0, xi1, xi2, 0);
}
/*
 * Run one collision round for the hash-table row owned by this work-item
 * (row index = get_global_id(0)).
 *
 * The routine
 *   1. reads the row's slot counter (first uint of the row), clamped to
 *      the number of slots a row can hold,
 *   2. copies the first Xi byte of every slot into private memory,
 *   3. records every slot pair (i < j) whose masked first byte matches,
 *   4. calls xor_and_store() for each recorded pair, writing to ht_dst.
 *
 * Even rounds compare with mask 0x01 and read the Xi words one byte
 * further into the slot; odd rounds compare with mask 0x10.
 *
 * `debug` is not referenced here. The dropped_* counters are accumulated
 * but not reported by this function.
 */
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    enum
    {
        ROW_SLOTS = ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9),
        SLOT_BYTES = 32,
        MAX_COLLISIONS = ROW_SLOTS * 2
    };

    const uint row_id = get_global_id(0);
    const uint even_round = !(round % 2);
    const uint xi_offset = (8 + ((round - 1) / 2) * 4);
    const uchar mask = even_round ? 0x01 : 0x10;
    const uint adj = even_round ? 1 : 0;
    __global char *row_base = ht_src + row_id * ROW_SLOTS * SLOT_BYTES;

    uchar first_words[ROW_SLOTS];
    ushort collisions[MAX_COLLISIONS];
    uint nr_coll = 0;
    uint dropped_coll = 0;
    uint dropped_stor = 0;
    uint cnt;
    uint i, j, n;

    /* Slot counter lives in the first 4 bytes of the row. */
    cnt = min(*(__global uint *)row_base, (uint)ROW_SLOTS);

    /* Cache the first Xi byte of each slot. */
    for (i = 0; i < cnt; i++)
        first_words[i] =
            *(__global uchar *)(row_base + xi_offset + i * SLOT_BYTES);

    /* Enumerate colliding pairs; each is packed as (j << 8) | i. */
    for (i = 0; i < cnt; i++)
    {
        for (j = i + 1; j < cnt; j++)
        {
            if ((first_words[i] & mask) != (first_words[j] & mask))
                continue;
            if (nr_coll >= MAX_COLLISIONS)
                dropped_coll++;
            else
                collisions[nr_coll++] = (ushort)((j << 8) | (i & 0xff));
        }
    }

    /* XOR every recorded pair and store it in the destination table. */
    for (n = 0; n < nr_coll; n++)
    {
        __global ulong *a;
        __global ulong *b;

        i = collisions[n] & 0xff;
        j = collisions[n] >> 8;
        a = (__global ulong *)(row_base + i * SLOT_BYTES + xi_offset + adj);
        b = (__global ulong *)(row_base + j * SLOT_BYTES + xi_offset + adj);
        dropped_stor += xor_and_store(round, ht_dst, row_id, i, j, a, b);
    }
}
# 557 "input.cl"
/*
 * Kernel entry points for rounds 1..8. Each one forwards its arguments to
 * equihash_round() with the round number fixed, and requires a work-group
 * size of 64x1x1.
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round1(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(1, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round2(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(2, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round3(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(3, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round4(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(4, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round5(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(5, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round6(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(6, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round7(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(7, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(8, ht_src, ht_dst, debug);
}
/*
 * Fetch the 32-bit reference stored in a hash-table slot.
 *
 * ht        - hash table to read from.
 * xi_offset - byte offset of the Xi data inside a slot; the reference is
 *             the uint located 4 bytes before it.
 * row, slot - position of the slot inside the table.
 *
 * Returns the uint read from that location.
 */
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
    __global char *xi;

    /* start of the row, then the slot within it, then its Xi data */
    xi = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32;
    xi = xi + slot * 32;
    xi = xi + xi_offset;

    return *(__global uint *)(xi - 4);
}
/*
 * Expand, in place, every packed reference in ins[0 .. nr_inputs-1] into
 * the two references it points to, producing 2 * nr_inputs values.
 *
 * ins       - array of packed references; must have room for
 *             2 * nr_inputs entries.
 * nr_inputs - number of valid entries currently in ins. Zero is accepted
 *             and leaves ins untouched.
 * htabs     - the two hash tables; the one used is htabs[round % 2].
 * round     - round whose table the references point into; also selects
 *             the Xi offset within a slot.
 *
 * Each packed reference is decoded as: row = ref >> 13,
 * first slot = ref & 0x3f, second slot = (ref >> 6) & 0x3f.
 */
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
	uint round)
{
    __global char *ht = htabs[round % 2];
    uint xi_offset = (8 + ((round) / 2) * 4);
    uint i;

    /* Walk from the last entry down to the first so that ins[i] is always
       read before its expansion, written to ins[2*i] and ins[2*i + 1],
       can overwrite it. The countdown form never underflows when
       nr_inputs is 0. */
    for (i = nr_inputs; i-- > 0; )
    {
        uint ref = ins[i];
        uint row = ref >> 13;

        ins[2 * i + 1] = expand_ref(ht, xi_offset, row, (ref >> 6) & 0x3f);
        ins[2 * i] = expand_ref(ht, xi_offset, row, ref & 0x3f);
    }
}
/*
 * Record a candidate solution built from two final-round references.
 *
 * A slot in `sols` is claimed by atomically incrementing sols->nr; if the
 * claimed index is 2000 or more the candidate is dropped. Otherwise the
 * slot is marked invalid, seeded with ref0 and ref1, expanded through
 * rounds 7 down to 0 with expand_refs() (doubling the value count each
 * time), and finally marked valid.
 */
void potential_sol(__global char **htabs, __global sols_t *sols,
	uint ref0, uint ref1)
{
    uint slot = atomic_inc(&sols->nr);

    if (slot >= 2000)
        return ;

    __global uint *values = &(sols->values[slot][0]);
    uint count = 2;
    uint round = 9 - 1;

    sols->valid[slot] = 0;
    values[0] = ref0;
    values[1] = ref1;

    /* Walk back through the rounds, doubling the reference list each time. */
    while (round > 0)
    {
        round--;
        expand_refs(values, count, htabs, round);
        count *= 2;
    }

    sols->valid[slot] = 1;
}
/*
 * Scan the last-round hash table for pairs of slots in the same row whose
 * first Xi uint matches in its low 24 bits, and hand each pair to
 * potential_sol().
 *
 * Each work-item handles the row indexed by get_global_id(0). At most 5
 * pairs per row are kept; any further pair only bumps
 * sols->likely_invalidss.
 *
 * NOTE(review): work-item 0 resets sols->nr and sols->likely_invalidss
 * here, followed only by mem_fence(). That fence does not appear to stop
 * other work-items from incrementing the counters before the reset lands.
 * Confirm whether the host (or an earlier kernel) is expected to zero
 * them.
 */
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
    enum
    {
        ROW_SLOTS = ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9),
        SLOT_BYTES = 32,
        MAX_PAIRS = 5
    };

    const uint row_id = get_global_id(0);
    __global char *htabs[2] = { ht0, ht1 };
    const uint last_ht = (9 - 1) % 2;
    const uint xi_offset = (8 + ((9 - 1) / 2) * 4);
    const uint mask = 0xffffff;
    ulong pairs[MAX_PAIRS];
    uint nr_pairs = 0;
    __global char *row;
    __global char *xi_a, *xi_b;
    uint cnt;
    uint i, j;

    if (row_id == 0)
        sols->nr = sols->likely_invalidss = 0;
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    /* The row's slot counter is its first uint; clamp it to the capacity. */
    row = htabs[last_ht] + row_id * ROW_SLOTS * SLOT_BYTES;
    cnt = min(*(__global uint *)row, (uint)ROW_SLOTS);

    for (i = 0; i < cnt; i++)
    {
        xi_a = row + xi_offset + i * SLOT_BYTES;
        for (j = i + 1; j < cnt; j++)
        {
            uint ref_a, ref_b;

            xi_b = row + xi_offset + j * SLOT_BYTES;
            if (((*(__global uint *)xi_a) & mask) !=
                ((*(__global uint *)xi_b) & mask))
                continue;

            /* the slot's reference sits in the 4 bytes before its Xi data */
            ref_a = *(__global uint *)(xi_a - 4);
            ref_b = *(__global uint *)(xi_b - 4);
            if (nr_pairs < MAX_PAIRS)
                pairs[nr_pairs++] = ((ulong)ref_a << 32) | ref_b;
            else
                atomic_inc(&sols->likely_invalidss);
        }
    }

    /* Expand every retained pair into a candidate solution. */
    for (i = 0; i < nr_pairs; i++)
        potential_sol(htabs, sols, pairs[i] >> 32,
                      pairs[i] & 0xffffffff);
}

View File

@ -1,526 +0,0 @@
# 1 "input.cl"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/usr/include/stdc-predef.h" 1 3 4
# 1 "<command-line>" 2
# 1 "input.cl"
# 1 "param.h" 1
# 60 "param.h"
/*
 * Buffer holding candidate solutions.
 * The field roles below are taken from how potential_sol() and
 * kernel_sols() elsewhere in this source use the structure.
 */
typedef struct sols_s
{
/* Number of candidates claimed so far; potential_sol() increments it
   atomically and drops any candidate whose index reaches 2000. */
uint nr;
/* Incremented by kernel_sols() when a row has more colliding pairs than
   its local collision array can hold. */
uint likely_invalidss;
/* Per-candidate flag: set to 0 while the candidate is being expanded and
   to 1 once expansion has finished. */
uchar valid[2000];
/* Per-candidate list of expanded references, (1 << 9) = 512 entries each. */
uint values[2000][(1 << 9)];
} sols_t;
# 2 "input.cl" 2
# 35 "input.cl"
/*
 * BLAKE2b initialization vector: eight 64-bit words (see RFC 7693).
 * kernel_round0 loads these into v[8]..v[15] before running the
 * compression rounds.
 */
__constant ulong blake_iv[] =
{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};
/*
 * Reset one hash-table row: each work-item clears the 32-bit slot counter
 * stored at the start of the row indexed by get_global_id(0).
 */
__kernel
void kernel_init_ht(__global char *ht)
{
    const uint row_id = get_global_id(0);
    __global char *row =
        ht + row_id * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32;

    *(__global uint *)row = 0;
}
# 79 "input.cl"
/*
 * Append one entry to the hash table `ht`.
 *
 * round      - round the entry belongs to; selects how the row index is
 *              derived from xi0, where the Xi data sits inside the slot,
 *              and how many bytes of Xi are written.
 * ht         - hash table to store into.
 * i          - reference value written in the 4 bytes preceding the Xi data.
 * xi0 .. xi3 - Xi words. The row index is taken from the low 24 bits of
 *              xi0, then the whole value is shifted right by 16 bits
 *              before being stored.
 *
 * The row's slot counter (first uint of the row) is incremented
 * atomically to claim a slot.
 *
 * Returns 1 if the row was already full (nothing is written), 0 otherwise.
 */
uint ht_store(uint round, __global char *ht, uint i,
	ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
    enum
    {
        ROW_SLOTS = ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13),
        SLOT_BYTES = 32
    };
    uint row;
    uint cnt;
    __global char *row_base;
    __global char *xi_dst;
# 110 "input.cl"
    if (round % 2)
        /* odd rounds: the row index is assembled from permuted nibbles */
        row = ((xi0 & 0xf0000) >> 0) |
              ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
              ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
    else
        /* even rounds: low 16 bits plus bits 20..23 moved down by 4 */
        row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);

    /* Drop the low 16 bits of the 256-bit value xi3:xi2:xi1:xi0. */
    const ulong w0 = (xi0 >> 16) | (xi1 << (64 - 16));
    const ulong w1 = (xi1 >> 16) | (xi2 << (64 - 16));
    const ulong w2 = (xi2 >> 16) | (xi3 << (64 - 16));

    /* Claim a slot by bumping the counter at the start of the row. */
    row_base = ht + row * ROW_SLOTS * SLOT_BYTES;
    cnt = atomic_inc((__global uint *)row_base);
    if (cnt >= ROW_SLOTS)
        return 1;

    xi_dst = row_base + (cnt * SLOT_BYTES + (8 + ((round) / 2) * 4));
    *(__global uint *)(xi_dst - 4) = i;

    /* Later rounds carry fewer Xi bytes, so less is written. */
    switch (round)
    {
    case 0:
    case 1:
        *(__global ulong *)(xi_dst + 0) = w0;
        *(__global ulong *)(xi_dst + 8) = w1;
        *(__global ulong *)(xi_dst + 16) = w2;
        break;
    case 2:
        *(__global ulong *)(xi_dst + 0) = w0;
        *(__global ulong *)(xi_dst + 8) = w1;
        *(__global uint *)(xi_dst + 16) = w2;
        break;
    case 3:
    case 4:
        *(__global ulong *)(xi_dst + 0) = w0;
        *(__global ulong *)(xi_dst + 8) = w1;
        break;
    case 5:
        *(__global ulong *)(xi_dst + 0) = w0;
        *(__global uint *)(xi_dst + 8) = w1;
        break;
    case 6:
    case 7:
        *(__global ulong *)(xi_dst + 0) = w0;
        break;
    case 8:
        *(__global uint *)(xi_dst + 0) = w0;
        break;
    default:
        /* other rounds: only the reference is written */
        break;
    }
    return 0;
}
# 187 "input.cl"
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
__global uint *debug)
{
uint tid = get_global_id(0);
ulong v[16];
uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
uint input = tid * inputs_per_thread;
uint input_end = (tid + 1) * inputs_per_thread;
uint dropped = 0;
while (input < input_end)
{
ulong word1 = (ulong)input << 32;
v[0] = blake_state[0];
v[1] = blake_state[1];
v[2] = blake_state[2];
v[3] = blake_state[3];
v[4] = blake_state[4];
v[5] = blake_state[5];
v[6] = blake_state[6];
v[7] = blake_state[7];
v[8] = blake_iv[0];
v[9] = blake_iv[1];
v[10] = blake_iv[2];
v[11] = blake_iv[3];
v[12] = blake_iv[4];
v[13] = blake_iv[5];
v[14] = blake_iv[6];
v[15] = blake_iv[7];
v[12] ^= 140 + 4 ;
v[14] ^= -1;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
ulong h[7];
h[0] = blake_state[0] ^ v[0] ^ v[8];
h[1] = blake_state[1] ^ v[1] ^ v[9];
h[2] = blake_state[2] ^ v[2] ^ v[10];
h[3] = blake_state[3] ^ v[3] ^ v[11];
h[4] = blake_state[4] ^ v[4] ^ v[12];
h[5] = blake_state[5] ^ v[5] ^ v[13];
h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
dropped += ht_store(0, ht, input * 2,
h[0],
h[1],
h[2],
h[3]);
dropped += ht_store(0, ht, input * 2 + 1,
(h[3] >> 8) | (h[4] << (64 - 8)),
(h[4] >> 8) | (h[5] << (64 - 8)),
(h[5] >> 8) | (h[6] << (64 - 8)),
(h[6] >> 8));
input++;
}
}
# 409 "input.cl"
/*
 * XOR the remaining Xi words of two colliding slots and store the result in
 * the destination hash table.
 *
 * round   - Equihash round being executed; rounds 1..8 are handled, and the
 *           amount of data XORed shrinks as the round number grows
 * ht_dst  - destination hash table, forwarded to ht_store()
 * row     - row the two slots belong to
 * slot_a, slot_b - indices of the two slots inside that row (6 bits each are
 *           kept when they are packed into the reference word)
 * a, b    - pointers to the Xi data of the two slots
 *
 * Returns 0 without storing anything when the first two XORed words are both
 * zero, or when round is outside 1..8; otherwise returns the value returned
 * by ht_store().
 */
uint xor_and_store(uint round, __global char *ht_dst, uint row,
	uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
    /*
     * Zero-initialised so that an unexpected round value yields a defined
     * "nothing to store" result instead of reading indeterminate values.
     */
    ulong xi0 = 0;
    ulong xi1 = 0;
    ulong xi2 = 0;

    switch (round)
    {
    case 1:
    case 2:
        /* three full 64-bit words */
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = a[2] ^ b[2];
        break;
    case 3:
        /* two 64-bit words followed by one 32-bit word */
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = *(__global uint *)(a + 2) ^ *(__global uint *)(b + 2);
        break;
    case 4:
    case 5:
        /* two full 64-bit words */
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        break;
    case 6:
        /* one 64-bit word followed by one 32-bit word */
        xi0 = a[0] ^ b[0];
        xi1 = *(__global uint *)(a + 1) ^ *(__global uint *)(b + 1);
        break;
    case 7:
    case 8:
        /* a single 64-bit word */
        xi0 = a[0] ^ b[0];
        break;
    default:
        /* unsupported round: leave everything zero and store nothing */
        break;
    }

    /* an all-zero XOR result is not stored */
    if (!xi0 && !xi1)
        return 0;

    /* reference word layout: row << 12 | slot_b (6 bits) << 6 | slot_a (6 bits) */
    return ht_store(round, ht_dst,
        ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
        xi0, xi1, xi2, 0);
}
/*
 * Runs one Equihash collision round on the row of ht_src selected by this
 * work-item's global id.
 *
 * The row is laid out as 26 slots of 32 bytes; the uint at the start of the
 * row is read as the number of used slots (clamped to 26). For every slot the
 * byte at xi_offset is fetched, every pair of slots whose masked bytes match
 * is recorded (up to the capacity of the local collision list), and each
 * recorded pair is handed to xor_and_store() for insertion into ht_dst.
 *
 * The debug pointer and the dropped_* counters are not consumed anywhere in
 * this function.
 */
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    uint                tid = get_global_id(0);
    uint                tlid = get_local_id(0);
    __global char       *row_base;
    uint                nr_used;
    uchar               first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 13)];
    uchar               mask;
    ushort              collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 2];
    uint                nr_coll = 0;
    uint                dropped_coll = 0;
    uint                dropped_stor = 0;
    uint                i, j, n;
    /* byte offset of the Xi data inside a slot for this round */
    uint                xi_offset = (8 + ((round - 1) / 2) * 4);
    /* even rounds read the Xi data one byte further into the slot */
    uint                adj = (round % 2 == 0) ? 1 : 0;
# 495 "input.cl"
    /* with a zero mask every pair of used slots compares equal */
    mask = 0;

    row_base = ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32;
    nr_used = min(*(__global uint *)row_base,
        (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 13));

    /* cache the first Xi byte of every used slot */
    for (i = 0; i < nr_used; i++)
        first_words[i] = *(__global uchar *)(row_base + xi_offset + i * 32);

    /* record every colliding pair as (j << 8) | i */
    for (i = 0; i < nr_used; i++)
    {
        for (j = i + 1; j < nr_used; j++)
        {
            if ((first_words[i] & mask) != (first_words[j] & mask))
                continue;
            if (nr_coll < sizeof (collisions) / sizeof (*collisions))
                collisions[nr_coll++] =
                    ((ushort)j << 8) | ((ushort)i & 0xff);
            else
                dropped_coll++;
        }
    }

    /* XOR each recorded pair and store the result in the destination table */
    for (n = 0; n < nr_coll; n++)
    {
        ushort packed = collisions[n];
        __global ulong *a, *b;

        i = packed & 0xff;
        j = packed >> 8;
        a = (__global ulong *)(row_base + i * 32 + xi_offset + adj);
        b = (__global ulong *)(row_base + j * 32 + xi_offset + adj);
        dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
    }
}
# 557 "input.cl"
/*
 * Kernel entry points for Equihash rounds 1 through 8. Each one requires a
 * work-group size of 64x1x1 and only forwards its arguments, together with
 * its fixed round number, to equihash_round().
 */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round1(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(1, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round2(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(2, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round3(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(3, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round4(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(4, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round5(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(5, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round6(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(6, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round7(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(7, ht_src, ht_dst, debug);
}

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
	__global uint *debug)
{
    equihash_round(8, ht_src, ht_dst, debug);
}
/*
 * Reads the 32-bit reference word of one slot: the uint located 4 bytes
 * before xi_offset inside slot `slot` of row `row` in hash table `ht`
 * (rows hold 26 slots of 32 bytes each).
 */
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
    __global char *slot_start =
        ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 + slot * 32;

    return *(__global uint *)(slot_start + xi_offset - 4);
}
/*
 * Expands, in place, each of the nr_inputs reference words in `ins` into the
 * two reference words stored in the slots it points at, so that `ins` holds
 * nr_inputs * 2 entries afterwards.
 *
 * The hash table is chosen with round % 2. Entries are processed from the
 * last to the first so that an entry is always read before the positions it
 * expands into are written.
 */
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
	uint round)
{
    __global char *ht = htabs[round % 2];
    uint xi_offset = (8 + ((round) / 2) * 4);
    uint src = nr_inputs - 1;
    uint dst = nr_inputs * 2 - 1;

    for (;;)
    {
        /* reference layout: row << 12 | slot_b (6 bits) << 6 | slot_a (6 bits) */
        uint ref = ins[src];
        uint row = ref >> 12;

        ins[dst] = expand_ref(ht, xi_offset, row, (ref >> 6) & 0x3f);
        ins[dst - 1] = expand_ref(ht, xi_offset, row, ref & 0x3f);

        if (src == 0)
            break;
        src--;
        dst -= 2;
    }
}
/*
 * Records a candidate solution built from the two references ref0 and ref1.
 *
 * A solution index is claimed with an atomic increment of sols->nr; claims at
 * index 2000 or above are discarded. The two references are then expanded
 * through rounds 7 down to 0, doubling the number of values each time
 * (2 -> 512). The entry is flagged invalid while it is being built and valid
 * once the expansion has finished.
 */
void potential_sol(__global char **htabs, __global sols_t *sols,
	uint ref0, uint ref1)
{
    uint sol_i = atomic_inc(&sols->nr);
    uint nr_values;
    uint round;

    if (sol_i >= 2000)
        return ;

    sols->valid[sol_i] = 0;
    sols->values[sol_i][0] = ref0;
    sols->values[sol_i][1] = ref1;
    nr_values = 2;

    round = 9 - 1;
    while (round > 0)
    {
        round--;
        expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
        nr_values *= 2;
    }

    sols->valid[sol_i] = 1;
}
/*
 * Scans the row of the last-round hash table selected by this work-item's
 * global id for pairs of slots whose Xi words agree in their low 24 bits, and
 * turns every such pair into a candidate solution via potential_sol().
 *
 * At most 5 pairs are kept per row; further pairs only bump
 * sols->likely_invalidss. Work-item 0 resets the two counters in `sols`
 * before the scan.
 * NOTE(review): only a mem_fence separates that reset from the atomic
 * increments done by other work-items - confirm the host does not depend on
 * an ordering between them.
 */
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
    uint                tid = get_global_id(0);
    __global char       *htabs[2] = { ht0, ht1 };
    uint                ht_i = (9 - 1) % 2;
    uint                xi_offset = (8 + ((9 - 1) / 2) * 4);
    uint                mask = 0xffffff;
    ulong               collisions[5];
    uint                coll = 0;
    __global char       *row, *a, *b;
    uint                cnt;
    uint                i, j;

    if (tid == 0)
    {
        sols->likely_invalidss = 0;
        sols->nr = 0;
    }
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    /* the uint at the start of the row is the number of used slots */
    row = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32;
    cnt = min(*(__global uint *)row,
        (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 13));

    for (i = 0; i < cnt; i++)
    {
        a = row + xi_offset + i * 32;
        for (j = i + 1; j < cnt; j++)
        {
            b = row + xi_offset + j * 32;
            if (((*(__global uint *)a) & mask) !=
                ((*(__global uint *)b) & mask))
                continue;

            /* the reference word sits 4 bytes before the Xi data */
            uint ref_i = *(__global uint *)(a - 4);
            uint ref_j = *(__global uint *)(b - 4);

            if (coll < sizeof (collisions) / sizeof (*collisions))
                collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
            else
                atomic_inc(&sols->likely_invalidss);
        }
    }

    if (coll == 0)
        return ;

    for (i = 0; i < coll; i++)
        potential_sol(htabs, sols, collisions[i] >> 32,
            collisions[i] & 0xffffffff);
}

View File

@ -1,2 +0,0 @@
#include "AvailableSolvers.h"

View File

@ -21,6 +21,7 @@ CREATE_SOLVER_STUB(cpu_xenoncat, "cpu_xenoncat_STUB")
CREATE_SOLVER_STUB(cuda_tromp, "cuda_tromp_STUB")
CREATE_SOLVER_STUB(cuda_djezo, "cuda_djezo_STUB")
#endif
// OpenCL solvers are dropped; replace with new OS solvers
#ifdef USE_OCL_XMP
#include "../ocl_xpm/ocl_xmp.hpp"
#else

View File

@ -5,12 +5,9 @@ VisualStudioVersion = 12.0.40629.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nheqminer", "nheqminer.vcxproj", "{6FF7D209-05A3-4550-93CC-211D33503719}"
ProjectSection(ProjectDependencies) = postProject
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48} = {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}
{299E011B-5242-4EDA-B2F2-73C9B48F12FD} = {299E011B-5242-4EDA-B2F2-73C9B48F12FD}
{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B} = {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}
{33C2B469-F025-4223-B9B6-E69D42FEA7D6} = {33C2B469-F025-4223-B9B6-E69D42FEA7D6}
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135} = {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}
{5EC9EDEB-8E49-4126-9161-1560683CBC71} = {5EC9EDEB-8E49-4126-9161-1560683CBC71}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_tromp", "..\cuda_tromp\cuda_tromp.vcxproj", "{33C2B469-F025-4223-B9B6-E69D42FEA7D6}"
@ -19,12 +16,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu_xenoncat", "..\cpu_xeno
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu_tromp", "..\cpu_tromp\cpu_tromp.vcxproj", "{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_xpm", "..\ocl_xpm\ocl_xpm.vcxproj", "{5EC9EDEB-8E49-4126-9161-1560683CBC71}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_device_utils", "..\ocl_device_utils\ocl_device_utils.vcxproj", "{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_silentarmy", "..\ocl_silentarmy\ocl_silentarmy.vcxproj", "{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_djezo", "..\cuda_djezo\cuda_djezo.vcxproj", "{268B10AD-D845-498B-8663-AB8911CA2039}"
EndProject
Global
@ -73,33 +64,6 @@ Global
{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Win32.ActiveCfg = ReleaseSSE2|x64
{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|x64.ActiveCfg = ReleaseSSE2|x64
{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|x64.Build.0 = ReleaseSSE2|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Win32.ActiveCfg = Debug|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|x64.ActiveCfg = Debug|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|x64.Build.0 = Debug|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Win32.ActiveCfg = Release|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|x64.ActiveCfg = Release|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|x64.Build.0 = Release|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Win32.ActiveCfg = Release|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|x64.ActiveCfg = Release|x64
{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|x64.Build.0 = Release|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Win32.ActiveCfg = Debug|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|x64.ActiveCfg = Debug|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|x64.Build.0 = Debug|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Win32.ActiveCfg = Release|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|x64.ActiveCfg = Release|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|x64.Build.0 = Release|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Win32.ActiveCfg = Release|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|x64.ActiveCfg = Release|x64
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|x64.Build.0 = Release|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Win32.ActiveCfg = Debug|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|x64.ActiveCfg = Debug|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|x64.Build.0 = Debug|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Win32.ActiveCfg = Release|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|x64.ActiveCfg = Release|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|x64.Build.0 = Release|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Win32.ActiveCfg = Release|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|x64.ActiveCfg = Release|x64
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|x64.Build.0 = Release|x64
{268B10AD-D845-498B-8663-AB8911CA2039}.Debug|Win32.ActiveCfg = Debug|x64
{268B10AD-D845-498B-8663-AB8911CA2039}.Debug|x64.ActiveCfg = Debug|x64
{268B10AD-D845-498B-8663-AB8911CA2039}.Debug|x64.Build.0 = Debug|x64

View File

@ -84,7 +84,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<AdditionalOptions>-D_WIN32_WINNT=0x0601 %(AdditionalOptions)</AdditionalOptions>
<DisableSpecificWarnings>4068;4996;4503;4267;4180;4290;4244;4800;4334;4251</DisableSpecificWarnings>
@ -153,7 +153,6 @@
<ClCompile Include="amount.cpp" />
<ClCompile Include="api.cpp" />
<ClCompile Include="arith_uint256.cpp" />
<ClCompile Include="AvailableSolvers.cpp" />
<ClCompile Include="crypto\sha256.cpp" />
<ClCompile Include="json\json_spirit_reader.cpp">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">/bigobj %(AdditionalOptions)</AdditionalOptions>

View File

@ -226,9 +226,6 @@
<ClCompile Include="utilstrencodings.cpp">
<Filter>Source Files\stuff</Filter>
</ClCompile>
<ClCompile Include="AvailableSolvers.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="MinerFactory.cpp">
<Filter>Source Files</Filter>
</ClCompile>

View File

@ -1,15 +0,0 @@
#pragma once
#include <string>
// Describes one OpenCL device for listing purposes. Only AMD devices will
// have an additional BusID.
// NOTE(review): this struct has no BusID field - confirm where that value is kept.
struct OpenCLDevice {
// Device number; ocl_device_utils::QueryDevices() stores the device's index
// within its platform's device list here.
unsigned int DeviceID;
// The remaining members are named after the OpenCL info query whose result
// they hold (e.g. _CL_DEVICE_NAME <- CL_DEVICE_NAME).
std::string _CL_DEVICE_NAME;
// Device type as text; QueryDevices() writes "CPU", "GPU", "ACCELERATOR" or "DEFAULT".
std::string _CL_DEVICE_TYPE;
unsigned long long _CL_DEVICE_GLOBAL_MEM_SIZE;
std::string _CL_DEVICE_VENDOR;
std::string _CL_DEVICE_VERSION;
std::string _CL_DRIVER_VERSION;
};

File diff suppressed because it is too large Load Diff

View File

@ -1,146 +0,0 @@
#include "ocl_device_utils.h"
#include <iostream>
#include <stdexcept>
#include <utility>
#include <algorithm>
using namespace std;
using namespace cl;
// Set by the first QueryDevices() call; later calls return without querying again.
bool ocl_device_utils::_hasQueried = false;
// Names of the platforms QueryDevices() has already recorded (used to skip duplicates).
std::vector<std::string> ocl_device_utils::_platformNames;
// One entry per recorded platform, each carrying that platform's device list.
std::vector<PrintInfo> ocl_device_utils::_devicesPlatformsDevices;
// Returns the available OpenCL platforms.
//
// When the platform query fails with CL_PLATFORM_NOT_FOUND_KHR (if that code
// is known at compile time) a message is printed and an empty vector is
// returned. Any other cl::Error is propagated to the caller.
vector<Platform> ocl_device_utils::getPlatforms() {
    vector<Platform> platforms;
    try {
        Platform::get(&platforms);
    }
    catch (Error const& err) {
        bool platformNotFound = false;
#if defined(CL_PLATFORM_NOT_FOUND_KHR)
        platformNotFound = (err.err() == CL_PLATFORM_NOT_FOUND_KHR);
#else
        (void)err;
#endif
        if (!platformNotFound) {
            // Rethrow the exception object that was caught rather than a copy of it.
            throw;
        }
        cout << "No OpenCL platforms found" << endl;
    }
    return platforms;
}
void ocl_device_utils::print_opencl_devices() {
ocl_device_utils::QueryDevices();
ocl_device_utils::PrintDevices();
}
// Returns the GPU and accelerator devices of platform _platformId in
// _platforms (CPU devices are deliberately not requested).
//
// A CL_DEVICE_NOT_FOUND error is treated as "no devices" and yields an empty
// vector; any other cl::Error is propagated to the caller. _platformId is
// used as an index without a range check, so it must be valid for _platforms.
vector<Device> ocl_device_utils::getDevices(vector<Platform> const& _platforms, unsigned _platformId) {
    vector<Device> devices;
    try {
        _platforms[_platformId].getDevices(/*CL_DEVICE_TYPE_CPU| */CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, &devices);
    }
    catch (Error const& err) {
        // if simply no devices found return empty vector
        if (err.err() != CL_DEVICE_NOT_FOUND) {
            // Rethrow the exception object that was caught rather than a copy of it.
            throw;
        }
    }
    return devices;
}
// Returns a copy of str cut off at its first NUL character, so that a
// terminator stored inside the string's data is not kept as content.
string ocl_device_utils::StringnNullTerminatorFix(const string& str) {
    // Constructing from the C string stops at the first NUL.
    return string(str.c_str());
}
bool ocl_device_utils::QueryDevices() {
if (!_hasQueried) {
_hasQueried = true;
try {
// get platforms
auto platforms = getPlatforms();
if (platforms.empty()) {
cout << "No OpenCL platforms found" << endl;
return false;
}
else {
for (auto i_pId = 0u; i_pId < platforms.size(); ++i_pId) {
string platformName = StringnNullTerminatorFix(platforms[i_pId].getInfo<CL_PLATFORM_NAME>());
if (std::find(_platformNames.begin(), _platformNames.end(), platformName) == _platformNames.end()) {
PrintInfo current;
_platformNames.push_back(platformName);
// new
current.PlatformName = platformName;
current.PlatformNum = i_pId;
auto clDevs = getDevices(platforms, i_pId);
for (auto i_devId = 0u; i_devId < clDevs.size(); ++i_devId) {
OpenCLDevice curDevice;
curDevice.DeviceID = i_devId;
curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_NAME>());
switch (clDevs[i_devId].getInfo<CL_DEVICE_TYPE>()) {
case CL_DEVICE_TYPE_CPU:
curDevice._CL_DEVICE_TYPE = "CPU";
break;
case CL_DEVICE_TYPE_GPU:
curDevice._CL_DEVICE_TYPE = "GPU";
break;
case CL_DEVICE_TYPE_ACCELERATOR:
curDevice._CL_DEVICE_TYPE = "ACCELERATOR";
break;
default:
curDevice._CL_DEVICE_TYPE = "DEFAULT";
break;
}
curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = clDevs[i_devId].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VENDOR>());
curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VERSION>());
curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DRIVER_VERSION>());
current.Devices.push_back(curDevice);
}
_devicesPlatformsDevices.push_back(current);
}
}
}
}
catch (exception &ex) {
// TODO
cout << "ocl_device_utils::QueryDevices() exception: " << ex.what() << endl;
return false;
}
return true;
}
return false;
}
int ocl_device_utils::GetCountForPlatform(int platformID) {
for (const auto &platInfo : _devicesPlatformsDevices)
{
if (platformID == platInfo.PlatformNum) {
return platInfo.Devices.size();
}
}
return 0;
}
// Prints the total number of recorded OpenCL devices followed by, for each
// recorded platform, its name, its number and one line per device (device
// id, name and type).
void ocl_device_utils::PrintDevices() {
    size_t allDevsCount = 0;
    for (const auto &platInfo : _devicesPlatformsDevices) {
        allDevsCount += platInfo.Devices.size();
    }
    cout << "Number of OpenCL devices found: " << allDevsCount << endl;

    for (const auto &platInfo : _devicesPlatformsDevices) {
        cout << "\tPlatform: " << platInfo.PlatformName << " | " << "PlatformNum: " << platInfo.PlatformNum << endl;
        cout << "\t\tDevices: " << endl;
        // device print
        for (const auto &dev : platInfo.Devices) {
            cout << "\t\t\t#" << dev.DeviceID << " " << dev._CL_DEVICE_NAME << " | " << dev._CL_DEVICE_TYPE << endl;
        }
    }
}

View File

@ -1,34 +0,0 @@
#pragma once
#define __CL_ENABLE_EXCEPTIONS
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include "cl_ext.hpp"
#include <map>
#include <vector>
#include "OpenCLDevice.h"
// One OpenCL platform together with the devices recorded for it.
struct PrintInfo {
// Platform name as reported by the CL_PLATFORM_NAME query.
std::string PlatformName;
// Index of the platform in the list returned by the platform query.
int PlatformNum;
// Devices recorded for this platform.
std::vector<OpenCLDevice> Devices;
};
// Static helpers that enumerate OpenCL platforms/devices once, cache the
// result and print it.
class ocl_device_utils {
public:
// Enumerates platforms and devices on the first call. Returns true only when
// that first call completes; returns false on failure, when no platform is
// found, and on every later call.
static bool QueryDevices();
// Prints the cached platform/device list to standard output.
static void PrintDevices();
// Number of cached devices for the platform numbered platformID (0 if unknown).
static int GetCountForPlatform(int platformID);
// Convenience wrapper: QueryDevices() followed by PrintDevices().
static void print_opencl_devices();
private:
// GPU and accelerator devices of _platforms[_platformId]; empty if none are found.
static std::vector<cl::Device> getDevices(std::vector<cl::Platform> const& _platforms, unsigned _platformId);
// Available platforms; empty when none are found.
static std::vector<cl::Platform> getPlatforms();
// True once QueryDevices() has been called.
static bool _hasQueried;
// Names of the platforms already recorded.
static std::vector<std::string> _platformNames;
// Cached per-platform device lists.
static std::vector<PrintInfo> _devicesPlatformsDevices;
// Returns str truncated at its first NUL character.
static std::string StringnNullTerminatorFix(const std::string& str);
};

View File

@ -1,95 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClInclude Include="cl_ext.hpp" />
<ClInclude Include="ocl_device_utils.h" />
<ClInclude Include="opencl.h" />
<ClInclude Include="OpenCLDevice.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="ocl_device_utils.cpp" />
<ClCompile Include="opencl.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>ocl_device_utils</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(AMDAPPSDKROOT)\include\</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@ -1,13 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClInclude Include="cl_ext.hpp" />
<ClInclude Include="ocl_device_utils.h" />
<ClInclude Include="OpenCLDevice.h" />
<ClInclude Include="opencl.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="ocl_device_utils.cpp" />
<ClCompile Include="opencl.cpp" />
</ItemGroup>
</Project>

View File

@ -1,174 +0,0 @@
#include "opencl.h"
#include <fstream>
#include <vector>
#include <memory>
#include <stdio.h>
// OpenCL platform handle; clInitialize() assigns the selected platform to it.
extern cl_platform_id gPlatform;
// extern cl_program gProgram;
bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus)
{
cl_platform_id platforms[64];
cl_uint numPlatforms;
OCLR(clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, &numPlatforms), false);
if (!numPlatforms) {
printf("<error> no OpenCL platforms found\n");
return false;
}
/*int platformIdx = -1;
if (requiredPlatform) {
for (decltype(numPlatforms) i = 0; i < numPlatforms; i++) {
char name[1024] = {0};
OCLR(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, 0), false);
printf("found platform[%i] name = '%s'\n", (int)i, name);
if (strcmp(name, requiredPlatform) == 0) {
platformIdx = i;
break;
}
}
} else {
platformIdx = 0;
}*/
int platformIdx = requiredPlatform;
if (platformIdx == -1) {
printf("<error> platform %s not exists\n", requiredPlatform);
return false;
}
gPlatform = platforms[platformIdx];
cl_uint numDevices = 0;
cl_device_id devices[64];
clGetDeviceIDs(gPlatform, CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &numDevices);
if (numDevices) {
printf("<info> found %d devices\n", numDevices);
} else {
printf("<error> no OpenCL GPU devices found.\n");
return false;
}
for (decltype(numDevices) i = 0; i < numDevices; i++) {
gpus.push_back(devices[i]);
}
return true;
}
/*
 * Build (or load from cache) the OpenCL program for one device.
 *
 * If the file `binaryName` cannot be opened, all files in `sources` are
 * concatenated, compiled with `arguments`, and the resulting device binary is
 * written to `binaryName`. In every case the program finally returned through
 * `gProgram` is created from the contents of `binaryName`.
 *
 * gContext    context the program is created in
 * gpu         the single device to build for
 * binaryName  path of the cached binary
 * sources     paths of the OpenCL source files
 * arguments   build options passed to clBuildProgram for the source build
 * binstatus   receives the per-device load status from clCreateProgramWithBinary
 * gProgram    receives the built program; set to 0 whenever this function has
 *             released an intermediate program
 *
 * Returns true on success, false (after printing a message) otherwise.
 */
bool clCompileKernel(cl_context gContext,
                     cl_device_id gpu,
                     const char *binaryName,
                     const std::vector<const char*> &sources,
                     const char *arguments,
                     cl_int *binstatus,
                     cl_program *gProgram)
{
    std::ifstream testfile(binaryName);
    if (!testfile) {
        printf("<info> compiling ...\n");

        // Concatenate all source files into one string.
        std::string sourceFile;
        for (auto &path : sources) {
            // Check the stream state instead of relying on the exception type
            // thrown by the standard library for a failed open.
            std::ifstream stream(path);
            if (!stream) {
                fprintf(stderr, "<error> cannot open %s\n", path);
                return false;
            }
            std::string str((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
            sourceFile.append(str);
        }
        printf("<info> source: %u bytes\n", (unsigned)sourceFile.size());
        if (sourceFile.size() < 1) {
            fprintf(stderr, "<error> source files not found or empty\n");
            return false;
        }

        cl_int error;
        const char *sourceList[] = { sourceFile.c_str(), 0 };
        *gProgram = clCreateProgramWithSource(gContext, 1, sourceList, 0, &error);
        OCLR(error, false);

        if (clBuildProgram(*gProgram, 1, &gpu, arguments, 0, 0) != CL_SUCCESS) {
            size_t logSize = 0;
            clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);
            // One extra zeroed byte guarantees termination even for an empty log.
            std::unique_ptr<char[]> log(new char[logSize + 1]());
            clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, logSize, log.get(), 0);
            printf("%s\n", log.get());
            // Do not leak the failed program, and do not leave a stale handle behind.
            clReleaseProgram(*gProgram);
            *gProgram = 0;
            return false;
        }

        size_t binsize = 0;
        OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binsize, 0), false);
        if (!binsize) {
            printf("<error> no binary available!\n");
            return false;
        }
        printf("<info> binsize = %u bytes\n", (unsigned)binsize);

        // CL_PROGRAM_BINARIES expects an array of destination pointers, one per
        // device; pass the raw buffer pointer, not the address of the smart pointer.
        std::unique_ptr<unsigned char[]> binary(new unsigned char[binsize + 1]);
        unsigned char *binaryPtrs[] = { binary.get() };
        OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARIES, sizeof(binaryPtrs), binaryPtrs, 0), false);
        {
            std::ofstream bin(binaryName, std::ofstream::binary | std::ofstream::trunc);
            bin.write((const char*)binary.get(), binsize);
            bin.close();
        }
        OCLR(clReleaseProgram(*gProgram), false);
        *gProgram = 0;
    }

    // Load the (possibly just written) binary from disk.
    std::ifstream bfile(binaryName, std::ifstream::binary);
    if (!bfile) {
        printf("<error> %s not found\n", binaryName);
        return false;
    }
    bfile.seekg(0, bfile.end);
    const std::streamoff fileLen = bfile.tellg();   // -1 when the position cannot be determined
    bfile.seekg(0, bfile.beg);
    if (fileLen <= 0) {
        printf("<error> %s empty\n", binaryName);
        return false;
    }
    size_t binsize = (size_t)fileLen;
    std::vector<char> binary(binsize + 1);
    bfile.read(&binary[0], binsize);
    if (!bfile) {
        printf("<error> %s read failed\n", binaryName);
        return false;
    }
    bfile.close();

    cl_int error;
    const unsigned char *binaryPtr = (const unsigned char*)&binary[0];
    *gProgram = clCreateProgramWithBinary(gContext, 1, &gpu, &binsize, &binaryPtr, binstatus, &error);
    OCLR(error, false);
    OCLR(clBuildProgram(*gProgram, 1, &gpu, 0, 0, 0), false);
    return true;
}

View File

@ -1,131 +0,0 @@
/*
* opencl.h
*
* Created on: 01.05.2014
* Author: mad
*/
#ifndef OPENCL_H_
#define OPENCL_H_
#pragma warning(disable: 4996)
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <vector>
// extern cl_context gContext;
/*
 * Error-check helpers for OpenCL calls. Each evaluates `error` once into a
 * local cl_int and, when it is non-zero, prints the code with file and line.
 * Each expands to a bare `if (...) { ... }` statement, so an `else` placed
 * directly after an invocation would bind to the macro's `if`.
 */
/* OCL: on error, return from the enclosing void function. */
#define OCL(error) \
if(cl_int err = error){ \
printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
return; \
}
/* OCLR: on error, return `ret` from the enclosing function. */
#define OCLR(error, ret) \
if(cl_int err = error){ \
printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
return ret; \
}
/* OCLE: on error, terminate the process with the error code as exit status. */
#define OCLE(error) \
if(cl_int err = error){ \
printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
exit(err); \
}
template<typename T>
class clBuffer {
public:
clBuffer() {
Size = 0;
HostData = 0;
DeviceData = 0;
}
~clBuffer() {
if(HostData)
delete [] HostData;
if(DeviceData)
clReleaseMemObject(DeviceData);
}
void init(cl_context gContext, int size, cl_mem_flags flags = 0) {
Size = size;
if(!(flags & CL_MEM_HOST_NO_ACCESS)){
HostData = new T[Size];
memset(HostData, 0, Size*sizeof(T));
}else
HostData = 0;
//printf("clCreateBuffer: size = %d, %d bytes\n", Size, Size*sizeof(T));
cl_int error;
if (flags & CL_MEM_HOST_NO_ACCESS)
flags = CL_MEM_READ_WRITE;
DeviceData = clCreateBuffer(gContext, flags, Size*sizeof(T), 0, &error);
OCL(error);
}
void copyToDevice(cl_command_queue cq, bool blocking = true) {
OCL(clEnqueueWriteBuffer(cq, DeviceData, blocking, 0, Size*sizeof(T), HostData, 0, 0, 0));
}
void copyToHost(cl_command_queue cq, bool blocking = true, unsigned size = 0) {
if(size == 0)
size = Size;
OCL(clEnqueueReadBuffer(cq, DeviceData, blocking, 0, size*sizeof(T), HostData, 0, 0, 0));
}
T& get(int index) {
return HostData[index];
}
T& operator[](int index) {
return HostData[index];
}
public:
int Size;
T* HostData;
cl_mem DeviceData;
};
// Selects platform number `requiredPlatform` and appends its GPU devices to `gpus`.
// Returns false when no platform or no GPU device is available.
bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus);
// Builds the program for `gpu`, using `binaryName` as an on-disk cache of the
// compiled binary; the source files in `sources` are compiled with `arguments`
// only when that file cannot be opened. The resulting program is returned
// through `gProgram`, the binary load status through `binstatus`.
// Returns false on any failure.
bool clCompileKernel(cl_context gContext,
cl_device_id gpu,
const char *binaryName,
const std::vector<const char*> &sources,
const char *arguments,
cl_int *binstatus,
cl_program *gProgram);
#endif /* OPENCL_H_ */

View File

@ -1,536 +0,0 @@
#include "ocl_silentarmy.hpp"
//#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <sys/types.h>
//#include <sys/time.h>
#include <sys/stat.h>
#include <fcntl.h>
//#include <unistd.h>
//#include <getopt.h>
#include <errno.h>
#include "opencl.h"
#include <fstream>
#include "sa_blake.h"
// Short type names used by the declarations in param.h (sols_t uses uint and
// uchar), so they must be defined before that header is included.
typedef uint8_t uchar;
typedef uint32_t uint;
typedef uint64_t ulong;
#include "param.h"
// NOTE: both macros evaluate their arguments more than once.
#define MIN(A, B) (((A) < (B)) ? (A) : (B))
#define MAX(A, B) (((A) > (B)) ? (A) : (B))
// Aliases for the PARAM_N / PARAM_K values from param.h.
#define WN PARAM_N
#define WK PARAM_K
// Sizes derived from WN and WK.
#define COLLISION_BIT_LENGTH (WN / (WK+1))
#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
#define NDIGITS (WK+1)
#define DIGITBITS (WN/(NDIGITS))
// Number of 32-bit input indices in one solution.
#define PROOFSIZE (1u<<WK)
// Evaluates to 1344 for PARAM_N = 200, PARAM_K = 9; solve() sizes its proof
// buffer as twice this value.
#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
// Pair of drop counters.
// NOTE(review): not referenced by any host code in this file; presumably the
// layout of the kernel's debug buffer - confirm against kernel.cl.
typedef struct debug_s
{
uint32_t dropped_coll;
uint32_t dropped_stor;
} debug_t;
struct OclContext {
cl_context _context;
cl_program _program;
cl_device_id _dev_id;
cl_platform_id platform_id = 0;
cl_command_queue queue;
cl_kernel k_init_ht;
cl_kernel k_rounds[PARAM_K];
cl_kernel k_sols;
cl_mem buf_ht[2], buf_sols, buf_dbg;
size_t global_ws;
size_t local_work_size = 64;
sols_t *sols;
bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock);
~OclContext() {
clReleaseMemObject(buf_dbg);
clReleaseMemObject(buf_ht[0]);
clReleaseMemObject(buf_ht[1]);
free(sols);
}
};
cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
void *host_ptr);
bool OclContext::init(
cl_device_id dev,
unsigned int threadsNum,
unsigned int threadsPerBlock)
{
cl_int error;
queue = clCreateCommandQueue(_context, dev, 0, &error);
#ifdef ENABLE_DEBUG
size_t dbg_size = NR_ROWS;
#else
size_t dbg_size = 1;
#endif
buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE |
CL_MEM_HOST_NO_ACCESS, dbg_size, NULL);
buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, HT_SIZE, NULL);
buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, HT_SIZE, NULL);
buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t),
NULL);
fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * HT_SIZE / 1e6);
k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error);
for (unsigned i = 0; i < WK; i++) {
char kernelName[128];
sprintf(kernelName, "kernel_round%d", i);
k_rounds[i] = clCreateKernel(_program, kernelName, &error);
}
sols = (sols_t *)malloc(sizeof(*sols));
k_sols = clCreateKernel(_program, "kernel_sols", &error);
return true;
}
///
// File-scope settings with external linkage.
// NOTE(review): neither variable is read anywhere in this file - confirm
// whether another translation unit uses them.
int verbose = 0;
uint32_t show_encoded = 0;
/*
 * clCreateBuffer wrapper: prints the status code when the call fails or
 * yields a null buffer. The (possibly null) buffer is returned either way.
 */
cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
                            void *host_ptr)
{
    cl_int err;
    cl_mem buffer = clCreateBuffer(ctx, flags, size, host_ptr, &err);
    const bool created = (err == CL_SUCCESS) && buffer;
    if (!created)
        printf("clCreateBuffer (%d)\n", err);
    return buffer;
}
/*
 * Bind the buffer `*a` to argument slot `a_pos` of kernel `k`;
 * a failure is reported on stdout and otherwise ignored.
 */
void check_clSetKernelArg(cl_kernel k, cl_uint a_pos, cl_mem *a)
{
    const cl_int rc = clSetKernelArg(k, a_pos, sizeof(cl_mem), a);
    if (rc == CL_SUCCESS)
        return;
    printf("clSetKernelArg (%d)\n", rc);
}
/*
 * clEnqueueNDRangeKernel wrapper taking the same arguments as the OpenCL
 * call; a failure is reported on stdout and otherwise ignored.
 */
void check_clEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel k, cl_uint
    work_dim, const size_t *global_work_offset, const size_t
    *global_work_size, const size_t *local_work_size, cl_uint
    num_events_in_wait_list, const cl_event *event_wait_list, cl_event
    *event)
{
    // OpenCL status codes are signed (errors are negative), so keep them in a
    // cl_int; this also matches the %d conversion used below.
    cl_int status;
    status = clEnqueueNDRangeKernel(queue, k, work_dim, global_work_offset,
        global_work_size, local_work_size, num_events_in_wait_list,
        event_wait_list, event);
    if (status != CL_SUCCESS)
        printf("clEnqueueNDRangeKernel (%d)\n", status);
}
/*
 * clEnqueueReadBuffer wrapper taking the same arguments as the OpenCL call;
 * a failure is reported on stdout and otherwise ignored.
 */
void check_clEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer, cl_bool
    blocking_read, size_t offset, size_t size, void *ptr, cl_uint
    num_events_in_wait_list, const cl_event *event_wait_list, cl_event
    *event)
{
    const cl_int rc = clEnqueueReadBuffer(queue, buffer, blocking_read,
                                          offset, size, ptr,
                                          num_events_in_wait_list,
                                          event_wait_list, event);
    if (rc == CL_SUCCESS)
        return;
    printf("clEnqueueReadBuffer (%d)\n", rc);
}
/* Write the a_len bytes at `a` to stderr as lowercase hex, two digits per byte. */
void hexdump(uint8_t *a, uint32_t a_len)
{
    uint32_t pos = 0;
    while (pos < a_len) {
        fprintf(stderr, "%02x", a[pos]);
        ++pos;
    }
}
/*
 * Format a_len bytes at `_a` as a lowercase hex string.
 *
 * The result lives in a static 1024-byte buffer that is overwritten by the
 * next call, so the function is not reentrant. Input longer than the buffer
 * can hold (511 bytes) is truncated instead of being written past its end.
 */
char *s_hexdump(const void *_a, uint32_t a_len)
{
    static const char digits[] = "0123456789abcdef";
    static char buf[1024];
    const uint8_t *a = (const uint8_t *)_a;
    /* Each byte takes two characters; one byte is reserved for the NUL. */
    const size_t max_bytes = (sizeof(buf) - 1) / 2;
    const size_t count = a_len < max_bytes ? a_len : max_bytes;
    size_t i;

    for (i = 0; i < count; i++) {
        buf[2 * i] = digits[a[i] >> 4];
        buf[2 * i + 1] = digits[a[i] & 0x0f];
    }
    buf[2 * count] = 0;
    return buf;
}
/*
 * Decode the hex digit base[off] to its value 0..15.
 * A character that is not a hex digit is reported on stdout and decodes to 0.
 */
uint8_t hex2val(const char *base, size_t off)
{
    const char digit = base[off];

    if ('0' <= digit && digit <= '9')
        return (uint8_t)(digit - '0');
    if ('a' <= digit && digit <= 'f')
        return (uint8_t)(digit - 'a' + 10);
    if ('A' <= digit && digit <= 'F')
        return (uint8_t)(digit - 'A' + 10);

    printf("Invalid hex char at offset %zd: ...%c...\n", off, digit);
    return 0;
}
/*
 * Look up the number of compute units for a GPU model name.
 * Only "rx480" is known; any other name is reported on stderr and yields 0.
 */
unsigned nr_compute_units(const char *gpu)
{
    static const struct {
        const char *model;
        unsigned units;
    } known[] = {
        { "rx480", 36 },
    };

    for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
        if (strcmp(gpu, known[k].model) == 0)
            return known[k].units;
    }
    fprintf(stderr, "Unknown GPU: %s\n", gpu);
    return 0;
}
/*
 * Pack the low (PREFIX + 1) bits of each of the n values in `inputs` into a
 * contiguous big-endian bit stream written to `out`.
 *
 * Only complete bytes are emitted: bits still pending in the accumulator when
 * the last input has been consumed are not written.
 *
 * out     destination; must hold n * (PREFIX + 1) / 8 bytes
 * inputs  values to pack
 * n       number of values
 */
static void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
{
    uint32_t byte_pos = 0;              // index of the input being consumed
    int32_t bits_left = PREFIX + 1;     // bits of inputs[byte_pos] not yet emitted
    uint8_t x = 0;                      // output byte under construction
    uint8_t x_bits_used = 0;            // number of bits already placed in x
    uint8_t *pOut = out;
    while (byte_pos < n)
    {
        if (bits_left >= 8 - x_bits_used)
        {
            // Enough input bits remain to fill x completely; the assignment
            // to the 8-bit x drops everything above the byte.
            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
            bits_left -= 8 - x_bits_used;
            x_bits_used = 8;
        }
        else if (bits_left > 0)
        {
            // Fewer bits remain than x has room for: place them just below
            // the bits already used. The mask is built from an unsigned
            // all-ones value because left-shifting a negative int is undefined.
            uint32_t mask = ~(~0u << (8 - x_bits_used));
            mask = ((~mask) >> bits_left) & mask;
            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
            x_bits_used += bits_left;
            bits_left = 0;
        }
        else
        {
            // Current input exhausted: advance to the next one.
            assert(!bits_left);
            byte_pos++;
            bits_left = PREFIX + 1;
        }
        if (x_bits_used == 8)
        {
            *pOut++ = x;
            x = x_bits_used = 0;
        }
    }
}
/*
 * Print the build log of `program` for `device` to stderr.
 *
 * The log size is queried first and the buffer is allocated on the heap, so
 * no large array is placed on the stack. If either query fails, the status
 * is printed to stdout and nothing is written to stderr.
 */
void get_program_build_log(cl_program program, cl_device_id device)
{
    size_t log_size = 0;
    cl_int status = clGetProgramBuildInfo(program, device,
        CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    if (status != CL_SUCCESS) {
        printf("clGetProgramBuildInfo (%d)\n", status);
        return;
    }

    // One extra zero byte keeps the text terminated whatever the driver returns.
    std::vector<char> log(log_size + 1, '\0');
    if (log_size) {
        status = clGetProgramBuildInfo(program, device,
            CL_PROGRAM_BUILD_LOG, log_size, &log[0], NULL);
        if (status != CL_SUCCESS) {
            printf("clGetProgramBuildInfo (%d)\n", status);
            return;
        }
    }
    fprintf(stderr, "%s\n", &log[0]);
}
/*
 * Compute the global work size for the Blake kernel.
 *
 * Starts from threads-per-wavefront * wavefronts-per-SIMD * SIMDs-per-CU *
 * compute units (for the hard-coded "rx480"), then grows the value in steps
 * of 64 until it divides NR_INPUTS evenly.
 */
size_t select_work_size_blake(void)
{
    const size_t threads_per_wavefront = 64;
    const size_t simds_per_compute_unit = 4;

    size_t work_size = threads_per_wavefront;
    work_size *= BLAKE_WPS;                 /* wavefronts per SIMD */
    work_size *= simds_per_compute_unit;
    work_size *= nr_compute_units("rx480");

    /* Step by one wavefront until the size evenly divides the input count. */
    for (; NR_INPUTS % work_size != 0; work_size += 64)
        ;
    return work_size;
}
/*
 * Enqueue the hash-table initialisation kernel on `buf_ht`: one work item
 * per row (NR_ROWS), in work groups of 64. Failures are only reported.
 */
static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht)
{
    const size_t row_count = NR_ROWS;
    const size_t group_size = 64;
    cl_int status;
#if 0
    uint32_t pat = -1;
    status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0,
        NR_ROWS * NR_SLOTS * SLOT_LEN,
        0, // cl_uint num_events_in_wait_list
        NULL, // cl_event *event_wait_list
        NULL); // cl_event *event
    if (status != CL_SUCCESS)
        fatal("clEnqueueFillBuffer (%d)\n", status);
#endif
    /* The table is the kernel's only argument. */
    status = clSetKernelArg(k_init_ht, 0, sizeof(cl_mem), &buf_ht);
    if (status != CL_SUCCESS)
        printf("clSetKernelArg (%d)\n", status);

    check_clEnqueueNDRangeKernel(queue, k_init_ht,
        1,              /* work_dim */
        NULL,           /* global_work_offset */
        &row_count,     /* global_work_size */
        &group_size,    /* local_work_size */
        0, NULL, NULL); /* no wait list, no event */
}
/*
** Order two adjacent blobs so that the lexicographically smaller one comes
** first. The blobs are consecutive in memory: the first occupies a[0..len-1],
** the second a[len..2*len-1].
**
** a	points to the first blob of the pair
** len	number of 32-bit words in each blob
*/
void sort_pair(uint32_t *a, uint32_t len)
{
    uint32_t *b = a + len;
    uint32_t k = 0;

    /* Skip the common prefix; the first differing word decides the order. */
    while (k < len && a[k] == b[k])
        k++;
    if (k == len || a[k] < b[k])
        return;     /* identical, or already in order */

    /* Swap everything from the first difference onward (the prefix is equal). */
    for (; k < len; k++) {
        const uint32_t held = a[k];
        a[k] = b[k];
        b[k] = held;
    }
}
/*
 * Host-side check and normalisation of solution number sol_i.
 *
 * A solution is rejected (valid flag cleared, 0 returned) when any input
 * index is out of range or occurs more than once. Otherwise the valid flag is
 * set, the inputs are reordered in place by sorting pairs at each of the
 * PARAM_K levels, and 1 is returned.
 */
static uint32_t verify_sol(sols_t *sols, unsigned sol_i)
{
    uint32_t *inputs = sols->values[sol_i];
    const uint32_t nr_values = 1u << (PREFIX + 1);  // exclusive upper bound of an index
    uint8_t seen[(1 << (PREFIX + 1)) / 8];          // one bit per possible index
    uint32_t i;

    // look for duplicate inputs
    memset(seen, 0, sizeof(seen));
    for (i = 0; i < (1u << PARAM_K); i++)
    {
        const uint32_t value = inputs[i];
        // The indices come straight from the device buffer; an out-of-range
        // value would otherwise index past the end of the bitmap.
        if (value >= nr_values)
        {
            sols->valid[sol_i] = 0;
            return 0;
        }
        const uint8_t bit = (uint8_t)(1u << (value & 7));
        if (seen[value / 8] & bit)
        {
            // at least one input value is a duplicate
            sols->valid[sol_i] = 0;
            return 0;
        }
        seen[value / 8] |= bit;
    }
    // the valid flag is already set by the GPU, but set it again because
    // I plan to change the GPU code to not set it
    sols->valid[sol_i] = 1;
    // sort the pairs in place
    for (uint32_t level = 0; level < PARAM_K; level++)
        for (i = 0; i < (1u << PARAM_K); i += (2u << level))
            sort_pair(&inputs[i], 1u << level);
    return 1;
}
/*
 * Record which platform and device this solver instance should use.
 * No OpenCL call is made here; start() does the actual initialisation.
 */
ocl_silentarmy::ocl_silentarmy(int platf_id, int dev_id) {
    blocks = 0;
    platform_id = platf_id;
    device_id = dev_id;
    // stop() tests and deletes this pointer, so it must be null until start()
    // has allocated the context.
    oclc = nullptr;
    // TODO
    threadsNum = 8192;
    wokrsize = 128; // 256;
}
/* Returns a short label of the form "GPU_ID(<device_id>)". */
std::string ocl_silentarmy::getdevinfo() {
    /*TODO get name*/
    std::string label("GPU_ID(");
    label += std::to_string(device_id);
    label += ")";
    return label;
}
// STATICS START
// Stub: device enumeration is not implemented, so this always reports 0.
int ocl_silentarmy::getcount() { /*TODO*/
return 0;
}
// Stub: the body is empty, so gpu_name, sm_count and version are left untouched.
void ocl_silentarmy::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { /*TODO*/ }
/*
 * Initialise the OpenCL state for device_context: pick the device, create a
 * context, build or load the kernel binary, and create queue, buffers and
 * kernels.
 *
 * device_context.is_init_success is true only when every step succeeded.
 * device_context.oclc is allocated in all cases; stop() releases it.
 */
void ocl_silentarmy::start(ocl_silentarmy& device_context) {
    /*TODO*/
    device_context.is_init_success = false;
    device_context.oclc = new OclContext();
    OclContext *oclc = device_context.oclc;

    std::vector<cl_device_id> allGpus;
    if (!clInitialize(device_context.platform_id, allGpus)) {
        return;
    }

    // device_id is an index into the platform's GPU list; check it as a
    // signed value before converting it for the comparison.
    const int devId = device_context.device_id;
    if (devId < 0 || (size_t)devId >= allGpus.size()) {
        printf("Device id %d not found\n", devId);
        return;
    }
    cl_device_id gpu = allGpus[devId];
    // Exactly one device is used, so its solver-local GPU number is always 0.
    const unsigned gpuIndex = 0;
    printf("Using device %d as GPU %d\n", devId, (int)gpuIndex);
    oclc->_dev_id = gpu;

    // context create
    cl_int error;
    oclc->_context = clCreateContext(NULL, 1, &gpu, 0, 0, &error);
    if (cl_int err = error) {
        printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
        return;
    }

    // NOTE(review): the cache file name depends only on the solver-local GPU
    // number, not on the selected device.
    cl_int binstatus = CL_SUCCESS;
    char kernelName[64];
    sprintf(kernelName, "silentarmy_gpu%u.bin", gpuIndex);
    if (!clCompileKernel(oclc->_context,
        gpu,
        kernelName,
        { "zcash/gpu/kernel.cl" },
        "",
        &binstatus,
        &oclc->_program)) {
        return;
    }

    if (binstatus != CL_SUCCESS) {
        printf("GPU %d: failed to load kernel\n", (int)gpuIndex);
        return;
    }
    if (!oclc->init(gpu, device_context.threadsNum, device_context.wokrsize)) {
        printf("Init failed\n");
        return;
    }

    device_context.is_init_success = true;
}
/*
 * Destroy the OpenCL state created by start().
 * The pointer is reset afterwards so that a repeated stop() is harmless.
 */
void ocl_silentarmy::stop(ocl_silentarmy& device_context) {
    delete device_context.oclc;     // deleting a null pointer is a no-op
    device_context.oclc = nullptr;
}
/*
 * Run one solver pass over header + nonce on the device held by device_context.
 *
 * tequihash_header / tequihash_header_len  block header bytes
 * nonce / nonce_len                        nonce bytes appended to the header
 * cancelf    polled after each round is enqueued; returning true aborts the pass
 * solutionf  called once per valid solution with the compressed proof
 * hashdonef  called once when the pass has completed
 *
 * header and nonce together must fit into the 140-byte work buffer; otherwise
 * the call is rejected. On rejection or cancellation neither solutionf nor
 * hashdonef is called.
 */
void ocl_silentarmy::solve(const char *tequihash_header,
    unsigned int tequihash_header_len,
    const char* nonce,
    unsigned int nonce_len,
    std::function<bool()> cancelf,
    std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
    std::function<void(void)> hashdonef,
    ocl_silentarmy& device_context) {
    unsigned char context[140];
    // Refuse input that would overrun the work buffer.
    if (tequihash_header_len > sizeof(context) ||
        nonce_len > sizeof(context) - tequihash_header_len) {
        fprintf(stderr, "solve: header (%u) + nonce (%u) exceed %u bytes\n",
            tequihash_header_len, nonce_len, (unsigned)sizeof(context));
        return;
    }
    memset(context, 0, sizeof(context));
    memcpy(context, tequihash_header, tequihash_header_len);
    memcpy(context + tequihash_header_len, nonce, nonce_len);

    OclContext *miner = device_context.oclc;
    clFlush(miner->queue);

    // Hash the first 128-byte block on the host and hand the state to the device.
    blake2b_state_t initialCtx;
    zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K);
    zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0);

    cl_mem buf_blake_st;
    buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY |
        CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx);

    for (unsigned round = 0; round < PARAM_K; round++)
    {
        // Both tables are initialised before their first use.
        if (round < 2)
            init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round % 2]);
        if (!round)
        {
            check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
            check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round % 2]);
            miner->global_ws = select_work_size_blake();
        }
        else
        {
            // Later rounds read the previous round's table and write the other one.
            check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) % 2]);
            check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round % 2]);
            miner->global_ws = NR_ROWS;
        }
        check_clSetKernelArg(miner->k_rounds[round], 2, &miner->buf_dbg);
        if (round == PARAM_K - 1)
            check_clSetKernelArg(miner->k_rounds[round], 3, &miner->buf_sols);
        check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
            &miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
        // cancel function
        if (cancelf()) {
            // Drop our reference to the state buffer so it is not leaked.
            clReleaseMemObject(buf_blake_st);
            return;
        }
    }

    check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
    check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
    check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols);
    miner->global_ws = NR_ROWS;
    check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
        &miner->global_ws, &miner->local_work_size, 0, NULL, NULL);

    // The blocking read also waits for every kernel enqueued above.
    check_clEnqueueReadBuffer(miner->queue, miner->buf_sols,
        CL_TRUE, // cl_bool blocking_read
        0, // size_t offset
        sizeof(*miner->sols), // size_t size
        miner->sols, // void *ptr
        0, // cl_uint num_events_in_wait_list
        NULL, // cl_event *event_wait_list
        NULL); // cl_event *event
    if (miner->sols->nr > MAX_SOLS)
        miner->sols->nr = MAX_SOLS;
    clReleaseMemObject(buf_blake_st);

    for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) {
        verify_sol(miner->sols, sol_i);
    }

    uint8_t proof[COMPRESSED_PROOFSIZE * 2];
    for (uint32_t i = 0; i < miner->sols->nr; i++) {
        if (miner->sols->valid[i]) {
            compress(proof, (uint32_t *)(miner->sols->values[i]), 1 << PARAM_K);
            // 1344 equals COMPRESSED_PROOFSIZE for the current parameters.
            solutionf(std::vector<uint32_t>(0), 1344, proof);
        }
    }
    hashdonef();
}
// STATICS END

View File

@ -1,58 +0,0 @@
#pragma once
// Export decoration for the solver struct: expands to __declspec(dllexport)
// when _LIB is defined and to nothing otherwise.
#ifdef _LIB
#define DLL_OCL_SILENTARMY __declspec(dllexport)
#else
#define DLL_OCL_SILENTARMY
#endif
// remove after
#include <string>
#include <functional>
#include <vector>
#include <cstdint>
struct OclContext;
/*
 * OpenCL "silentarmy" solver instance bound to one platform/device pair.
 *
 * Lifecycle: construct, start(), any number of solve() calls, then stop().
 * start(), stop() and solve() are static and take the instance explicitly.
 */
struct DLL_OCL_SILENTARMY ocl_silentarmy
{
    //int threadsperblock;
    int blocks = 0;
    int device_id;      // index of the GPU within the platform's device list
    int platform_id;    // index of the OpenCL platform
    // Owned OpenCL state; allocated by start(), deleted by stop(). Starts out
    // null so that stop() is safe even when start() was never called.
    OclContext* oclc = nullptr;
    // threads
    unsigned threadsNum; // TMP
    unsigned wokrsize;
    bool is_init_success = false;   // set by start() once initialisation has fully succeeded

    ocl_silentarmy(int platf_id, int dev_id);

    // Short human-readable device label.
    std::string getdevinfo();

    static int getcount();
    static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version);

    // Create / destroy the OpenCL state of device_context.
    static void start(ocl_silentarmy& device_context);
    static void stop(ocl_silentarmy& device_context);

    // Run one solver pass over header + nonce.
    // cancelf is polled between rounds, solutionf receives each valid solution,
    // hashdonef is called when the pass has completed.
    static void solve(const char *tequihash_header,
        unsigned int tequihash_header_len,
        const char* nonce,
        unsigned int nonce_len,
        std::function<bool()> cancelf,
        std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
        std::function<void(void)> hashdonef,
        ocl_silentarmy& device_context);

    std::string getname() { return "OCL_SILENTARMY"; }

private:
    std::string m_gpu_name;
    std::string m_version;
};

View File

@ -1,98 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- Translation units compiled into the ocl_silentarmy library. -->
<ItemGroup>
<ClCompile Include="ocl_silentarmy.cpp" />
<ClCompile Include="sa_blake.cpp" />
</ItemGroup>
<!-- Headers belonging to the project. -->
<ItemGroup>
<ClInclude Include="ocl_silentarmy.hpp" />
<ClInclude Include="param.h" />
<ClInclude Include="sa_blake.h" />
</ItemGroup>
<!-- OpenCL sources: listed as "None" items, i.e. not compiled by the C++ build. -->
<ItemGroup>
<None Include="zcash\gpu\input.cl" />
<None Include="zcash\gpu\kernel.cl" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>ocl_silentarmy</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<IntDir>$(Platform)\$(Configuration)\</IntDir>
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
</PropertyGroup>
<!-- Compiler and linker settings for Debug|x64: optimization disabled, debug info on. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<!-- Include path: the sibling ocl_device_utils directory, then the directory under AMDAPPSDKROOT. -->
<AdditionalIncludeDirectories>..\ocl_device_utils;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<!-- Compiler and linker settings for Release|x64: MaxSpeed optimization,
     function-level linking, COMDAT folding and unreferenced-code removal. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<!-- Same include path as the Debug configuration. -->
<AdditionalIncludeDirectories>..\ocl_device_utils;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@ -1,28 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Source and header files are listed without a filter folder. -->
<ItemGroup>
<ClCompile Include="ocl_silentarmy.cpp" />
<ClCompile Include="sa_blake.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="ocl_silentarmy.hpp" />
<ClInclude Include="param.h" />
<ClInclude Include="sa_blake.h" />
</ItemGroup>
<!-- Filter folders, each identified by its own GUID. -->
<ItemGroup>
<Filter Include="zcash">
<UniqueIdentifier>{34381c66-ca5c-4daa-aa30-58dcf33e2d66}</UniqueIdentifier>
</Filter>
<Filter Include="zcash\gpu">
<UniqueIdentifier>{c7687099-e206-4d36-8836-f7032bffc7da}</UniqueIdentifier>
</Filter>
</ItemGroup>
<!-- The OpenCL sources are placed in the zcash\gpu filter. -->
<ItemGroup>
<None Include="zcash\gpu\input.cl">
<Filter>zcash\gpu</Filter>
</None>
<None Include="zcash\gpu\kernel.cl">
<Filter>zcash\gpu</Filter>
</None>
</ItemGroup>
</Project>

View File

@ -1,66 +0,0 @@
// Problem parameters; the host passes them as n and k to zcash_blake2b_init().
#define PARAM_N 200
#define PARAM_K 9
// PARAM_N / (PARAM_K + 1) = 20 for the values above.
#define PREFIX (PARAM_N / (PARAM_K + 1))
// 2^PREFIX; the Blake work size is chosen so that it divides this value.
#define NR_INPUTS (1 << PREFIX)
// Approximate log base 2 of number of elements in hash tables
#define APX_NR_ELMS_LOG (PREFIX + 1)
// Number of rows and slots is affected by this. 20 offers the best performance
// but occasionally misses ~1% of solutions.
#define NR_ROWS_LOG 20
// Make hash tables OVERHEAD times larger than necessary to store the average
// number of elements per row. The ideal value is as small as possible to
// reduce memory usage, but not too small or else elements are dropped from the
// hash tables.
//
// The actual number of elements per row is closer to the theoretical average
// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
// smaller.
//
// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
// performance as they cause VRAM channel conflicts.
//
// Only the NR_ROWS_LOG values listed below have a tuned OVERHEAD; any other
// value is rejected at compile time instead of leaving OVERHEAD undefined.
#if NR_ROWS_LOG == 16
#define OVERHEAD 3
#elif NR_ROWS_LOG == 18
#define OVERHEAD 5
#elif NR_ROWS_LOG == 19
#define OVERHEAD 9
#elif NR_ROWS_LOG == 20
#define OVERHEAD 13
#else
#error "NR_ROWS_LOG must be 16, 18, 19 or 20: no OVERHEAD is defined for other values"
#endif
// Number of hash table rows: 2^NR_ROWS_LOG.
#define NR_ROWS (1 << NR_ROWS_LOG)
// Slots per row: 2^(APX_NR_ELMS_LOG - NR_ROWS_LOG) scaled by OVERHEAD.
#define NR_SLOTS ((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)
// Length of 1 element (slot) in bytes
#define SLOT_LEN 32
// Total size of hash table
// (the product is evaluated in int arithmetic)
#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
// Length of Zcash block header and nonce
#define ZCASH_BLOCK_HEADER_LEN 140
#define ZCASH_NONCE_LEN 32
// Number of bytes Zcash needs out of Blake
#define ZCASH_HASH_LEN 50
// Number of wavefronts per SIMD for the Blake kernel.
// Blake is ALU-bound (beside the atomic counter being incremented) so we need
// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
// instructions. 10 is the max supported by the hw.
#define BLAKE_WPS 10
// Capacity of the solution arrays in sols_t.
#define MAX_SOLS 2000
// Optional features
// (ENABLE_DEBUG is forced off here)
#undef ENABLE_DEBUG
/*
** Return the offset of Xi in bytes from the beginning of the slot.
*/
#define xi_offset_for_round(round) (8 + ((round) / 2) * 4)
// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
#define SOL_SIZE ((1 << PARAM_K) * 4)
// Solution buffer; the host reads a whole sols_t back from the device buffer
// buf_sols. The uint and uchar type names must be provided by the includer.
typedef struct sols_s
{
// number of solutions reported; the host clamps it to MAX_SOLS before use
uint nr;
// NOTE(review): not read by the host code in this file - meaning to be
// confirmed against kernel.cl
uint likely_invalids;
// per-solution validity flag (non-zero = valid)
uchar valid[MAX_SOLS];
// per-solution list of (1 << PARAM_K) 32-bit input values
uint values[MAX_SOLS][(1 << PARAM_K)];
} sols_t;

View File

@ -1,104 +0,0 @@
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include "sa_blake.h"
// NOTE(review): blake2b_block_len is not referenced anywhere in this file.
static const uint32_t blake2b_block_len = 128;
// Number of rounds executed per block by zcash_blake2b_update().
static const uint32_t blake2b_rounds = 12;
// Initialisation vector: XOR-ed into the initial state by zcash_blake2b_init()
// and copied into v[8..15] for every block.
static const uint64_t blake2b_iv[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
};
// Message word schedule: row r lists the order in which the 16 message words
// are fed to mix() in round r. Rows 10 and 11 repeat rows 0 and 1.
static const uint8_t blake2b_sigma[12][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
};
/*
** Init the state according to Zcash parameters.
**
** st        state to initialise
** hash_len  digest length in bytes (at most 64)
** n, k      problem parameters mixed into the state
*/
void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
                        uint32_t n, uint32_t k)
{
    static const uint8_t tag[8] = { 'Z', 'c', 'a', 's', 'h', 'P', 'o', 'W' };
    uint64_t tag_word = 0;

    assert(n > k);
    assert(hash_len <= 64);

    /*
    ** Assemble the tag bytes into a 64-bit word, least significant byte
    ** first. Reading the string through a uint64_t pointer would violate the
    ** aliasing and alignment rules; on a little-endian host both forms give
    ** the same value.
    */
    for (uint32_t i = 0; i < 8; i++)
        tag_word |= (uint64_t)tag[i] << (8 * i);

    st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
    for (uint32_t i = 1; i <= 5; i++)
        st->h[i] = blake2b_iv[i];
    st->h[6] = blake2b_iv[6] ^ tag_word;
    st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
    st->bytes = 0;
}
/*
** Rotate the 64-bit value `a` right by `bits` positions.
** The count is taken modulo 64, so a count of 0 (or 64) returns `a`
** unchanged instead of performing an undefined 64-bit shift.
*/
static uint64_t rotr64(uint64_t a, uint8_t bits)
{
    bits &= 63;
    return (a >> bits) | (a << ((64 - bits) & 63));
}
/*
** Mixing step over four state words with two message words x and y.
** Each half adds into *va and *vc and rotates *vd and *vb; the rotation
** amounts are 32 and 24 in the first half, 16 and 63 in the second.
*/
static void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd,
                uint64_t x, uint64_t y)
{
    /* first half: message word x */
    *va += *vb + x;
    *vd = rotr64(*va ^ *vd, 32);
    *vc += *vd;
    *vb = rotr64(*vc ^ *vb, 24);

    /* second half: message word y */
    *va += *vb + y;
    *vd = rotr64(*va ^ *vd, 16);
    *vc += *vd;
    *vb = rotr64(*vc ^ *vb, 63);
}
/* Read eight bytes at `p` as a little-endian 64-bit word; `p` may be unaligned. */
static uint64_t load64_le(const uint8_t *p)
{
    uint64_t w = 0;
    for (uint32_t i = 0; i < 8; i++)
        w |= (uint64_t)p[i] << (8 * i);
    return w;
}

/*
** Process either a full message block or the final partial block.
** Note that v[13] is not XOR'd because st->bytes is assumed to never overflow.
**
** _msg      pointer to message (must be zero-padded to 128 bytes if final block);
**           128 bytes are always read, and no alignment is required
** msg_len   must be 128 (<= 128 allowed only for final partial block)
** is_final  indicate if this is the final block
*/
void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg,
                          uint32_t msg_len, uint32_t is_final)
{
    uint64_t m[16];
    uint64_t v[16];

    assert(msg_len <= 128);
    assert(st->bytes <= UINT64_MAX - msg_len);

    /*
    ** Copy the message into aligned words instead of reading it through a
    ** uint64_t pointer: callers pass plain byte buffers, which need not be
    ** 8-byte aligned. On a little-endian host the words are identical to a
    ** direct read.
    */
    for (uint32_t i = 0; i < 16; i++)
        m[i] = load64_le(_msg + 8 * i);

    memcpy(v + 0, st->h, 8 * sizeof (*v));
    memcpy(v + 8, blake2b_iv, 8 * sizeof (*v));
    v[12] ^= (st->bytes += msg_len);    /* running byte counter */
    if (is_final)
        v[14] = ~v[14];                 /* final-block flag: invert all bits */

    for (uint32_t round = 0; round < blake2b_rounds; round++)
    {
        const uint8_t *s = blake2b_sigma[round];
        /* columns */
        mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]);
        mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]);
        mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]);
        mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]);
        /* diagonals */
        mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]);
        mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
        mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]);
        mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]);
    }

    for (uint32_t i = 0; i < 8; i++)
        st->h[i] ^= v[i] ^ v[i + 8];
}
/*
** Write the hash to "out": the first "outlen" bytes of the state words
** st->h, copied as they are laid out in memory.
**
** st      hash state (not modified)
** out     destination buffer of at least outlen bytes
** outlen  number of bytes to copy, at most 64
*/
void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
{
    const uint8_t *digest = (const uint8_t *)st->h;

    assert(outlen <= 64);
    memcpy(out, digest, outlen);
}

View File

@ -1,11 +0,0 @@
#pragma once
/*
** Running state of a BLAKE2b hash computation.
*/
typedef struct blake2b_state_s
{
// chaining value (eight 64-bit words); zcash_blake2b_update folds each block
// into it and zcash_blake2b_final copies the hash out of it
uint64_t h[8];
// total number of message bytes passed to zcash_blake2b_update so far
uint64_t bytes;
} blake2b_state_t;
/*
** Initialize "st" for a hash of hash_len bytes (at most 64), keyed on the
** parameters n and k (n must be greater than k).
*/
void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
uint32_t n, uint32_t k);
/*
** Process one message block of msg_len bytes (at most 128); is_final must be
** non-zero for the last block. The block is read as a full 128-byte buffer.
*/
void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg,
uint32_t msg_len, uint32_t is_final);
/*
** Copy the first outlen bytes (at most 64) of the hash state into "out".
*/
void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen);

View File

@ -1,704 +0,0 @@
#include "param.h"
/*
** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in
** bytes in parens):
**
** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1)
** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3)
** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2)
** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4)
** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3)
** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5)
** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4)
** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6)
** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5)
**
** If the first byte of Xi is 0xAB then:
** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi
** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but
** 'A' is considered redundant padding as it was used to compute the row #
**
** - cnt is an atomic counter keeping track of the number of used slots.
** it is used in the first slot only; subsequent slots replace it with
** 4 padding bytes
** - i encodes either the 21-bit input value (round 0) or a reference to two
** inputs from the previous round
**
** Formula for Xi length and pad length above:
** > for i in range(9):
** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi
**
** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds
** is the 4 most significant bits of the last byte of Xi.
*/
/*
** BLAKE2b initialization vector. kernel_round0 loads these eight words into
** v[8..15] of its working vector.
*/
__constant ulong blake_iv[] =
{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};
/*
** Clear the slot counter of one hash table row.
**
** Each work-item handles the row whose number equals its global id: it
** zeroes the 32-bit counter stored at the very beginning of that row.
*/
__kernel
void kernel_init_ht(__global char *ht)
{
    const uint row = get_global_id(0);
    __global uint *row_cnt = (__global uint *)(ht + row * NR_SLOTS * SLOT_LEN);

    row_cnt[0] = 0;
}
/*
** Store one Xi value, together with its reference "i", in the hash table
** being filled for "round".
**
** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
**   aa aa ab bb bb cc cc cd dd...  [round 0]
**         --------------------
**      ...ab bb bb cc cc cd dd...  [odd round]
**               --------------
**               ...cc cc cd dd...  [next even round]
**                        -----
** Bytes underlined are going to be stored in the slot. Preceding bytes
** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
** used to compute the row number.
**
** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
** TODO: update lines below with padding nibbles
** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
**
** Return 0 if successfully stored, or 1 if the row overflowed.
*/
uint ht_store(uint round, __global char *ht, uint i,
        ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
    const uint odd_round = round % 2;
    uint row;

    // Row number. On odd rounds with "ab cd ef..." in hex (little endian
    // xi0), the nibble-swapping formula yields 0xdebc for the low 16 bits: it
    // skips the 'a' nibble as it is part of the PREFIX. The Xi will be stored
    // starting with "ef..."; 'e' will be considered padding and 'f' is part
    // of the current PREFIX.
#if NR_ROWS_LOG == 16
    row = odd_round
        ? (((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
           ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12))
        : (xi0 & 0xffff);
#elif NR_ROWS_LOG == 18
    row = odd_round
        ? (((xi0 & 0xc0000) >> 2) |
           ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
           ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12))
        : ((xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6));
#elif NR_ROWS_LOG == 19
    row = odd_round
        ? (((xi0 & 0xe0000) >> 1) |
           ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
           ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12))
        : ((xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5));
#elif NR_ROWS_LOG == 20
    row = odd_round
        ? (((xi0 & 0xf0000) >> 0) |
           ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
           ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12))
        : ((xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4));
#else
#error "unsupported NR_ROWS_LOG"
#endif

    // shift the whole xi0..xi3 value right by 16 bits (2 bytes)
    xi0 = (xi0 >> 16) | (xi1 << 48);
    xi1 = (xi1 >> 16) | (xi2 << 48);
    xi2 = (xi2 >> 16) | (xi3 << 48);

    // reserve a slot in the row by bumping the counter at the row start
    __global char *slot = ht + row * NR_SLOTS * SLOT_LEN;
    const uint cnt = atomic_inc((__global uint *)slot);
    if (cnt >= NR_SLOTS)
        return 1;
    slot += cnt * SLOT_LEN + xi_offset_for_round(round);

    // "i" always sits in the 4 bytes right before Xi
    *(__global uint *)(slot - 4) = i;

    __global ulong *xi_words = (__global ulong *)slot;
    switch (round)
    {
    case 0:
    case 1:
        // 24 bytes
        xi_words[0] = xi0;
        xi_words[1] = xi1;
        xi_words[2] = xi2;
        break;
    case 2:
        // 20 bytes
        xi_words[0] = xi0;
        xi_words[1] = xi1;
        *(__global uint *)(slot + 16) = xi2;
        break;
    case 3:
    case 4:
        // 16 bytes
        xi_words[0] = xi0;
        xi_words[1] = xi1;
        break;
    case 5:
        // 12 bytes
        xi_words[0] = xi0;
        *(__global uint *)(slot + 8) = xi1;
        break;
    case 6:
    case 7:
        // 8 bytes
        xi_words[0] = xi0;
        break;
    case 8:
        // 4 bytes
        *(__global uint *)(slot + 0) = xi0;
        break;
    default:
        // other round numbers store only "i"
        break;
    }
    return 0;
}
/*
** BLAKE2b mixing step on four words of the working vector, folding in the
** message words x and y.
**
** rotate() rotates left, so a right-rotation by r bits is written as
** rotate(value, 64 - r).
**
** The body is wrapped in do { } while (0) so that "mix(...);" is a single
** statement and stays correct inside an unbraced if/else. va, vb, vc and vd
** must be side-effect-free lvalues: each is evaluated several times.
*/
#define mix(va, vb, vc, vd, x, y) \
    do { \
        (va) = ((va) + (vb) + (x)); \
        (vd) = rotate(((vd) ^ (va)), (ulong)64 - 32); \
        (vc) = ((vc) + (vd)); \
        (vb) = rotate(((vb) ^ (vc)), (ulong)64 - 24); \
        (va) = ((va) + (vb) + (y)); \
        (vd) = rotate(((vd) ^ (va)), (ulong)64 - 16); \
        (vc) = ((vc) + (vd)); \
        (vb) = rotate(((vb) ^ (vc)), (ulong)64 - 63); \
    } while (0)
/*
** Execute round 0 (blake).
**
** Each work-item hashes a contiguous range of NR_INPUTS / global_size input
** values. For every input it runs the 12 BLAKE2b rounds on a message block
** in which only word 1 is non-zero (the input value in its high 32 bits),
** then splits the result into two Xi values and stores them with ht_store().
**
** blake_state  eight-word hash state used to initialize v[0..7] for every
**              input (read only)
** ht           hash table filled for round 0
** debug        only written when ENABLE_DEBUG is defined: debug[tid * 2] is
**              set to 0 and debug[tid * 2 + 1] to the number of dropped Xi
**
** Note: making the work group size less than or equal to the wavefront size
** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
** Memory (LDS) Optimization 2-10" in:
** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
*/
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
__global uint *debug)
{
uint tid = get_global_id(0);
ulong v[16];
// inputs are split evenly between work-items; any remainder of the division
// is not processed
uint inputs_per_thread = NR_INPUTS / get_global_size(0);
uint input = tid * inputs_per_thread;
uint input_end = (tid + 1) * inputs_per_thread;
// number of Xi values that could not be stored because their row was full
uint dropped = 0;
while (input < input_end)
{
// shift "i" to occupy the high 32 bits of the second ulong word in the
// message block
ulong word1 = (ulong)input << 32;
// init vector v
v[0] = blake_state[0];
v[1] = blake_state[1];
v[2] = blake_state[2];
v[3] = blake_state[3];
v[4] = blake_state[4];
v[5] = blake_state[5];
v[6] = blake_state[6];
v[7] = blake_state[7];
v[8] = blake_iv[0];
v[9] = blake_iv[1];
v[10] = blake_iv[2];
v[11] = blake_iv[3];
v[12] = blake_iv[4];
v[13] = blake_iv[5];
v[14] = blake_iv[6];
v[15] = blake_iv[7];
// mix in length of data
v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
// last block
v[14] ^= -1;
// The 12 rounds below are fully unrolled. Every message word except word 1
// is passed as 0, so word1 shows up exactly once per round, at the position
// where that round's message schedule selects word 1.
// round 1
mix(v[0], v[4], v[8], v[12], 0, word1);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 2
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], word1, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 3
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, word1);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 4
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, word1);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 5
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, word1);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 6
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], word1, 0);
// round 7
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], word1, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 8
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, word1);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 9
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], word1, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 10
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], word1, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 11
mix(v[0], v[4], v[8], v[12], 0, word1);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], 0, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// round 12
mix(v[0], v[4], v[8], v[12], 0, 0);
mix(v[1], v[5], v[9], v[13], 0, 0);
mix(v[2], v[6], v[10], v[14], 0, 0);
mix(v[3], v[7], v[11], v[15], 0, 0);
mix(v[0], v[5], v[10], v[15], word1, 0);
mix(v[1], v[6], v[11], v[12], 0, 0);
mix(v[2], v[7], v[8], v[13], 0, 0);
mix(v[3], v[4], v[9], v[14], 0, 0);
// compress v into the blake state; this produces the 50-byte hash
// (two Xi values)
ulong h[7];
h[0] = blake_state[0] ^ v[0] ^ v[8];
h[1] = blake_state[1] ^ v[1] ^ v[9];
h[2] = blake_state[2] ^ v[2] ^ v[10];
h[3] = blake_state[3] ^ v[3] ^ v[11];
h[4] = blake_state[4] ^ v[4] ^ v[12];
h[5] = blake_state[5] ^ v[5] ^ v[13];
// only the low 2 bytes of the seventh word belong to the 50-byte hash
h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
// store the two Xi values in the hash table
#if ZCASH_HASH_LEN == 50
// first Xi: hash bytes 0..24, referenced as input * 2
dropped += ht_store(0, ht, input * 2,
h[0],
h[1],
h[2],
h[3]);
// second Xi: hash bytes 25..49, i.e. the hash shifted right by one byte
// starting at h[3]; referenced as input * 2 + 1
dropped += ht_store(0, ht, input * 2 + 1,
(h[3] >> 8) | (h[4] << (64 - 8)),
(h[4] >> 8) | (h[5] << (64 - 8)),
(h[5] >> 8) | (h[6] << (64 - 8)),
(h[6] >> 8));
#else
#error "unsupported ZCASH_HASH_LEN"
#endif
input++;
}
#ifdef ENABLE_DEBUG
debug[tid * 2] = 0;
debug[tid * 2 + 1] = dropped;
#endif
}
/*
** Pack a reference to a pair of slots of the same row into 32 bits
** (ENCODE_INPUTS) and extract its fields again (DECODE_ROW, DECODE_SLOT0,
** DECODE_SLOT1). The row occupies the high bits and the two slot numbers the
** low bits; the field widths depend on NR_ROWS_LOG and NR_SLOTS.
**
** Every macro argument is parenthesized so that expression arguments such
** as "a | b" are encoded/decoded as a whole.
*/
#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
#define ENCODE_INPUTS(row, slot0, slot1) \
    (((row) << 16) | (((slot1) & 0xff) << 8) | ((slot0) & 0xff))
#define DECODE_ROW(REF) ((REF) >> 16)
#define DECODE_SLOT1(REF) (((REF) >> 8) & 0xff)
#define DECODE_SLOT0(REF) ((REF) & 0xff)
#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
#define ENCODE_INPUTS(row, slot0, slot1) \
    (((row) << 14) | (((slot1) & 0x7f) << 7) | ((slot0) & 0x7f))
#define DECODE_ROW(REF) ((REF) >> 14)
#define DECODE_SLOT1(REF) (((REF) >> 7) & 0x7f)
#define DECODE_SLOT0(REF) ((REF) & 0x7f)
#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
#define ENCODE_INPUTS(row, slot0, slot1) \
    (((row) << 13) | (((slot1) & 0x3f) << 6) | ((slot0) & 0x3f)) /* 1 spare bit */
#define DECODE_ROW(REF) ((REF) >> 13)
#define DECODE_SLOT1(REF) (((REF) >> 6) & 0x3f)
#define DECODE_SLOT0(REF) ((REF) & 0x3f)
#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
#define ENCODE_INPUTS(row, slot0, slot1) \
    (((row) << 12) | (((slot1) & 0x3f) << 6) | ((slot0) & 0x3f))
#define DECODE_ROW(REF) ((REF) >> 12)
#define DECODE_SLOT1(REF) (((REF) >> 6) & 0x3f)
#define DECODE_SLOT0(REF) ((REF) & 0x3f)
#else
#error "unsupported NR_ROWS_LOG"
#endif
/*
** XOR a pair of Xi values computed at "round - 1" and store the result in the
** hash table being built for "round". Note that when building the table for
** even rounds we need to skip 1 padding byte present in the "round - 1" table
** (the "0xAB" byte mentioned in the description at the top of this file.) But
** also note we can't load data directly past this byte because this would
** cause an unaligned memory access which is undefined per the OpenCL spec.
**
** round   round being built (1..8); for any other value nothing is stored
** ht_dst  hash table receiving the XORed value
** row     row the two slots belong to
** slot_a  slot number of the first Xi, encoded into the stored reference
** slot_b  slot number of the second Xi, encoded into the stored reference
** a, b    pointers to the two Xi values to XOR
**
** Return 0 if successfully stored (or if the pair was discarded because it
** XORs to zero), or 1 if the row overflowed.
*/
uint xor_and_store(uint round, __global char *ht_dst, uint row,
        uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
    // zero-initialized so that an unexpected round number is discarded by the
    // all-zero check below instead of reading indeterminate values
    ulong xi0 = 0, xi1 = 0, xi2 = 0;
#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
    // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
    // storing the byte containing bits from the previous PREFIX block.
    if (round == 1 || round == 2)
    {
        // xor 24 bytes
        xi0 = *(a++) ^ *(b++);
        xi1 = *(a++) ^ *(b++);
        xi2 = *a ^ *b;
        if (round == 2)
        {
            // skip padding byte
            xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
            xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
            xi2 = (xi2 >> 8);
        }
    }
    else if (round == 3)
    {
        // xor 20 bytes
        xi0 = *a++ ^ *b++;
        xi1 = *a++ ^ *b++;
        xi2 = *(__global uint *)a ^ *(__global uint *)b;
    }
    else if (round == 4 || round == 5)
    {
        // xor 16 bytes
        xi0 = *a++ ^ *b++;
        xi1 = *a ^ *b;
        xi2 = 0;
        if (round == 4)
        {
            // skip padding byte
            xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
            xi1 = (xi1 >> 8);
        }
    }
    else if (round == 6)
    {
        // xor 12 bytes
        xi0 = *a++ ^ *b++;
        xi1 = *(__global uint *)a ^ *(__global uint *)b;
        xi2 = 0;
        // skip padding byte (round 6 is an even round)
        xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
        xi1 = (xi1 >> 8);
    }
    else if (round == 7 || round == 8)
    {
        // xor 8 bytes
        xi0 = *a ^ *b;
        xi1 = 0;
        xi2 = 0;
        if (round == 8)
        {
            // skip padding byte
            xi0 = (xi0 >> 8);
        }
    }
    // invalid solutions (which start happening in round 5) have duplicate
    // inputs and xor to zero, so discard them
    if (!xi0 && !xi1)
        return 0;
#else
#error "unsupported NR_ROWS_LOG"
#endif
    return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b),
            xi0, xi1, xi2, 0);
}
/*
** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi,
** store them in ht_dst.
**
** Each work-item processes the row of ht_src whose number equals its global
** id:
**   1. read the first byte of every used slot's Xi,
**   2. list the slot pairs whose first byte matches under "mask",
**   3. XOR each listed pair and store it in ht_dst with xor_and_store(),
**   4. for rounds before 8, reset the row counter of ht_src so the table can
**      be reused.
**
** debug is only written when ENABLE_DEBUG is defined: debug[tid * 2] gets the
** number of collisions dropped because the local list was full, and
** debug[tid * 2 + 1] the number of results xor_and_store() failed to store.
*/
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    uint tid = get_global_id(0);
    // start of the row handled by this work-item in the source table
    __global char *row_base = ht_src + tid * NR_SLOTS * SLOT_LEN;
    __global char *p;
    uint cnt;
    uchar first_words[NR_SLOTS];
    uchar mask;
    uint i, j;
    // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to
    // make it even larger
    ushort collisions[NR_SLOTS * 3];
    uint nr_coll = 0;
    uint n;
    uint dropped_coll = 0;
    uint dropped_stor = 0;
    __global ulong *a, *b;
    uint xi_offset;
    // read first words of Xi from the previous (round - 1) hash table
    xi_offset = xi_offset_for_round(round - 1);
    // the mask is also computed to read data from the previous round
#if NR_ROWS_LOG == 16
    mask = ((!(round % 2)) ? 0x0f : 0xf0);
#elif NR_ROWS_LOG == 18
    mask = ((!(round % 2)) ? 0x03 : 0x30);
#elif NR_ROWS_LOG == 19
    mask = ((!(round % 2)) ? 0x01 : 0x10);
#elif NR_ROWS_LOG == 20
    mask = 0; /* we can vastly simplify the code below */
#else
#error "unsupported NR_ROWS_LOG"
#endif
    cnt = *(__global uint *)row_base;
    cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
    p = row_base + xi_offset;
    for (i = 0; i < cnt; i++, p += SLOT_LEN)
        first_words[i] = *(__global uchar *)p;
    // find collisions
    for (i = 0; i < cnt; i++)
        for (j = i + 1; j < cnt; j++)
            if ((first_words[i] & mask) ==
                    (first_words[j] & mask))
            {
                // collision!
                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
                    dropped_coll++;
                else
#if NR_SLOTS <= (1 << 8)
                    // note: this assumes slots can be encoded in 8 bits
                    collisions[nr_coll++] =
                        ((ushort)j << 8) | ((ushort)i & 0xff);
#else
#error "unsupported NR_SLOTS"
#endif
            }
    // XOR colliding pairs of Xi
    for (n = 0; n < nr_coll; n++)
    {
        i = collisions[n] & 0xff;
        j = collisions[n] >> 8;
        a = (__global ulong *)(row_base + i * SLOT_LEN + xi_offset);
        b = (__global ulong *)(row_base + j * SLOT_LEN + xi_offset);
        dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
    }
    if (round < 8)
        // reset the counter in preparation of the next round
        *(__global uint *)row_base = 0;
#ifdef ENABLE_DEBUG
    debug[tid * 2] = dropped_coll;
    debug[tid * 2 + 1] = dropped_stor;
#endif
}
/*
** This defines kernel_round1, kernel_round2, ..., kernel_round7.
**
** KERNEL_ROUND(N) expands to a kernel named kernel_roundN (the name is built
** by token pasting) that only forwards its arguments to equihash_round()
** with N as the round number. Each kernel requires a work group size of 64.
*/
#define KERNEL_ROUND(N) \
__kernel __attribute__((reqd_work_group_size(64, 1, 1))) \
void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \
__global uint *debug) \
{ \
equihash_round(N, ht_src, ht_dst, debug); \
}
KERNEL_ROUND(1)
KERNEL_ROUND(2)
KERNEL_ROUND(3)
KERNEL_ROUND(4)
KERNEL_ROUND(5)
KERNEL_ROUND(6)
KERNEL_ROUND(7)
/*
** Round 8 kernel. Same as the kernel_round1..7 kernels, but it takes an
** extra argument, "sols": after running the round, the work-item with
** global id 0 resets both counters of the solutions buffer.
*/
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
        __global uint *debug, __global sols_t *sols)
{
    equihash_round(8, ht_src, ht_dst, debug);
    if (get_global_id(0) == 0)
    {
        sols->likely_invalids = 0;
        sols->nr = 0;
    }
}
/*
** Return the 32-bit reference stored in the 4 bytes right before the Xi of
** slot "slot" in row "row" of hash table "ht". xi_offset is the offset of Xi
** inside a slot.
*/
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
    const uint ref_offset =
        row * NR_SLOTS * SLOT_LEN + slot * SLOT_LEN + xi_offset - 4;

    return *(__global uint *)(ht + ref_offset);
}
/*
** Expand, in place, a list of references into the references they point to.
**
** Each of the nr_inputs entries of "ins" names a row and two slots of the
** hash table used for "round" (htabs[round % 2]). Entry i is replaced by the
** two references read from those slots, written to ins[2 * i] (slot 0) and
** ins[2 * i + 1] (slot 1), so "ins" must have room for 2 * nr_inputs values.
**
** The list is walked from the last entry to the first, so an entry is always
** read before the expanded output overwrites it. An empty list (nr_inputs ==
** 0) is left untouched.
*/
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
        uint round)
{
    __global char *ht = htabs[round % 2];
    uint xi_offset = xi_offset_for_round(round);

    for (uint i = nr_inputs; i-- > 0; )
    {
        // read the reference first: for i == 0 the second store lands on it
        uint ref = ins[i];

        ins[2 * i + 1] = expand_ref(ht, xi_offset,
                DECODE_ROW(ref), DECODE_SLOT1(ref));
        ins[2 * i] = expand_ref(ht, xi_offset,
                DECODE_ROW(ref), DECODE_SLOT0(ref));
    }
}
/*
** Record a potential solution made of the two references ref0 and ref1.
**
** A solution index is reserved by atomically incrementing sols->nr; if the
** index is MAX_SOLS or more, the solution is dropped. Otherwise the two
** references are expanded round by round, from round PARAM_K - 2 down to
** round 0, doubling the number of values each time. The entry is flagged in
** sols->valid only once the expansion is complete.
*/
void potential_sol(__global char **htabs, __global sols_t *sols,
        uint ref0, uint ref1)
{
    const uint sol_i = atomic_inc(&sols->nr);

    if (sol_i >= MAX_SOLS)
        return ;

    __global uint *values = sols->values[sol_i];
    uint nr_values = 2;
    uint round = PARAM_K - 1;

    sols->valid[sol_i] = 0;
    values[0] = ref0;
    values[1] = ref1;
    do
    {
        --round;
        expand_refs(values, nr_values, htabs, round);
        nr_values += nr_values;
    }
    while (round != 0);
    sols->valid[sol_i] = 1;
}
/*
** Scan the hash tables to find Equihash solutions.
**
** Each work-item scans the row of the last-round table whose number equals
** its global id. It compares the low 24 bits of the first Xi word of every
** pair of used slots; for each match it remembers the two references stored
** right before the Xi values, and finally hands every remembered pair to
** potential_sol().
*/
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
    __global char *htabs[2] = { ht0, ht1 };
    const uint tid = get_global_id(0);
    const uint last_round = PARAM_K - 1;
    // Xi offset inside a slot of the table filled at the last round
    const uint xi_offset = xi_offset_for_round(last_round);
    // it's ok for the list of matching pairs to be so small, as if it fills
    // up the potential solutions are likely invalid (many duplicate inputs)
    ulong pairs[5];
    uint nr_pairs = 0;
#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
    // in the final hash table, we are looking for a match on both the bits
    // part of the previous PREFIX colliding bits, and the last PREFIX bits.
    const uint mask = 0xffffff;
#else
#error "unsupported NR_ROWS_LOG"
#endif
    __global char *row_base = htabs[last_round % 2] + tid * NR_SLOTS * SLOT_LEN;
    // clamp: the counter may exceed NR_SLOTS if the row overflowed
    const uint cnt = min(*(__global uint *)row_base, (uint)NR_SLOTS);

    for (uint i = 0; i < cnt; i++)
    {
        __global char *a = row_base + xi_offset + i * SLOT_LEN;

        for (uint j = i + 1; j < cnt; j++)
        {
            __global char *b = row_base + xi_offset + j * SLOT_LEN;

            if (((*(__global uint *)a) & mask) !=
                    ((*(__global uint *)b) & mask))
                continue;
            uint ref_i = *(__global uint *)(a - 4);
            uint ref_j = *(__global uint *)(b - 4);
            if (nr_pairs < sizeof (pairs) / sizeof (*pairs))
                pairs[nr_pairs++] = ((ulong)ref_i << 32) | ref_j;
            else
                atomic_inc(&sols->likely_invalids);
        }
    }
    for (uint n = 0; n < nr_pairs; n++)
        potential_sol(htabs, sols, pairs[n] >> 32,
                pairs[n] & 0xffffffff);
}

View File

@ -1,555 +0,0 @@
# 1 "input.cl"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/usr/include/stdc-predef.h" 1 3 4
# 1 "<command-line>" 2
# 1 "input.cl"
# 1 "param.h" 1
# 60 "param.h"
// Buffer in which the kernels record candidate solutions.
typedef struct sols_s
{
// number of solution entries claimed so far (kernels bump it with atomic_inc)
uint nr;
// counter bumped by kernels with atomic_inc
// NOTE(review): presumably counts collisions judged likely invalid - confirm
uint likely_invalids;
// per-entry flag
// NOTE(review): presumably 1 once values[] for that entry is complete - confirm
uchar valid[2000];
// per-entry list of values, (1 << 9) == 512 per entry
uint values[2000][(1 << 9)];
} sols_t;
# 2 "input.cl" 2
# 36 "input.cl"
// BLAKE2b initialization vector (eight 64-bit words).
__constant ulong blake_iv[] =
{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};
// Clear the 32-bit counter stored at the start of one hash table row; each
// work-item handles the row whose number equals its global id.
__kernel
void kernel_init_ht(__global char *ht)
{
    const uint row = get_global_id(0);
    __global char *row_start =
        ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;

    *(__global uint *)row_start = 0;
}
# 80 "input.cl"
// Store one Xi value (xi0..xi3) and its reference "i" in the hash table for
// "round". Returns 0 if stored, or 1 if the selected row is already full.
uint ht_store(uint round, __global char *ht, uint i,
ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
uint row;
__global char *p;
uint cnt;
# 111 "input.cl"
// row number: taken from the low bits of xi0, with a different bit
// arrangement on even and odd rounds
if (!(round % 2))
row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
else
row = ((xi0 & 0xf0000) >> 0) |
((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
// shift the whole xi0..xi3 value right by 16 bits
xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
// reserve a slot by atomically bumping the counter at the start of the row
p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
cnt = atomic_inc((__global uint *)p);
if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9))
return 1;
// p now points at the Xi field of the reserved slot
p += cnt * 32 + (8 + ((round) / 2) * 4);
// "i" is stored in the 4 bytes right before Xi
*(__global uint *)(p - 4) = i;
// the number of Xi bytes written depends on the round
if (round == 0 || round == 1)
{
// 24 bytes
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
*(__global ulong *)(p + 16) = xi2;
}
else if (round == 2)
{
// 20 bytes
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
*(__global uint *)(p + 16) = xi2;
}
else if (round == 3 || round == 4)
{
// 16 bytes
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
}
else if (round == 5)
{
// 12 bytes
*(__global ulong *)(p + 0) = xi0;
*(__global uint *)(p + 8) = xi1;
}
else if (round == 6 || round == 7)
{
// 8 bytes
*(__global ulong *)(p + 0) = xi0;
}
else if (round == 8)
{
// 4 bytes
*(__global uint *)(p + 0) = xi0;
}
return 0;
}
# 188 "input.cl"
/*
** One BLAKE2b mixing step on four words of the working vector.
** `x` and `y` are the two message words consumed by the step.
*/
#define BLAKE_MIX(va, vb, vc, vd, x, y) \
    do { \
        va = (va + vb + x); \
        vd = rotate((vd ^ va), (ulong)64 - 32); \
        vc = (vc + vd); \
        vb = rotate((vb ^ vc), (ulong)64 - 24); \
        va = (va + vb + y); \
        vd = rotate((vd ^ va), (ulong)64 - 16); \
        vc = (vc + vd); \
        vb = rotate((vb ^ vc), (ulong)64 - 63); \
    } while (0)

/*
** One full round: four column steps followed by four diagonal steps.
** m0..m15 are the message words in the order the round consumes them.
*/
#define BLAKE_ROUND(v, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14, m15) \
    do { \
        BLAKE_MIX(v[0], v[4], v[8], v[12], m0, m1); \
        BLAKE_MIX(v[1], v[5], v[9], v[13], m2, m3); \
        BLAKE_MIX(v[2], v[6], v[10], v[14], m4, m5); \
        BLAKE_MIX(v[3], v[7], v[11], v[15], m6, m7); \
        BLAKE_MIX(v[0], v[5], v[10], v[15], m8, m9); \
        BLAKE_MIX(v[1], v[6], v[11], v[12], m10, m11); \
        BLAKE_MIX(v[2], v[7], v[8], v[13], m12, m13); \
        BLAKE_MIX(v[3], v[4], v[9], v[14], m14, m15); \
    } while (0)

/*
** Round 0: hash a contiguous range of input indices and fill the table.
**
** The (1 << 20) inputs are divided evenly between the work-items. For
** every input the final compression is run on top of `blake_state` with a
** message block whose only non-zero word is `word1` (the input index in
** the upper 32 bits). The resulting 50 hash bytes are split into two
** 25-byte halves, stored with ht_store() under references input * 2 and
** input * 2 + 1.
**
** blake_state  eight 64-bit words of the precomputed hash state
** ht           hash table receiving the round-0 entries
** debug        not used by this build of the kernel
*/
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
        __global uint *debug)
{
    uint tid = get_global_id(0);
    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
    uint input_end = (tid + 1) * inputs_per_thread;
    // Count of entries rejected by ht_store(); nothing consumes it here.
    uint dropped = 0;
    ulong v[16];

    for (uint input = tid * inputs_per_thread; input < input_end; input++)
    {
        ulong word1 = (ulong)input << 32;
        ulong h[7];

        // Working vector: chaining state in the low half, IV in the high.
        for (uint w = 0; w < 8; w++)
        {
            v[w] = blake_state[w];
            v[w + 8] = blake_iv[w];
        }
        // NOTE(review): presumably the BLAKE2b byte counter (140 + 4
        // bytes hashed) and the last-block flag -- confirm against the
        // host-side state setup.
        v[12] ^= 140 + 4 ;
        v[14] ^= -1;

        // Twelve rounds; word1 moves to wherever the message schedule
        // places message word 1 in that round.
        BLAKE_ROUND(v, 0, word1, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, 0, 0,  word1, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, word1, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, word1, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, 0, 0,  0, word1, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, word1, 0);
        BLAKE_ROUND(v, 0, 0, word1, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, word1, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, word1, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, word1, 0,  0, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, word1, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0);
        BLAKE_ROUND(v, 0, 0, 0, 0, 0, 0, 0, 0,  word1, 0, 0, 0, 0, 0, 0, 0);

        // Fold the working vector back into the state; only the low
        // 16 bits of the seventh word are kept (50 bytes in total).
        for (uint w = 0; w < 6; w++)
            h[w] = blake_state[w] ^ v[w] ^ v[w + 8];
        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;

        // First half: hash bytes 0..24.
        dropped += ht_store(0, ht, input * 2,
                h[0], h[1], h[2], h[3]);
        // Second half: hash bytes 25..49, re-aligned to bit 0.
        dropped += ht_store(0, ht, input * 2 + 1,
                (h[3] >> 8) | (h[4] << (64 - 8)),
                (h[4] >> 8) | (h[5] << (64 - 8)),
                (h[5] >> 8) | (h[6] << (64 - 8)),
                (h[6] >> 8));
    }
}

#undef BLAKE_ROUND
#undef BLAKE_MIX
# 415 "input.cl"
/*
** XOR the Xi data of two colliding slots and store the result in the
** destination table.
**
** round    round being computed (1..8); selects how many bytes are read
** ht_dst   destination hash table
** row      source row of both slots
** slot_a   index of the first slot in the row
** slot_b   index of the second slot in the row
** a, b     pointers to the Xi data of the two slots
**
** The new entry's reference packs (row, slot_b, slot_a) as
** row << 12 | slot_b << 6 | slot_a, with 6 bits per slot index.
**
** Returns 0 without storing when the two low XOR words are both zero, or
** when `round` is outside 1..8. Otherwise returns the result of
** ht_store(): 1 if the entry was dropped because the row was full, else 0.
*/
uint xor_and_store(uint round, __global char *ht_dst, uint row,
        uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
    // Zero-initialized so that words a round does not read are
    // well-defined, and so an unsupported round falls through to the
    // "nothing to store" return below instead of using garbage.
    ulong xi0 = 0;
    ulong xi1 = 0;
    ulong xi2 = 0;

    // Each pair of rounds consumes bytes, so later rounds read less.
    switch (round)
    {
    case 1:
    case 2:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        xi2 = a[2] ^ b[2];
        break;
    case 3:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        // Only 32 bits remain in the third word.
        xi2 = *(__global uint *)(a + 2) ^ *(__global uint *)(b + 2);
        break;
    case 4:
    case 5:
        xi0 = a[0] ^ b[0];
        xi1 = a[1] ^ b[1];
        break;
    case 6:
        xi0 = a[0] ^ b[0];
        // Only 32 bits remain in the second word.
        xi1 = *(__global uint *)(a + 1) ^ *(__global uint *)(b + 1);
        break;
    case 7:
    case 8:
        xi0 = a[0] ^ b[0];
        break;
    default:
        break;
    }

    // Even rounds discard one extra byte. Words that were not loaded are
    // zero, so the full three-word shift gives the same result as the
    // shorter per-round shifts would.
    if (round == 2 || round == 4 || round == 6 || round == 8)
    {
        xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
        xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
        xi2 = (xi2 >> 8);
    }

    // NOTE(review): presumably an all-zero XOR means the pair shares
    // inputs and cannot lead to a valid solution -- confirm.
    if (!xi0 && !xi1)
        return 0;

    return ht_store(round, ht_dst,
            ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
            xi0, xi1, xi2, 0);
}
/*
** Process one source row for round `round` (1..8).
**
** Each work-item owns the row whose index equals its global id. It lists
** pairs of slots in that row, XORs each listed pair with xor_and_store()
** into `ht_dst`, and finally (for rounds before 8) zeroes the source
** row's slot counter.
**
** round    round being computed
** ht_src   table filled by the previous round
** ht_dst   table receiving this round's entries
** debug    not used by this build
*/
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    enum {
        SLOTS_PER_ROW = ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9),
        SLOT_BYTES = 32,
        MAX_COLLISIONS = SLOTS_PER_ROW * 3
    };
    uint tid = get_global_id(0);
    __global char *row_base = ht_src + tid * SLOTS_PER_ROW * SLOT_BYTES;
    // Offset of the Xi data as written by the previous round.
    uint xi_offset = (8 + ((round - 1) / 2) * 4);
    // With a zero mask every pair of slots in the row compares equal.
    uchar mask = 0;
    uchar first_words[SLOTS_PER_ROW];
    ushort collisions[MAX_COLLISIONS];
    uint nr_coll = 0;
    // Overflow counters; nothing in this build reads them back.
    uint dropped_coll = 0;
    uint dropped_stor = 0;
    uint cnt;
    uint i, j, n;

    // The counter may have run past the capacity; clamp it.
    cnt = min(*(__global uint *)row_base, (uint)SLOTS_PER_ROW);

    // Cache the first Xi byte of every used slot.
    for (i = 0; i < cnt; i++)
        first_words[i] =
            *(__global uchar *)(row_base + xi_offset + i * SLOT_BYTES);

    // Enumerate slot pairs (i < j), packing each as j << 8 | i.
    for (i = 0; i < cnt; i++)
    {
        for (j = i + 1; j < cnt; j++)
        {
            if ((first_words[i] & mask) != (first_words[j] & mask))
                continue;
            if (nr_coll >= MAX_COLLISIONS)
            {
                dropped_coll++;
                continue;
            }
            collisions[nr_coll++] = ((ushort)j << 8) | ((ushort)i & 0xff);
        }
    }

    // XOR every recorded pair into the destination table.
    for (n = 0; n < nr_coll; n++)
    {
        __global ulong *a, *b;

        i = collisions[n] & 0xff;
        j = collisions[n] >> 8;
        a = (__global ulong *)(row_base + i * SLOT_BYTES + xi_offset);
        b = (__global ulong *)(row_base + j * SLOT_BYTES + xi_offset);
        dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
    }

    // Zero the source row's slot counter, except after the last round.
    if (round < 8)
        *(__global uint *)row_base = 0;
}
# 585 "input.cl"
/* Kernel entry point for round 1: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round1(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(1, ht_src, ht_dst, debug);
}
/* Kernel entry point for round 2: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round2(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(2, ht_src, ht_dst, debug);
}
/* Kernel entry point for round 3: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round3(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(3, ht_src, ht_dst, debug);
}
/* Kernel entry point for round 4: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round4(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(4, ht_src, ht_dst, debug);
}
/* Kernel entry point for round 5: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round5(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(5, ht_src, ht_dst, debug);
}
/* Kernel entry point for round 6: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round6(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(6, ht_src, ht_dst, debug);
}
/* Kernel entry point for round 7: forwards to equihash_round(). */
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round7(__global char *ht_src, __global char *ht_dst,
        __global uint *debug)
{
    equihash_round(7, ht_src, ht_dst, debug);
}
/*
** Kernel entry point for round 8.
**
** Runs the round like the other round kernels; in addition the work-item
** with global id 0 resets both counters of the solution buffer.
*/
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
        __global uint *debug, __global sols_t *sols)
{
    equihash_round(8, ht_src, ht_dst, debug);

    const uint tid = get_global_id(0);
    if (tid != 0)
        return;
    sols->likely_invalids = 0;
    sols->nr = 0;
}
/* Read the 32-bit reference stored in the 4 bytes immediately before the Xi
 * data of one slot.
 *
 *   ht        - base of the hash table
 *   xi_offset - byte offset of the Xi data inside a 32-byte slot
 *   row, slot - position of the slot; a row holds 18 slots of 32 bytes
 */
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
    __global char *row_base =
        ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
    __global char *slot_base = row_base + slot * 32;

    return *(__global uint *)(slot_base + xi_offset - 4);
}
/* Expand, in place, each of the first `nr_inputs` references in `ins` into
 * the two references it points at, producing 2 * nr_inputs entries.
 *
 * Each input is decoded as row = ref >> 12, slot A = (ref >> 6) & 0x3f,
 * slot B = ref & 0x3f, and both slots are looked up in htabs[round % 2].
 * The array is processed from the last input down to the first, so an output
 * pair never overwrites an input that has not been read yet.
 *
 * nr_inputs must be at least 1.
 */
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
                 uint round)
{
    __global char *ht = htabs[round % 2];
    uint xi_offset = (8 + ((round) / 2) * 4);

    for (uint src = nr_inputs - 1, dst = nr_inputs * 2 - 1; ; src--, dst -= 2)
    {
        /* Read the input once: when src == 0 the second store below lands on
         * ins[0] itself. */
        uint ref = ins[src];
        uint row = ref >> 12;

        ins[dst]     = expand_ref(ht, xi_offset, row, (ref >> 6) & 0x3f);
        ins[dst - 1] = expand_ref(ht, xi_offset, row, ref & 0x3f);

        if (src == 0)
            break;
    }
}
/* Record a candidate solution that starts from the reference pair
 * (ref0, ref1).
 *
 * A slot index is claimed with an atomic increment of sols->nr; candidates
 * that land at index 2000 or beyond are dropped. The two references are then
 * expanded through rounds 7 down to 0 with expand_refs(), doubling the number
 * of values each time. sols->valid[] is 0 while the entry is being built and
 * is set to 1 once expansion has finished.
 */
void potential_sol(__global char **htabs, __global sols_t *sols,
                   uint ref0, uint ref1)
{
    uint sol_i = atomic_inc(&sols->nr);
    if (sol_i >= 2000)
        return;

    sols->valid[sol_i] = 0;
    sols->values[sol_i][0] = ref0;
    sols->values[sol_i][1] = ref1;

    uint nr_values = 2;
    uint round = 9 - 1;
    while (round > 0)
    {
        --round;
        expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
        nr_values *= 2;
    }

    sols->valid[sol_i] = 1;
}
/* Scan one row of the last-round hash table for pairs of slots whose first
 * Xi word matches in its low 24 bits, and hand each matching pair to
 * potential_sol().
 *
 * Each work-item handles the row indexed by its global id. The first uint of
 * the row is read as the slot count and clamped to the row capacity (18).
 * At most 5 pairs are kept per row; further matches only increment
 * sols->likely_invalids.
 */
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
    __global char *htabs[2] = { ht0, ht1 };
    const uint ht_i = (9 - 1) % 2;
    const uint xi_offset = (8 + ((9 - 1) / 2) * 4);
    const uint mask = 0xffffff;
    ulong pairs[5];
    uint nr_pairs = 0;

    uint tid = get_global_id(0);
    __global char *row =
        htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
    uint cnt = min(*(__global uint *)row,
                   (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));

    __global char *a = row + xi_offset;
    for (uint i = 0; i < cnt; i++, a += 32)
    {
        __global char *b = a + 32;
        for (uint j = i + 1; j < cnt; j++, b += 32)
        {
            if (((*(__global uint *)a) & mask) !=
                ((*(__global uint *)b) & mask))
                continue;

            /* The reference of a slot sits in the 4 bytes before its Xi. */
            uint ref_i = *(__global uint *)(a - 4);
            uint ref_j = *(__global uint *)(b - 4);

            if (nr_pairs < sizeof (pairs) / sizeof (*pairs))
                pairs[nr_pairs++] = ((ulong)ref_i << 32) | ref_j;
            else
                atomic_inc(&sols->likely_invalids);
        }
    }

    for (uint k = 0; k < nr_pairs; k++)
        potential_sol(htabs, sols, pairs[k] >> 32, pairs[k] & 0xffffffff);
}

View File

@ -1,305 +0,0 @@
#include "ocl_xmp.hpp"
// miner instance
#include "opencl.h"
#include <cstdint>
#include <boost/filesystem.hpp>
// is this really needed?
//#include "uint256.h"
// hardcoded defines, looks like not working
// hardcoded defines fix this
#define RESTBITS 4
#define XINTREE
#define UNROLL
#define __OPENCL_HOST__
#include "zcash/gpu/common.h"
// Per-device solver state: the OpenCL handles, device buffers and kernels
// that MinerInstance::init() sets up and ocl_xmp::solve() drives.
struct MinerInstance {
// Context and program passed to init(); stored unchanged.
cl_context _context;
cl_program _program;
// Command queue created by init() for the selected device.
cl_command_queue queue;
// Host/device buffer pairs; sizes are chosen in init().
clBuffer<blake2b_state> blake2bState;
clBuffer<uint32_t> heap0;
clBuffer<uint32_t> heap1;
clBuffer<bsizes> nslots;
clBuffer<proof> sols;
clBuffer<uint32_t> numSols;
// Kernels "digitH", "digitOdd", "digitEven" and "digitK" respectively.
cl_kernel _digitHKernel;
cl_kernel _digitOKernel;
cl_kernel _digitEKernel;
cl_kernel _digitKKernel;
// Kernels "digit_1" .. "digit_8"; init() fills indices 1..8 and leaves
// index 0 unset.
cl_kernel _digitKernels[9];
//hide_xmp_hack::uint256 nonce; // TODO IS THIS NEEDED????
// Creates the queue, buffers and kernels and binds the kernel arguments.
bool init(cl_context context, cl_program program, cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock);
};
// File-scope OpenCL handles, all initialised to 0.
// NOTE(review): ocl_xmp::start() declares local arrays named gContext and
// gProgram that shadow these two; only gPlatform is read there (for the
// context properties). Nothing visible in this file assigns gPlatform —
// confirm where it is set.
cl_context gContext = 0;
cl_program gProgram = 0;
cl_platform_id gPlatform = 0;
// Creates the command queue, device buffers and kernels for one device and
// binds every kernel argument that stays constant between solve() calls.
//
// context, program - OpenCL context and built program to use (stored as-is).
// dev              - device the command queue is created for.
// threadsNum, threadsPerBlock - currently unused.
//
// Returns true on success and false as soon as any OpenCL call reports an
// error.
//
// NOTE(review): OCLR(expr, ret) is assumed to log and `return ret;` when
// `expr` is non-zero (that is how its expansion looks in ocl_xmp::start()).
// The previous code passed 1 as `ret`, which made this bool function report
// success on every failure.
bool MinerInstance::init(cl_context context,
                         cl_program program,
                         cl_device_id dev,
                         unsigned int threadsNum,
                         unsigned int threadsPerBlock)
{
    (void)threadsNum;
    (void)threadsPerBlock;

    cl_int error = CL_SUCCESS;

    _context = context;
    _program = program;

    queue = clCreateCommandQueue(context, dev, 0, &error);
    OCLR(error, false);

    blake2bState.init(context, 1, CL_MEM_READ_WRITE);
    heap0.init(context, sizeof(digit0) / sizeof(uint32_t), CL_MEM_HOST_NO_ACCESS);
    heap1.init(context, sizeof(digit1) / sizeof(uint32_t), CL_MEM_HOST_NO_ACCESS);
    nslots.init(context, 2, CL_MEM_READ_WRITE);
    sols.init(context, MAXSOLS, CL_MEM_READ_WRITE);
    numSols.init(context, 1, CL_MEM_READ_WRITE);

    _digitHKernel = clCreateKernel(program, "digitH", &error);
    OCLR(error, false);
    _digitOKernel = clCreateKernel(program, "digitOdd", &error);
    OCLR(error, false);
    _digitEKernel = clCreateKernel(program, "digitEven", &error);
    OCLR(error, false);
    _digitKKernel = clCreateKernel(program, "digitK", &error);
    OCLR(error, false);

    OCLR(clSetKernelArg(_digitHKernel, 0, sizeof(cl_mem), &blake2bState.DeviceData), false);
    OCLR(clSetKernelArg(_digitHKernel, 1, sizeof(cl_mem), &heap0.DeviceData), false);
    OCLR(clSetKernelArg(_digitHKernel, 2, sizeof(cl_mem), &nslots.DeviceData), false);

    // Argument 0 of the odd/even kernels is the round number; solve() sets it
    // per launch.
    OCLR(clSetKernelArg(_digitOKernel, 1, sizeof(cl_mem), &heap0.DeviceData), false);
    OCLR(clSetKernelArg(_digitOKernel, 2, sizeof(cl_mem), &heap1.DeviceData), false);
    OCLR(clSetKernelArg(_digitOKernel, 3, sizeof(cl_mem), &nslots.DeviceData), false);

    OCLR(clSetKernelArg(_digitEKernel, 1, sizeof(cl_mem), &heap0.DeviceData), false);
    OCLR(clSetKernelArg(_digitEKernel, 2, sizeof(cl_mem), &heap1.DeviceData), false);
    OCLR(clSetKernelArg(_digitEKernel, 3, sizeof(cl_mem), &nslots.DeviceData), false);

    // Unrolled per-round kernels "digit_1" .. "digit_8" (index 0 is unused).
    for (unsigned i = 1; i <= 8; i++) {
        char kernelName[32];
        snprintf(kernelName, sizeof(kernelName), "digit_%u", i);
        _digitKernels[i] = clCreateKernel(program, kernelName, &error);
        OCLR(error, false);
        OCLR(clSetKernelArg(_digitKernels[i], 0, sizeof(cl_mem), &heap0.DeviceData), false);
        OCLR(clSetKernelArg(_digitKernels[i], 1, sizeof(cl_mem), &heap1.DeviceData), false);
        OCLR(clSetKernelArg(_digitKernels[i], 2, sizeof(cl_mem), &nslots.DeviceData), false);
    }

    OCLR(clSetKernelArg(_digitKKernel, 0, sizeof(cl_mem), &heap0.DeviceData), false);
    OCLR(clSetKernelArg(_digitKKernel, 1, sizeof(cl_mem), &heap1.DeviceData), false);
    OCLR(clSetKernelArg(_digitKKernel, 2, sizeof(cl_mem), &nslots.DeviceData), false);
    OCLR(clSetKernelArg(_digitKKernel, 3, sizeof(cl_mem), &sols.DeviceData), false);
    OCLR(clSetKernelArg(_digitKKernel, 4, sizeof(cl_mem), &numSols.DeviceData), false);

    return true;
}
////////////////////////////
////statics non class START
// Initialises `ctx` as a BLAKE2b state with a HASHOUT-byte digest and a
// 16-byte personalisation made of "ZcashPoW" followed by WN and WK (each
// copied as 4 bytes in host byte order), then absorbs `headerlen` bytes of
// `header`.
static void setheader(blake2b_state *ctx, const char *header, const uint32_t headerlen)
{
    // The trailing "01230123" is a placeholder that gets overwritten below.
    char personal[] = "ZcashPoW01230123";
    const uint32_t n_param = WN;
    const uint32_t k_param = WK;
    memcpy(personal + 8, &n_param, 4);
    memcpy(personal + 12, &k_param, 4);

    blake2b_param param;
    param.digest_length = HASHOUT;
    param.key_length = 0;
    param.fanout = 1;
    param.depth = 1;
    param.leaf_length = 0;
    param.node_offset = 0;
    param.node_depth = 0;
    param.inner_length = 0;
    memset(param.reserved, 0, sizeof(param.reserved));
    memset(param.salt, 0, sizeof(param.salt));
    memcpy(param.personal, (const uint8_t *)personal, 16);

    blake2b_init_param(ctx, &param);
    blake2b_update(ctx, (const uint8_t*)header, headerlen);
}
// Absorbs the nonce into `ctx`. Exactly 32 bytes are read from `nonce`.
static void setnonce(blake2b_state *ctx, const uint8_t *nonce)
{
    const unsigned nonce_bytes = 32;
    blake2b_update(ctx, nonce, nonce_bytes);
}
// Enqueues `kernel` on `clQueue` as a 1-D range of `nthreads` work-items in
// work-groups of `threadsPerBlock`. Returns 0 when the enqueue succeeds; on
// failure the OCLR macro is passed 1 as the value to return.
static inline int digit(cl_command_queue clQueue, cl_kernel kernel, size_t nthreads, size_t threadsPerBlock)
{
    // Only the first element of each array is used (work_dim == 1).
    size_t global_dims[] = { nthreads, 1, 1 };
    size_t local_dims[] = { threadsPerBlock, 1 };

    OCLR(clEnqueueNDRangeKernel(clQueue, kernel, 1, nullptr,
                                global_dims, local_dims,
                                0, nullptr, nullptr), 1);
    return 0;
}
////statics non class END
////////////////////////////
// Records which platform/device this solver instance targets and sets the
// launch geometry (8192 work-items in groups of 128). No OpenCL call is made
// here; start() does the actual initialisation.
//
// `context` and `blocks` are now given defined values; previously they were
// left uninitialised until start() succeeded.
ocl_xmp::ocl_xmp(int platf_id, int dev_id) /*TODO*/
    : blocks(0),
      device_id(dev_id),
      platform_id(platf_id),
      context(nullptr),
      // TODO
      threadsNum(8192),
      wokrsize(128) // 256;
{
    //threadsperblock = 128;
}
// Returns a short label of the form "GPU_ID(<device_id>)".
std::string ocl_xmp::getdevinfo() { /*TODO*/
    std::string label("GPU_ID(");
    label += std::to_string(device_id);
    label += ")";
    return label;
}
// STATICS START
// Placeholder: device enumeration is not implemented, so this always
// returns 0.
int ocl_xmp::getcount() { /*TODO*/
return 0;
}
// Placeholder: not implemented. The output parameters gpu_name, sm_count and
// version are left untouched.
void ocl_xmp::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { /*TODO*/ }
// Initialises OpenCL for the device selected by device_context.platform_id /
// device_id: creates a context, compiles (or loads) the equihash kernel and
// builds a MinerInstance. On success device_context.context points at the
// new instance and is_init_success is true; on any failure is_init_success
// stays false.
//
// Changes from the previous version:
//  - a MinerInstance whose init() fails is deleted and the pointer reset
//    instead of being leaked and left dangling in device_context;
//  - the device-id comparison no longer mixes signed and unsigned values;
//  - printf arguments match their %d conversions;
//  - the kernel file name is formatted with a bounded snprintf.
void ocl_xmp::start(ocl_xmp& device_context) {
    /*TODO*/
    device_context.is_init_success = false;

    // Local per-device handles (these shadow the file-scope globals of the
    // same names).
    cl_context gContext[64] = { 0 };
    cl_program gProgram[64] = { 0 };

    std::vector<cl_device_id> allGpus;
    if (!clInitialize(device_context.platform_id, allGpus)) {
        return;
    }

    // this is kinda stupid but it works
    // At most one device can match, so gpus ends up with 0 or 1 entries.
    std::vector<cl_device_id> gpus;
    for (unsigned i = 0; i < allGpus.size(); ++i) {
        if (device_context.device_id >= 0 &&
            i == (unsigned)device_context.device_id) {
            printf("Using device %d as GPU %d\n", (int)i, (int)gpus.size());
            gpus.push_back(allGpus[i]);
        }
    }

    if (!gpus.size()){
        printf("Device id %d not found\n", device_context.device_id);
        return;
    }

    // context create
    for (unsigned i = 0; i < gpus.size(); i++) {
        cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatform, 0 };
        cl_int error;
        gContext[i] = clCreateContext(props, 1, &gpus[i], 0, 0, &error);
        //OCLR(error, false);
        if (cl_int err = error) {
            printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
            return;
        }
    }

    std::vector<cl_int> binstatus;
    binstatus.resize(gpus.size());

    for (size_t i = 0; i < gpus.size(); i++) {
        char kernelName[64];
        snprintf(kernelName, sizeof(kernelName), "equiw200k9_gpu%u.bin", (unsigned)i);
        if (!clCompileKernel(gContext[i],
            gpus[i],
            kernelName,
            { "zcash/gpu/equihash.cl" },
            "-I./zcash/gpu -DXINTREE -DWN=200 -DWK=9 -DRESTBITS=4 -DUNROLL",
            &binstatus[i],
            &gProgram[i])) {
            return;
        }
    }

    for (unsigned i = 0; i < gpus.size(); ++i) {
        if (binstatus[i] == CL_SUCCESS) {
            device_context.context = new MinerInstance();
            if (!device_context.context->init(gContext[i], gProgram[i], gpus[i], device_context.threadsNum, device_context.wokrsize)) {
                printf("Init failed");
                // Do not leave a half-initialised instance behind.
                delete device_context.context;
                device_context.context = nullptr;
                return;
            }
        }
        else {
            printf("GPU %d: failed to load kernel\n", (int)i);
            return;
        }
    }

    device_context.is_init_success = true;
}
// Placeholder: not implemented. Nothing created by start() is released here.
void ocl_xmp::stop(ocl_xmp& device_context) { /*TODO*/ }
void ocl_xmp::solve(const char *tequihash_header,
unsigned int tequihash_header_len,
const char* nonce,
unsigned int nonce_len,
std::function<bool()> cancelf,
std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
std::function<void(void)> hashdonef,
ocl_xmp& device_context) {
if (device_context.is_init_success == false) {
printf("fail OCL\n");
//cancelf();
return;
}
// move to context or somewhere or leave?
blake2b_state initialCtx;
setheader(&initialCtx, tequihash_header, tequihash_header_len);
MinerInstance *miner = device_context.context;
clFlush(miner->queue);
/*hide_xmp_hack::uint256 nNonce = hide_xmp_hack::uint256(nonce);
miner->nonce = nNonce;*/
*miner->blake2bState.HostData = initialCtx;
setnonce(miner->blake2bState.HostData, (const uint8_t*)nonce);
memset(miner->nslots.HostData, 0, 2 * sizeof(bsizes));
*miner->numSols.HostData = 0;
miner->blake2bState.copyToDevice(miner->queue, false);
miner->nslots.copyToDevice(miner->queue, false);
miner->numSols.copyToDevice(miner->queue, false);
digit(miner->queue, miner->_digitHKernel, device_context.threadsNum, device_context.wokrsize);
#if BUCKBITS == 16 && RESTBITS == 4 && defined XINTREE && defined(UNROLL)
for (unsigned i = 1; i <= 8; i++)
digit(miner->queue, miner->_digitKernels[i], device_context.threadsNum, device_context.wokrsize);
#else
size_t globalSize[] = { _threadsNum, 1, 1 };
size_t localSize[] = { _threadsPerBlocksNum, 1 };
for (unsigned r = 1; r < WK; r++) {
if (r & 1) {
OCL(clSetKernelArg(miner->_digitOKernel, 0, sizeof(cl_uint), &r));
OCL(clEnqueueNDRangeKernel(miner->queue, miner->_digitOKernel, 1, 0, globalSize, localSize, 0, 0, 0));
}
else {
OCL(clSetKernelArg(miner->_digitEKernel, 0, sizeof(cl_uint), &r));
OCL(clEnqueueNDRangeKernel(miner->queue, miner->_digitEKernel, 1, 0, globalSize, localSize, 0, 0, 0));
}
}
#endif
digit(miner->queue, miner->_digitKKernel, device_context.threadsNum, device_context.wokrsize);
// get solutions
miner->sols.copyToHost(miner->queue, true);
miner->numSols.copyToHost(miner->queue, true);
for (unsigned s = 0; s < miner->numSols.HostData[0]; s++)
{
std::vector<uint32_t> index_vector(PROOFSIZE);
for (u32 i = 0; i < PROOFSIZE; i++) {
index_vector[i] = miner->sols[s][i];
}
solutionf(index_vector, DIGITBITS, nullptr);
if (cancelf()) return;
}
hashdonef();
}
// STATICS END

View File

@ -1,56 +0,0 @@
#pragma once
#ifdef _LIB
#define DLL_OCL_XMP __declspec(dllexport)
#else
#define DLL_OCL_XMP
#endif
// remove after
#include <string>
#include <functional>
#include <vector>
#include <cstdint>
struct MinerInstance;
// Solver front-end for one OpenCL device. start()/stop()/solve() are static
// and take the instance they operate on as `device_context`.
struct DLL_OCL_XMP ocl_xmp
{
//int threadsperblock;
// Not referenced by the implementation shown in ocl_xmp.cpp.
int blocks;
// Index of the device within the platform's device list.
int device_id;
// Platform index passed to clInitialize() by start().
int platform_id;
// Solver state allocated by start().
MinerInstance* context;
// threads
// Global work size used for every kernel launch.
unsigned threadsNum; // TMP
// Local work-group size used for every kernel launch.
unsigned wokrsize;
// Set to true by start() only when initialisation fully succeeded.
bool is_init_success = false;
ocl_xmp(int platf_id, int dev_id);
// Returns "GPU_ID(<device_id>)".
std::string getdevinfo();
// Currently a stub that returns 0.
static int getcount();
// Currently a stub that leaves its output parameters untouched.
static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version);
static void start(ocl_xmp& device_context);
static void stop(ocl_xmp& device_context);
// Runs one solve; reports solutions through solutionf, polls cancelf after
// each solution and calls hashdonef when finished.
static void solve(const char *tequihash_header,
unsigned int tequihash_header_len,
const char* nonce,
unsigned int nonce_len,
std::function<bool()> cancelf,
std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
std::function<void(void)> hashdonef,
ocl_xmp& device_context);
std::string getname() { return "OCL_XMP"; }
private:
// Not referenced by the implementation shown in ocl_xmp.cpp.
std::string m_gpu_name;
std::string m_version;
};

View File

@ -1,100 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClInclude Include="ocl_xmp.hpp" />
<ClInclude Include="zcash\gpu\common.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\cpu_tromp\blake2\blake2bx.cpp" />
<ClCompile Include="ocl_xmp.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="zcash\gpu\equihash.cl" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{5EC9EDEB-8E49-4126-9161-1560683CBC71}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>ocl_xpm</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;OCL_XPM_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>
</SDLCheck>
<AdditionalIncludeDirectories>..\ocl_device_utils;..\cpu_tromp;..\3rdparty\include;$(AMDAPPSDKROOT)\include\</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;OCL_XPM_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>
</SDLCheck>
<AdditionalIncludeDirectories>..\ocl_device_utils;..\3rdparty\include;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@ -1,26 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="zcash">
<UniqueIdentifier>{69f1aa4c-1be3-4265-a93c-b58266bad10b}</UniqueIdentifier>
</Filter>
<Filter Include="zcash\gpu">
<UniqueIdentifier>{a95c2e64-90c0-48d9-9287-46723392025d}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="ocl_xmp.hpp" />
<ClInclude Include="zcash\gpu\common.h">
<Filter>zcash\gpu</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="ocl_xmp.cpp" />
<ClCompile Include="..\cpu_tromp\blake2\blake2bx.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="zcash\gpu\equihash.cl">
<Filter>zcash\gpu</Filter>
</None>
</ItemGroup>
</Project>

View File

@ -1,150 +0,0 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
/**
* uint2 direct ops by c++ operator definitions
*/
// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
// return make_uint2(a.x ^ b.x, a.y ^ b.y);
// }
// uint2 ROR/ROL methods
/* Rotate right by `offset` bits the 64-bit value held in a uint2, where .x is
 * the low 32-bit word and .y the high one. Intended for offsets 0..63. */
uint2 ROR2(const uint2 a, const int offset) {
    uint2 src = a;
    int shift = offset;

    /* A rotation by 32 or more starts with a swap of the two words. */
    if (shift >= 32) {
        src.x = a.y;
        src.y = a.x;
        shift -= 32;
    }

    /* Nothing left to rotate (offset was 0 or 32). */
    if (shift == 0)
        return src;

    uint2 result;
    result.y = ((src.y >> shift) | (src.x << (32 - shift)));
    result.x = ((src.x >> shift) | (src.y << (32 - shift)));
    return result;
}
/* Exchange the two 32-bit components of `value` (equivalent to a 64-bit
 * rotation by 32 bits). */
uint2 SWAPUINT2(uint2 value) {
    return (uint2)(value.y, value.x);
}
#define ROR24(u) ROR2(u,24)
#define ROR16(u) ROR2(u,16)
/* Message-word permutation table, indexed as blake2b_sigma[round][k].
 * G() uses entries 2*i and 2*i+1 of a row to pick the two message words it
 * mixes in. There are 12 rows, one per round; rows 10 and 11 repeat rows 0
 * and 1. */
__constant int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
/* BLAKE2b mixing step on the four state words *a, *b, *c, *d.
 *
 *   r - round number, selects the row of blake2b_sigma
 *   i - index of this G call within the round (0..7); message words
 *       m[sigma[r][2*i]] and m[sigma[r][2*i+1]] are mixed in
 *
 * Rotations are by 32 (word swap), 24, 16 and 63 bits, performed on the
 * uint2 view of each 64-bit word.
 */
void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) {
    /* uint2 views of the same storage, used for the rotate helpers. */
    uint2 *a2 = (uint2*)a;
    uint2 *b2 = (uint2*)b;
    uint2 *c2 = (uint2*)c;
    uint2 *d2 = (uint2*)d;

    /* First half: mix in the even-indexed message word. */
    *a += *b + m[ blake2b_sigma[r][2*i] ];
    d2[0] = SWAPUINT2( d2[0] ^ a2[0] );
    *c += *d;
    b2[0] = ROR24( b2[0] ^ c2[0] );

    /* Second half: mix in the odd-indexed message word. */
    *a += *b + m[ blake2b_sigma[r][2*i+1] ];
    d2[0] = ROR16( d2[0] ^ a2[0] );
    *c += *d;
    b2[0] = ROR2( b2[0] ^ c2[0], 63U);
}
/* One full round: eight G() calls over the 16-word working vector `v`, first
 * on the four columns (v[0],v[4],v[8],v[12] ...), then on the four diagonals.
 * Expects arrays named `v` and `m` to be in scope at the point of use.
 * The expansion already ends with a semicolon. */
#define ROUND(r) \
G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \
G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \
G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \
G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \
G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \
G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \
G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \
G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m);
/* Finish a BLAKE2b hash on a private copy of the state.
 *
 * Appends the 32-bit `idx` to the buffered input, zero-pads the block,
 * runs the 12-round compression with the final-block flag set (v[14] is
 * inverted) and writes the first `outlen` bytes of the resulting chaining
 * value to `hash`. `state` is modified in the process.
 */
void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) {
    /* Append the index and account for the buffered bytes. */
    const uint32_t leb = idx;
    *(uint32_t*)(state->buf + state->buflen) = leb;
    state->buflen += 4;
    state->counter += state->buflen;

    /* Zero the rest of the block. */
    for (unsigned k = 0; k < BLAKE2B_BLOCKBYTES - state->buflen; k++)
        state->buf[state->buflen + k] = 0;

    /* Message block as sixteen 64-bit words. */
    uint64_t *d_data = (uint64_t *)state->buf;
    uint64_t m[16];
    for (int w = 0; w < 16; w++)
        m[w] = d_data[w];

    /* Working vector: chaining value followed by the IV; the counter is
     * folded into v[12] and the last-block flag into v[14]. */
    uint64_t v[16];
    for (int w = 0; w < 8; w++)
        v[w] = state->h[w];
    v[8] = 0x6a09e667f3bcc908;
    v[9] = 0xbb67ae8584caa73b;
    v[10] = 0x3c6ef372fe94f82b;
    v[11] = 0xa54ff53a5f1d36f1;
    v[12] = 0x510e527fade682d1 ^ state->counter;
    v[13] = 0x9b05688c2b3e6c1f;
    v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
    v[15] = 0x5be0cd19137e2179;

    ROUND( 0 );
    ROUND( 1 );
    ROUND( 2 );
    ROUND( 3 );
    ROUND( 4 );
    ROUND( 5 );
    ROUND( 6 );
    ROUND( 7 );
    ROUND( 8 );
    ROUND( 9 );
    ROUND( 10 );
    ROUND( 11 );

    /* Fold both halves of the working vector back into the chaining value. */
    for (int w = 0; w < 8; w++)
        state->h[w] ^= v[w] ^ v[w + 8];

    /* Emit the digest bytes. */
    for (unsigned k = 0; k < outlen; k++)
        hash[k] = ((uint8_t*)state->h)[k];
}

View File

@ -1,159 +0,0 @@
#if defined(__OPENCL_HOST__)
#define __global
//#include "blake2/blake2.h"
//#include "equi.h"
#include "../cpu_tromp/equi.h"
#else
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
typedef long int64_t;
typedef ulong uint64_t;
#if defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__ ((__aligned__(x)))
#endif
/* BLAKE2b size constants, in bytes. Defined here only for the non-host
 * (device) build, i.e. when __OPENCL_HOST__ is not defined. */
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
/* Device-side BLAKE2b state, byte-packed and declared with 64-byte
 * alignment.
 * NOTE(review): the host build takes its blake2b_state from another header;
 * the two layouts presumably have to match since the state is copied to the
 * device — confirm. */
#pragma pack(push, 1)
ALIGN( 64 ) typedef struct __blake2b_state {
/* Chaining value. */
uint64_t h[8];
/* Buffered input for the current block. */
uint8_t buf[BLAKE2B_BLOCKBYTES];
/* Byte counter (16 bits wide here). */
uint16_t counter;
/* Number of valid bytes in buf. */
uint8_t buflen;
uint8_t lastblock;
} blake2b_state;
#pragma pack(pop)
#endif
#define COLLISION_BIT_LENGTH (WN / (WK+1))
#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
#define NDIGITS (WK+1)
#define DIGITBITS (WN/(NDIGITS))
//#define PROOFSIZE (1u<<WK)
#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
//#define BASE (1u<<DIGITBITS)
//#define NHASHES (2u*BASE)
//#define HASHESPERBLAKE (512/WN)
//#define HASHOUT (HASHESPERBLAKE*WN/8)
// 2_log of number of buckets
#define BUCKBITS (DIGITBITS-RESTBITS)
// number of buckets
#define NBUCKETS (1<<BUCKBITS)
// 2_log of number of slots per bucket
#define SLOTBITS (RESTBITS+1+1)
// number of slots per bucket
#define NSLOTS (1u<<SLOTBITS)
// number of per-xhash slots
#define XFULL 16
// SLOTBITS mask
#define SLOTMASK (NSLOTS-1)
// number of possible values of xhash (rest of n) bits
#define NRESTS (1u<<RESTBITS)
// number of blocks of hashes extracted from single 512 bit blake2b output
#define NBLOCKS ((NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE)
// nothing larger found in 100000 runs
#define MAXSOLS 8
/* Number of 32-bit words needed to hold `bits` bits (rounded up).
 * The argument is parenthesised so that expressions containing operators of
 * lower precedence than `+` expand correctly. */
#define WORDS(bits) (((bits) + 31) / 32)
#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
typedef uint32_t proof[PROOFSIZE];
// tree = | xhash(RESTBITS) | slotid1(SLOTBITS) | slotid0(SLOTBITS) | bucketid(BUCKBITS) |
// index = | bucketid(BUCKBITS) | slotid0(SLOTBITS) |
typedef uint32_t tree;
/* One 32-bit unit of hash data, accessible as a whole word or as its four
 * individual bytes. */
typedef union hashunit {
uint32_t word;
uint8_t bytes[4];
} hashunit;
/* Slot layout for the `digit0` tables: a tree node followed by HASHWORDS0
 * hash units. */
typedef struct slot0 {
tree attr;
hashunit hash[HASHWORDS0];
} slot0;
/* Slot layout for the `digit1` tables: a tree node followed by HASHWORDS1
 * hash units (one DIGITBITS less of hash than slot0). */
typedef struct slot1 {
tree attr;
hashunit hash[HASHWORDS1];
} slot1;
// a bucket is NSLOTS treenodes
typedef slot0 bucket0[NSLOTS];
typedef slot1 bucket1[NSLOTS];
// the N-bit hash consists of K+1 n-bit "digits"
// each of which corresponds to a layer of NBUCKETS buckets
typedef bucket0 digit0[NBUCKETS];
typedef bucket1 digit1[NBUCKETS];
// manages hash and tree data
// Two arrays of bucket pointers: (WK+1)/2 entries of bucket0 layout and WK/2
// entries of bucket1 layout.
// NOTE(review): presumably one entry per even / odd round respectively —
// confirm against the kernels.
typedef struct htalloc {
__global bucket0 *trees0[(WK+1)/2];
__global bucket1 *trees1[WK/2];
} htalloc;
typedef uint32_t bsizes[NBUCKETS];
/* Table pointers plus per-round layout values.
 * NOTE(review): the field names suggest hash-unit counts and byte offsets
 * for the previous/next round; their exact meaning is defined by code
 * outside this header — confirm before relying on it. */
typedef struct htlayout {
htalloc hta;
uint32_t prevhashunits;
uint32_t nexthashunits;
uint32_t dunits;
uint32_t prevbo;
uint32_t nextbo;
} htlayout;
#if RESTBITS <= 6
typedef uint8_t xslot;
#else
typedef uint16_t xslot;
#endif
/* Scratch state for finding slots that share the same xhash (rest) bits.
 * Two alternative representations are selected at compile time:
 *  - XBITMAP: one 64-bit slot bitmap per xhash value (at most 64 slots);
 *  - otherwise: per-xhash slot lists of up to XFULL entries with counts.
 * NOTE(review): how xx, n0, n1 and s0 are used is defined by code outside
 * this header. */
typedef struct collisiondata {
#ifdef XBITMAP
#if NSLOTS > 64
#error cant use XBITMAP with more than 64 slots
#endif
uint64_t xhashmap[NRESTS];
uint64_t xmap;
#else
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
uint32_t n0;
uint32_t n1;
#endif
uint32_t s0;
} collisiondata;
/* Aggregate solver state: the BLAKE2b context, the table pointers, pointers
 * to the per-bucket slot counters and to the solution array, the number of
 * solutions and the thread count. */
typedef struct equi {
blake2b_state blake_ctx;
htalloc hta;
__global bsizes *nslots;
__global proof *sols;
uint32_t nsols;
uint32_t nthreads;
} equi;

File diff suppressed because it is too large Load Diff