Merge branch 'master' of github.com:tromp/equihash

merge whatever
This commit is contained in:
tromp 2016-11-12 19:12:04 -05:00
commit dda74a9dd7
6 changed files with 115 additions and 54 deletions

View File

@ -10,11 +10,14 @@ equi: equi.h equi_miner.h equi_miner.cpp Makefile
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
eqavx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
equix4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DNBLAKES=4 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4
eqavx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21
equix41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DNBLAKES=4 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41
equix81: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DNBLAKES=8 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
@ -25,11 +28,11 @@ eq1445: equi.h equi_miner.h equi_miner.cpp Makefile
eq14451: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o eq14451
eq1445avx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DATOMIC -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx2
eq1445x4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DATOMIC -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x4
eq1445avx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx21
eq1445x41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x41
dev: equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile
$(GPP) -mavx2 -DATOMIC dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev
@ -83,4 +86,4 @@ blake2-asm/asm/zcblake2_avx2.o:
make -C blake2-asm
clean:
make -C blake2b clean && rm -f dev dev1 equi equi1 eqavx2 eqavx21 equi1g eq1445 eq14451 eq1445avx2 eq1445avx21 eqcuda eqcuda1445 verify
make -C blake2b clean && rm -f dev dev1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify

View File

@ -202,6 +202,28 @@ ALIGN(64) static const uint32_t indices[12][16] = {
BLAKE2B_G_V4(m, r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
} while(0)
#define BLAKE2B_G_V8(m, r, i, a, b, c, d, e, f, g, h) do { \
a = ADD(a, LOAD((uint8_t const *)(m ) + blake2b_sigma[r][2*i+0])); \
e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+0])); \
a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT32(d); h = ROT32(h); \
c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT24(b); f = ROT24(f); \
a = ADD(a, LOAD((uint8_t const *)(m ) + blake2b_sigma[r][2*i+1])); \
e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+1])); \
a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT16(d); h = ROT16(h); \
c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT63(b); f = ROT63(f); \
} while(0)
#define BLAKE2B_ROUND_V8(v, m, r) do { \
BLAKE2B_G_V8(m, r, 0, v[ 0], v[ 4], v[ 8], v[12], v[16], v[20], v[24], v[28]); \
BLAKE2B_G_V8(m, r, 1, v[ 1], v[ 5], v[ 9], v[13], v[17], v[21], v[25], v[29]); \
BLAKE2B_G_V8(m, r, 2, v[ 2], v[ 6], v[10], v[14], v[18], v[22], v[26], v[30]); \
BLAKE2B_G_V8(m, r, 3, v[ 3], v[ 7], v[11], v[15], v[19], v[23], v[27], v[31]); \
BLAKE2B_G_V8(m, r, 4, v[ 0], v[ 5], v[10], v[15], v[16], v[21], v[26], v[31]); \
BLAKE2B_G_V8(m, r, 5, v[ 1], v[ 6], v[11], v[12], v[17], v[22], v[27], v[28]); \
BLAKE2B_G_V8(m, r, 6, v[ 2], v[ 7], v[ 8], v[13], v[18], v[23], v[24], v[29]); \
BLAKE2B_G_V8(m, r, 7, v[ 3], v[ 4], v[ 9], v[14], v[19], v[20], v[25], v[30]); \
} while(0)
#if defined(PERMUTE_WITH_GATHER)
#define BLAKE2B_LOADMSG_V4(w, m) do { \
int i; \
@ -345,3 +367,51 @@ void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) {
memcpy(out, s, 256);
}
void blake2bx8_final(const blake2b_state *S, uchar *out, u32 blockidx) {
#if 0
blake2bx4_final(S, out, 2*blockidx);
blake2bx4_final(S, out+256, 2*blockidx+1);
#else
__m256i v[32], s[16], iv[8], w[32], counter, flag;
uint32_t b, i, r;
ALIGN(64) uint8_t buffer[8 * BLAKE2B_BLOCKBYTES];
memset(buffer, 0, 8 * BLAKE2B_BLOCKBYTES);
for (i = 0; i < 8; i++) {
memcpy(buffer+128*i, S->buf, S->buflen);
b = htole32(8 * blockidx + i);
memcpy(buffer+128*i + S->buflen, &b, sizeof(uint32_t));
}
for(i = 0; i < 8; ++i) {
v[16+i] = v[i] = iv[i] = _mm256_set1_epi64x(S->h[i]);
}
counter = _mm256_set1_epi64x(128 + S->buflen + sizeof(uint32_t));
flag = _mm256_set1_epi64x(~0);
v[24] = v[ 8] = _mm256_set1_epi64x(blake2b_IV[0]);
v[25] = v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]);
v[26] = v[10] = _mm256_set1_epi64x(blake2b_IV[2]);
v[27] = v[11] = _mm256_set1_epi64x(blake2b_IV[3]);
v[28] = v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter);
v[29] = v[13] = _mm256_set1_epi64x(blake2b_IV[5]);
v[30] = v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag);
v[31] = v[15] = _mm256_set1_epi64x(blake2b_IV[7]);
BLAKE2B_LOADMSG_V4(w, buffer);
BLAKE2B_LOADMSG_V4((w+16), (buffer+512));
for(r = 0; r < 12; ++r) {
BLAKE2B_ROUND_V8(v, w, r);
}
for(i = 0; i < 8; ++i) {
v[ i] = XOR(XOR(v[ i], v[ 8+i]), iv[i]);
v[16+i] = XOR(XOR(v[16+i], v[24+i]), iv[i]);
}
BLAKE2B_UNPACK_STATE_V4( s , v );
BLAKE2B_UNPACK_STATE_V4((s+8), (v+16));
memcpy(out, s, 8*64);
#endif
}

View File

@ -347,46 +347,39 @@ struct equi {
}
return false;
}
// if dupes != 0, list indices in arbitrary order and return true if dupe found
// if dupes == 0, order indices as in Wagner condition
bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) {
// order indices as in Wagner condition,
// and return true if a left and right subtree have identical leftmost leaves
bool listindices0(u32 r, const tree t, u32 *indices) {
if (r == 0) {
u32 idx = t.getindex();
if (dupes) {
u32 bin = idx & (PROOFSIZE-1);
if (idx == dupes[bin]) return true;
dupes[bin] = idx;
}
*indices = idx;
*indices = t.getindex();
return false;
}
const slot1 *buck = hta.heap1[t.bucketid()];
const u32 size = 1 << --r;
u32 tagi = hashwords(hashsize(r));
return listindices1(r, buck[t.slotid0()][tagi].tag, indices, dupes)
|| listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
|| (!dupes && orderindices(indices, size));
return listindices1(r, buck[t.slotid0()][tagi].tag, indices)
|| listindices1(r, buck[t.slotid1()][tagi].tag, indices+size)
|| orderindices(indices, size) || indices[0] == indices[size];
}
bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) {
bool listindices1(u32 r, const tree t, u32 *indices) {
const slot0 *buck = hta.heap0[t.bucketid()];
const u32 size = 1 << --r;
u32 tagi = hashwords(hashsize(r));
return listindices0(r, buck[t.slotid0()][tagi].tag, indices, dupes)
|| listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
|| (!dupes && orderindices(indices, size));
return listindices0(r, buck[t.slotid0()][tagi].tag, indices)
|| listindices0(r, buck[t.slotid1()][tagi].tag, indices+size)
|| orderindices(indices, size) || indices[0] == indices[size];
}
void candidate(const tree t) {
proof prf, dupes;
memset(dupes, 0xffff, sizeof(proof));
if (listindices1(WK, t, prf, dupes)) return; // assume WK odd
qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
for (u32 i=1; i<PROOFSIZE; i++) if (prf[i] <= prf[i-1]) return;
proof prf;
// listindices combines index tree reconstruction with probably dupe test
if (listindices1(WK, t, prf) || duped(prf)) return; // assume WK odd
// and now we have ourselves a genuine solution
#ifdef ATOMIC
u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
#else
u32 soli = nsols++;
#endif
if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); // assume WK odd
if (soli < MAXSOLS) memcpy(sols[soli], prf, sizeof(proof));
}
#endif
void showbsizes(u32 r) {

View File

@ -62,12 +62,7 @@ int main(int argc, char **argv) {
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
#ifdef USE_AVX2
printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
#else
printf(" and no AVX2\n");
#endif
printf("Using %dMB of memory and %d-way blake2b\n", 1 + eq.hta.alloced / 0x100000, NBLAKES);
u32 sumnsols = 0;
char headernonce[HEADERNONCELEN];
u32 hdrlen = strlen(header);

View File

@ -985,7 +985,6 @@ int main(int argc, char **argv) {
cudaEventElapsedTime(&duration, start, stop);
printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f);
printf("%d candidate solutions\n", eq.nsols);
u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols);
for (s = nsols = 0; s < maxsols; s++) {
if (duped(sols[s])) {

View File

@ -394,12 +394,10 @@ struct equi {
}
return false;
}
// if dupes != 0, list indices in arbitrary order and return true if dupe found
// if dupes == 0, order indices as in Wagner condition
// listindices combines index tree reconstruction with probably dupe test
bool listindices0(u32 r, const tree t, u32 *indices) {
if (r == 0) {
u32 idx = t.getindex();
*indices = idx;
*indices = t.getindex();
return false;
}
const slot1 *buck = hta.heap1[t.bucketid()];
@ -582,31 +580,34 @@ struct equi {
}
};
#ifdef USE_AVX2
static const u32 BLAKESINPARALLEL = 4;
#else
static const u32 BLAKESINPARALLEL = 1;
#ifndef NBLAKES
#define NBLAKES 1
#endif
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
// number of hashes extracted from NBLAKES blake2b outputs
static const u32 HASHESPERBLOCK = NBLAKES*HASHESPERBLAKE;
// number of blocks of parallel blake2b calls
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
void digit0(const u32 id) {
htlayout htl(this, 0);
const u32 hashbytes = hashsize(0);
uchar hashes[BLAKESINPARALLEL * 64];
uchar hashes[NBLAKES * 64];
blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster
for (u32 block = id; block < NBLOCKS; block += nthreads) {
#ifdef USE_AVX2
#if NBLAKES == 4
blake2bx4_final(&state0, hashes, block);
#else
#elif NBLAKES == 8
blake2bx8_final(&state0, hashes, block);
#elif NBLAKES == 1
blake2b_state state = state0; // make another copy since blake2b_final modifies it
u32 leb = htole32(block);
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
blake2b_final(&state, hashes, HASHOUT);
#else
#error not implemented
#endif
for (u32 i = 0; i<BLAKESINPARALLEL; i++) {
for (u32 i = 0; i<NBLAKES; i++) {
for (u32 j = 0; j<HASHESPERBLAKE; j++) {
const uchar *ph = hashes + i * 64 + j * WN/8;
// figure out bucket for this hash by extracting leading BUCKBITS bits
@ -630,7 +631,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
// hash should end right before tag
memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
// round 0 tags store hash-generating index
s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j);
s->tag = tree((block * NBLAKES + i) * HASHESPERBLAKE + j);
}
}
}