From 94736965d79560a74a9f6c36a9ad180469b6700a Mon Sep 17 00:00:00 2001 From: John Tromp Date: Thu, 10 Nov 2016 21:53:43 -0500 Subject: [PATCH 1/3] optimize dupe testing in dev_miner as well --- dev_miner.h | 39 ++++++++++++++++----------------------- equi_miner.h | 6 ++---- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/dev_miner.h b/dev_miner.h index 8c9c83e..1e742e5 100644 --- a/dev_miner.h +++ b/dev_miner.h @@ -347,46 +347,39 @@ struct equi { } return false; } - // if dupes != 0, list indices in arbitrary order and return true if dupe found - // if dupes == 0, order indices as in Wagner condition - bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) { + // order indices as in Wagner condition, + // and return true if a left and right subtree have identical leftmost leaves + bool listindices0(u32 r, const tree t, u32 *indices) { if (r == 0) { - u32 idx = t.getindex(); - if (dupes) { - u32 bin = idx & (PROOFSIZE-1); - if (idx == dupes[bin]) return true; - dupes[bin] = idx; - } - *indices = idx; + *indices = t.getindex(); return false; } const slot1 *buck = hta.heap1[t.bucketid()]; const u32 size = 1 << --r; u32 tagi = hashwords(hashsize(r)); - return listindices1(r, buck[t.slotid0()][tagi].tag, indices, dupes) - || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) - || (!dupes && orderindices(indices, size)); + return listindices1(r, buck[t.slotid0()][tagi].tag, indices) + || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size) + || orderindices(indices, size) || indices[0] == indices[size]; } - bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) { + bool listindices1(u32 r, const tree t, u32 *indices) { const slot0 *buck = hta.heap0[t.bucketid()]; const u32 size = 1 << --r; u32 tagi = hashwords(hashsize(r)); - return listindices0(r, buck[t.slotid0()][tagi].tag, indices, dupes) - || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) - || (!dupes && orderindices(indices, size)); + return listindices0(r, buck[t.slotid0()][tagi].tag, indices) + || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size) + || orderindices(indices, size) || indices[0] == indices[size]; } void candidate(const tree t) { - proof prf, dupes; - memset(dupes, 0xffff, sizeof(proof)); - if (listindices1(WK, t, prf, dupes)) return; // assume WK odd - qsort(prf, PROOFSIZE, sizeof(u32), &compu32); - for (u32 i=1; i Date: Thu, 10 Nov 2016 22:06:34 -0500 Subject: [PATCH 2/3] remove diagnostic output --- equi_miner.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/equi_miner.cu b/equi_miner.cu index 60ed138..4204ea6 100644 --- a/equi_miner.cu +++ b/equi_miner.cu @@ -985,7 +985,6 @@ int main(int argc, char **argv) { cudaEventElapsedTime(&duration, start, stop); printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f); - printf("%d candidate solutions\n", eq.nsols); u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols); for (s = nsols = 0; s < maxsols; s++) { if (duped(sols[s])) { From 525a16a1ff04dbb54a9669487d8bbbc83d252f11 Mon Sep 17 00:00:00 2001 From: John Tromp Date: Fri, 11 Nov 2016 23:17:01 -0500 Subject: [PATCH 3/3] replace USE_AVX2 by NBLAKES; try x8 as well as x4 --- Makefile | 21 +++++++------ blake2-avx2/blake2bip.c | 70 +++++++++++++++++++++++++++++++++++++++++ equi_miner.cpp | 7 +---- equi_miner.h | 25 ++++++++------- 4 files changed, 97 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 0142a42..125c19d 100644 --- a/Makefile +++ b/Makefile @@ -10,11 +10,14 @@ equi: equi.h equi_miner.h equi_miner.cpp Makefile equi1: equi.h equi_miner.h equi_miner.cpp Makefile $(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1 -eqavx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2 +equix4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=4 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4 -eqavx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21 +equix41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=4 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 + +equix81: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=8 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 equi1g: equi.h equi_miner.h equi_miner.cpp Makefile g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g @@ -25,11 +28,11 @@ eq1445: equi.h equi_miner.h equi_miner.cpp Makefile eq14451: equi.h equi_miner.h equi_miner.cpp Makefile $(GPP) -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o eq14451 -eq1445avx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -DATOMIC -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx2 +eq1445x4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -DATOMIC -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x4 -eq1445avx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx21 +eq1445x41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x41 dev: equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile $(GPP) -mavx2 -DATOMIC dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev @@ -83,4 +86,4 @@ blake2-asm/asm/zcblake2_avx2.o: make -C blake2-asm clean: - make -C blake2b clean && rm -f dev dev1 equi equi1 eqavx2 eqavx21 equi1g eq1445 eq14451 eq1445avx2 eq1445avx21 eqcuda eqcuda1445 verify + make -C blake2b clean && rm -f dev dev1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify diff --git a/blake2-avx2/blake2bip.c b/blake2-avx2/blake2bip.c index e8288de..f701e1a 100644 --- a/blake2-avx2/blake2bip.c +++ b/blake2-avx2/blake2bip.c @@ -202,6 +202,28 @@ ALIGN(64) static const uint32_t indices[12][16] = { BLAKE2B_G_V4(m, r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ } while(0) +#define BLAKE2B_G_V8(m, r, i, a, b, c, d, e, f, g, h) do { \ + a = ADD(a, LOAD((uint8_t const *)(m ) + blake2b_sigma[r][2*i+0])); \ + e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+0])); \ + a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT32(d); h = ROT32(h); \ + c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT24(b); f = ROT24(f); \ + a = ADD(a, LOAD((uint8_t const *)(m ) + blake2b_sigma[r][2*i+1])); \ + e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+1])); \ + a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT16(d); h = ROT16(h); \ + c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT63(b); f = ROT63(f); \ +} while(0) + +#define BLAKE2B_ROUND_V8(v, m, r) do { \ + BLAKE2B_G_V8(m, r, 0, v[ 0], v[ 4], v[ 8], v[12], v[16], v[20], v[24], v[28]); \ + BLAKE2B_G_V8(m, r, 1, v[ 1], v[ 5], v[ 9], v[13], v[17], v[21], v[25], v[29]); \ + BLAKE2B_G_V8(m, r, 2, v[ 2], v[ 6], v[10], v[14], v[18], v[22], v[26], v[30]); \ + BLAKE2B_G_V8(m, r, 3, v[ 3], v[ 7], v[11], v[15], v[19], v[23], v[27], v[31]); \ + BLAKE2B_G_V8(m, r, 4, v[ 0], v[ 5], v[10], v[15], v[16], v[21], v[26], v[31]); \ + BLAKE2B_G_V8(m, r, 5, v[ 1], v[ 6], v[11], v[12], v[17], v[22], v[27], v[28]); \ + BLAKE2B_G_V8(m, r, 6, v[ 2], v[ 7], v[ 8], v[13], v[18], v[23], v[24], v[29]); \ + BLAKE2B_G_V8(m, r, 7, v[ 3], v[ 4], v[ 9], v[14], v[19], v[20], v[25], v[30]); \ +} while(0) + #if defined(PERMUTE_WITH_GATHER) #define BLAKE2B_LOADMSG_V4(w, m) do { \ int i; \ @@ -345,3 +367,51 @@ void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) { memcpy(out, s, 256); } + +void blake2bx8_final(const blake2b_state *S, uchar *out, u32 blockidx) { +#if 0 + blake2bx4_final(S, out, 2*blockidx); + blake2bx4_final(S, out+256, 2*blockidx+1); +#else + __m256i v[32], s[16], iv[8], w[32], counter, flag; + uint32_t b, i, r; + + ALIGN(64) uint8_t buffer[8 * BLAKE2B_BLOCKBYTES]; + memset(buffer, 0, 8 * BLAKE2B_BLOCKBYTES); + for (i = 0; i < 8; i++) { + memcpy(buffer+128*i, S->buf, S->buflen); + b = htole32(8 * blockidx + i); + memcpy(buffer+128*i + S->buflen, &b, sizeof(uint32_t)); + } + + for(i = 0; i < 8; ++i) { + v[16+i] = v[i] = iv[i] = _mm256_set1_epi64x(S->h[i]); + } + + counter = _mm256_set1_epi64x(128 + S->buflen + sizeof(uint32_t)); + flag = _mm256_set1_epi64x(~0); + + v[24] = v[ 8] = _mm256_set1_epi64x(blake2b_IV[0]); + v[25] = v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]); + v[26] = v[10] = _mm256_set1_epi64x(blake2b_IV[2]); + v[27] = v[11] = _mm256_set1_epi64x(blake2b_IV[3]); + v[28] = v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter); + v[29] = v[13] = _mm256_set1_epi64x(blake2b_IV[5]); + v[30] = v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag); + v[31] = v[15] = _mm256_set1_epi64x(blake2b_IV[7]); + BLAKE2B_LOADMSG_V4(w, buffer); + BLAKE2B_LOADMSG_V4((w+16), (buffer+512)); + for(r = 0; r < 12; ++r) { + BLAKE2B_ROUND_V8(v, w, r); + } + for(i = 0; i < 8; ++i) { + v[ i] = XOR(XOR(v[ i], v[ 8+i]), iv[i]); + v[16+i] = XOR(XOR(v[16+i], v[24+i]), iv[i]); + } + + BLAKE2B_UNPACK_STATE_V4( s , v ); + BLAKE2B_UNPACK_STATE_V4((s+8), (v+16)); + + memcpy(out, s, 8*64); +#endif +} diff --git a/equi_miner.cpp b/equi_miner.cpp index 63a7f03..0613cee 100644 --- a/equi_miner.cpp +++ b/equi_miner.cpp @@ -62,12 +62,7 @@ int main(int argc, char **argv) { thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx)); assert(threads); equi eq(nthreads); - printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000); -#ifdef USE_AVX2 - printf(" and AVX2 intrinsics to compute 4-way blake2b\n"); -#else - printf(" and no AVX2\n"); -#endif + printf("Using %dMB of memory and %d-way blake2b\n", 1 + eq.hta.alloced / 0x100000, NBLAKES); u32 sumnsols = 0; char headernonce[HEADERNONCELEN]; u32 hdrlen = strlen(header); diff --git a/equi_miner.h b/equi_miner.h index 69be380..1f04c50 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -580,31 +580,34 @@ struct equi { } }; -#ifdef USE_AVX2 -static const u32 BLAKESINPARALLEL = 4; -#else -static const u32 BLAKESINPARALLEL = 1; +#ifndef NBLAKES +#define NBLAKES 1 #endif -// number of hashes extracted from BLAKESINPARALLEL blake2b outputs -static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE; + +// number of hashes extracted from NBLAKES blake2b outputs +static const u32 HASHESPERBLOCK = NBLAKES*HASHESPERBLAKE; // number of blocks of parallel blake2b calls static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; void digit0(const u32 id) { htlayout htl(this, 0); const u32 hashbytes = hashsize(0); - uchar hashes[BLAKESINPARALLEL * 64]; + uchar hashes[NBLAKES * 64]; blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster for (u32 block = id; block < NBLOCKS; block += nthreads) { -#ifdef USE_AVX2 +#if NBLAKES == 4 blake2bx4_final(&state0, hashes, block); -#else +#elif NBLAKES == 8 + blake2bx8_final(&state0, hashes, block); +#elif NBLAKES == 1 blake2b_state state = state0; // make another copy since blake2b_final modifies it u32 leb = htole32(block); blake2b_update(&state, (uchar *)&leb, sizeof(u32)); blake2b_final(&state, hashes, HASHOUT); +#else +#error not implemented #endif - for (u32 i = 0; ibytes-hashbytes, ph+WN/8-hashbytes, hashbytes); // round 0 tags store hash-generating index - s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j); + s->tag = tree((block * NBLAKES + i) * HASHESPERBLAKE + j); } } }