From 525a16a1ff04dbb54a9669487d8bbbc83d252f11 Mon Sep 17 00:00:00 2001 From: John Tromp Date: Fri, 11 Nov 2016 23:17:01 -0500 Subject: [PATCH] replace USE_AVX2 by NBLAKES; try x8 as well as x4 --- Makefile | 21 +++++++------ blake2-avx2/blake2bip.c | 70 +++++++++++++++++++++++++++++++++++++++++ equi_miner.cpp | 7 +---- equi_miner.h | 25 ++++++++------- 4 files changed, 97 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 0142a42..125c19d 100644 --- a/Makefile +++ b/Makefile @@ -10,11 +10,14 @@ equi: equi.h equi_miner.h equi_miner.cpp Makefile equi1: equi.h equi_miner.h equi_miner.cpp Makefile $(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1 -eqavx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2 +equix4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=4 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4 -eqavx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21 +equix41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=4 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 + +equix81: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=8 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 equi1g: equi.h equi_miner.h equi_miner.cpp Makefile g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g @@ -25,11 +28,11 @@ eq1445: equi.h equi_miner.h equi_miner.cpp Makefile eq14451: equi.h equi_miner.h equi_miner.cpp Makefile $(GPP) -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o eq14451 -eq1445avx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -DATOMIC -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx2 +eq1445x4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -DATOMIC -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x4 -eq1445avx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx21 +eq1445x41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile + $(GPP) -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x41 dev: equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile $(GPP) -mavx2 -DATOMIC dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev @@ -83,4 +86,4 @@ blake2-asm/asm/zcblake2_avx2.o: make -C blake2-asm clean: - make -C blake2b clean && rm -f dev dev1 equi equi1 eqavx2 eqavx21 equi1g eq1445 eq14451 eq1445avx2 eq1445avx21 eqcuda eqcuda1445 verify + make -C blake2b clean && rm -f dev dev1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify diff --git a/blake2-avx2/blake2bip.c b/blake2-avx2/blake2bip.c index e8288de..f701e1a 100644 --- a/blake2-avx2/blake2bip.c +++ b/blake2-avx2/blake2bip.c @@ -202,6 +202,28 @@ ALIGN(64) static const uint32_t indices[12][16] = { BLAKE2B_G_V4(m, r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ } while(0) +#define BLAKE2B_G_V8(m, r, i, a, b, c, d, e, f, g, h) do { \ + a = ADD(a, LOAD((uint8_t const *)(m ) + blake2b_sigma[r][2*i+0])); \ + e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+0])); \ + a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT32(d); h = ROT32(h); \ + c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT24(b); f = ROT24(f); \ + a = ADD(a, LOAD((uint8_t const *)(m ) + blake2b_sigma[r][2*i+1])); \ + e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+1])); \ + a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT16(d); h = ROT16(h); \ + c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT63(b); f = ROT63(f); \ +} while(0) + +#define BLAKE2B_ROUND_V8(v, m, r) do { \ + BLAKE2B_G_V8(m, r, 0, v[ 0], v[ 4], v[ 8], v[12], v[16], v[20], v[24], v[28]); \ + BLAKE2B_G_V8(m, r, 1, v[ 1], v[ 5], v[ 9], v[13], v[17], v[21], v[25], v[29]); \ + BLAKE2B_G_V8(m, r, 2, v[ 2], v[ 6], v[10], v[14], v[18], v[22], v[26], v[30]); \ + BLAKE2B_G_V8(m, r, 3, v[ 3], v[ 7], v[11], v[15], v[19], v[23], v[27], v[31]); \ + BLAKE2B_G_V8(m, r, 4, v[ 0], v[ 5], v[10], v[15], v[16], v[21], v[26], v[31]); \ + BLAKE2B_G_V8(m, r, 5, v[ 1], v[ 6], v[11], v[12], v[17], v[22], v[27], v[28]); \ + BLAKE2B_G_V8(m, r, 6, v[ 2], v[ 7], v[ 8], v[13], v[18], v[23], v[24], v[29]); \ + BLAKE2B_G_V8(m, r, 7, v[ 3], v[ 4], v[ 9], v[14], v[19], v[20], v[25], v[30]); \ +} while(0) + #if defined(PERMUTE_WITH_GATHER) #define BLAKE2B_LOADMSG_V4(w, m) do { \ int i; \ @@ -345,3 +367,51 @@ void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) { memcpy(out, s, 256); } + +void blake2bx8_final(const blake2b_state *S, uchar *out, u32 blockidx) { +#if 0 + blake2bx4_final(S, out, 2*blockidx); + blake2bx4_final(S, out+256, 2*blockidx+1); +#else + __m256i v[32], s[16], iv[8], w[32], counter, flag; + uint32_t b, i, r; + + ALIGN(64) uint8_t buffer[8 * BLAKE2B_BLOCKBYTES]; + memset(buffer, 0, 8 * BLAKE2B_BLOCKBYTES); + for (i = 0; i < 8; i++) { + memcpy(buffer+128*i, S->buf, S->buflen); + b = htole32(8 * blockidx + i); + memcpy(buffer+128*i + S->buflen, &b, sizeof(uint32_t)); + } + + for(i = 0; i < 8; ++i) { + v[16+i] = v[i] = iv[i] = _mm256_set1_epi64x(S->h[i]); + } + + counter = _mm256_set1_epi64x(128 + S->buflen + sizeof(uint32_t)); + flag = _mm256_set1_epi64x(~0); + + v[24] = v[ 8] = _mm256_set1_epi64x(blake2b_IV[0]); + v[25] = v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]); + v[26] = v[10] = _mm256_set1_epi64x(blake2b_IV[2]); + v[27] = v[11] = _mm256_set1_epi64x(blake2b_IV[3]); + v[28] = v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter); + v[29] = v[13] = _mm256_set1_epi64x(blake2b_IV[5]); + v[30] = v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag); + v[31] = v[15] = _mm256_set1_epi64x(blake2b_IV[7]); + BLAKE2B_LOADMSG_V4(w, buffer); + BLAKE2B_LOADMSG_V4((w+16), (buffer+512)); + for(r = 0; r < 12; ++r) { + BLAKE2B_ROUND_V8(v, w, r); + } + for(i = 0; i < 8; ++i) { + v[ i] = XOR(XOR(v[ i], v[ 8+i]), iv[i]); + v[16+i] = XOR(XOR(v[16+i], v[24+i]), iv[i]); + } + + BLAKE2B_UNPACK_STATE_V4( s , v ); + BLAKE2B_UNPACK_STATE_V4((s+8), (v+16)); + + memcpy(out, s, 8*64); +#endif +} diff --git a/equi_miner.cpp b/equi_miner.cpp index 63a7f03..0613cee 100644 --- a/equi_miner.cpp +++ b/equi_miner.cpp @@ -62,12 +62,7 @@ int main(int argc, char **argv) { thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx)); assert(threads); equi eq(nthreads); - printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000); -#ifdef USE_AVX2 - printf(" and AVX2 intrinsics to compute 4-way blake2b\n"); -#else - printf(" and no AVX2\n"); -#endif + printf("Using %dMB of memory and %d-way blake2b\n", 1 + eq.hta.alloced / 0x100000, NBLAKES); u32 sumnsols = 0; char headernonce[HEADERNONCELEN]; u32 hdrlen = strlen(header); diff --git a/equi_miner.h b/equi_miner.h index 69be380..1f04c50 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -580,31 +580,34 @@ struct equi { } }; -#ifdef USE_AVX2 -static const u32 BLAKESINPARALLEL = 4; -#else -static const u32 BLAKESINPARALLEL = 1; +#ifndef NBLAKES +#define NBLAKES 1 #endif -// number of hashes extracted from BLAKESINPARALLEL blake2b outputs -static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE; + +// number of hashes extracted from NBLAKES blake2b outputs +static const u32 HASHESPERBLOCK = NBLAKES*HASHESPERBLAKE; // number of blocks of parallel blake2b calls static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; void digit0(const u32 id) { htlayout htl(this, 0); const u32 hashbytes = hashsize(0); - uchar hashes[BLAKESINPARALLEL * 64]; + uchar hashes[NBLAKES * 64]; blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster for (u32 block = id; block < NBLOCKS; block += nthreads) { -#ifdef USE_AVX2 +#if NBLAKES == 4 blake2bx4_final(&state0, hashes, block); -#else +#elif NBLAKES == 8 + blake2bx8_final(&state0, hashes, block); +#elif NBLAKES == 1 blake2b_state state = state0; // make another copy since blake2b_final modifies it u32 leb = htole32(block); blake2b_update(&state, (uchar *)&leb, sizeof(u32)); blake2b_final(&state, hashes, HASHOUT); +#else +#error not implemented #endif - for (u32 i = 0; ibytes-hashbytes, ph+WN/8-hashbytes, hashbytes); // round 0 tags store hash-generating index - s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j); + s->tag = tree((block * NBLAKES + i) * HASHESPERBLAKE + j); } } }