From 525a16a1ff04dbb54a9669487d8bbbc83d252f11 Mon Sep 17 00:00:00 2001
From: John Tromp <john.tromp@gmail.com>
Date: Fri, 11 Nov 2016 23:17:01 -0500
Subject: [PATCH] replace USE_AVX2 by NBLAKES; try x8 as well as x4

---
 Makefile                | 21 +++++++------
 blake2-avx2/blake2bip.c | 70 +++++++++++++++++++++++++++++++++++++++++
 equi_miner.cpp          |  7 +----
 equi_miner.h            | 25 ++++++++-------
 4 files changed, 97 insertions(+), 26 deletions(-)

diff --git a/Makefile b/Makefile
index 0142a42..125c19d 100644
--- a/Makefile
+++ b/Makefile
@@ -10,11 +10,14 @@ equi:	equi.h equi_miner.h equi_miner.cpp Makefile
 equi1:	equi.h equi_miner.h equi_miner.cpp Makefile
 	$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
 
-eqavx2:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -mavx2 -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
+equix4:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=4 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4
 
-eqavx21:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -mavx2 -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21
+equix41:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=4 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41
+
+equix81:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=8 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41
 
 equi1g:	equi.h equi_miner.h equi_miner.cpp Makefile
 	g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
@@ -25,11 +28,11 @@ eq1445:	equi.h equi_miner.h equi_miner.cpp Makefile
 eq14451:	equi.h equi_miner.h equi_miner.cpp Makefile
 	$(GPP) -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o eq14451
 
-eq1445avx2:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -DATOMIC -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx2
+eq1445x4:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -DATOMIC -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x4
 
-eq1445avx21:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx21
+eq1445x41:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x41
 
 dev:	equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile
 	$(GPP) -mavx2 -DATOMIC dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev
@@ -83,4 +86,4 @@ blake2-asm/asm/zcblake2_avx2.o:
 	make -C blake2-asm
 
 clean:	
-	make -C blake2b clean && rm -f dev dev1 equi equi1 eqavx2 eqavx21 equi1g eq1445 eq14451 eq1445avx2 eq1445avx21 eqcuda eqcuda1445 verify
+	make -C blake2b clean && rm -f dev dev1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify
diff --git a/blake2-avx2/blake2bip.c b/blake2-avx2/blake2bip.c
index e8288de..f701e1a 100644
--- a/blake2-avx2/blake2bip.c
+++ b/blake2-avx2/blake2bip.c
@@ -202,6 +202,28 @@ ALIGN(64) static const uint32_t indices[12][16] = {
   BLAKE2B_G_V4(m, r, 7, v[ 3], v[ 4], v[ 9], v[14]);  \
 } while(0)
 
+#define BLAKE2B_G_V8(m, r, i, a, b, c, d, e, f, g, h) do { \
+  a = ADD(a, LOAD((uint8_t const *)(m   ) + blake2b_sigma[r][2*i+0]));  \
+  e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+0]));  \
+  a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT32(d); h = ROT32(h); \
+  c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT24(b); f = ROT24(f); \
+  a = ADD(a, LOAD((uint8_t const *)(m   ) + blake2b_sigma[r][2*i+1]));  \
+  e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+1]));  \
+  a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT16(d); h = ROT16(h); \
+  c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT63(b); f = ROT63(f); \
+} while(0)
+
+#define BLAKE2B_ROUND_V8(v, m, r) do {  \
+  BLAKE2B_G_V8(m, r, 0,   v[ 0], v[ 4], v[ 8], v[12],   v[16], v[20], v[24], v[28]);  \
+  BLAKE2B_G_V8(m, r, 1,   v[ 1], v[ 5], v[ 9], v[13],   v[17], v[21], v[25], v[29]);  \
+  BLAKE2B_G_V8(m, r, 2,   v[ 2], v[ 6], v[10], v[14],   v[18], v[22], v[26], v[30]);  \
+  BLAKE2B_G_V8(m, r, 3,   v[ 3], v[ 7], v[11], v[15],   v[19], v[23], v[27], v[31]);  \
+  BLAKE2B_G_V8(m, r, 4,   v[ 0], v[ 5], v[10], v[15],   v[16], v[21], v[26], v[31]);  \
+  BLAKE2B_G_V8(m, r, 5,   v[ 1], v[ 6], v[11], v[12],   v[17], v[22], v[27], v[28]);  \
+  BLAKE2B_G_V8(m, r, 6,   v[ 2], v[ 7], v[ 8], v[13],   v[18], v[23], v[24], v[29]);  \
+  BLAKE2B_G_V8(m, r, 7,   v[ 3], v[ 4], v[ 9], v[14],   v[19], v[20], v[25], v[30]);  \
+} while(0)
+
 #if defined(PERMUTE_WITH_GATHER)
 #define BLAKE2B_LOADMSG_V4(w, m) do {             \
   int i;                                          \
@@ -345,3 +367,51 @@ void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) {
 
   memcpy(out, s, 256);
 }
+
+void blake2bx8_final(const blake2b_state *S, uchar *out, u32 blockidx) {
+#if 0
+  blake2bx4_final(S, out,     2*blockidx);
+  blake2bx4_final(S, out+256, 2*blockidx+1);
+#else
+  __m256i v[32], s[16], iv[8], w[32], counter, flag;
+  uint32_t b, i, r;
+
+  ALIGN(64) uint8_t buffer[8 * BLAKE2B_BLOCKBYTES];
+  memset(buffer, 0, 8 * BLAKE2B_BLOCKBYTES);
+  for (i = 0; i < 8; i++) {
+    memcpy(buffer+128*i, S->buf, S->buflen);
+    b = htole32(8 * blockidx + i);
+    memcpy(buffer+128*i + S->buflen, &b, sizeof(uint32_t));
+  }
+
+  for(i = 0; i < 8; ++i) {
+    v[16+i] = v[i] = iv[i] = _mm256_set1_epi64x(S->h[i]);
+  }
+
+  counter = _mm256_set1_epi64x(128 + S->buflen + sizeof(uint32_t));
+  flag    = _mm256_set1_epi64x(~0);
+
+  v[24] = v[ 8] = _mm256_set1_epi64x(blake2b_IV[0]);
+  v[25] = v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]);
+  v[26] = v[10] = _mm256_set1_epi64x(blake2b_IV[2]);
+  v[27] = v[11] = _mm256_set1_epi64x(blake2b_IV[3]);
+  v[28] = v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter);
+  v[29] = v[13] = _mm256_set1_epi64x(blake2b_IV[5]);
+  v[30] = v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag);
+  v[31] = v[15] = _mm256_set1_epi64x(blake2b_IV[7]);
+  BLAKE2B_LOADMSG_V4(w, buffer);
+  BLAKE2B_LOADMSG_V4((w+16), (buffer+512));
+  for(r = 0; r < 12; ++r) {
+    BLAKE2B_ROUND_V8(v, w, r);
+  }
+  for(i = 0; i < 8; ++i) {
+    v[   i] = XOR(XOR(v[   i], v[ 8+i]), iv[i]);
+    v[16+i] = XOR(XOR(v[16+i], v[24+i]), iv[i]);
+  }
+
+  BLAKE2B_UNPACK_STATE_V4( s   ,  v    );
+  BLAKE2B_UNPACK_STATE_V4((s+8), (v+16));
+
+  memcpy(out, s, 8*64);
+#endif
+}
diff --git a/equi_miner.cpp b/equi_miner.cpp
index 63a7f03..0613cee 100644
--- a/equi_miner.cpp
+++ b/equi_miner.cpp
@@ -62,12 +62,7 @@ int main(int argc, char **argv) {
   thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
   assert(threads);
   equi eq(nthreads);
-  printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
-#ifdef USE_AVX2
-  printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
-#else
-  printf(" and no AVX2\n");
-#endif
+  printf("Using %dMB of memory and %d-way blake2b\n", 1 + eq.hta.alloced / 0x100000, NBLAKES);
   u32 sumnsols = 0;
   char headernonce[HEADERNONCELEN];
   u32 hdrlen = strlen(header);
diff --git a/equi_miner.h b/equi_miner.h
index 69be380..1f04c50 100644
--- a/equi_miner.h
+++ b/equi_miner.h
@@ -580,31 +580,34 @@ struct equi {
     }
   };
 
-#ifdef USE_AVX2
-static const u32 BLAKESINPARALLEL = 4;
-#else
-static const u32 BLAKESINPARALLEL = 1;
+#ifndef NBLAKES
+#define NBLAKES 1
 #endif
-// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
-static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
+
+// number of hashes extracted from NBLAKES blake2b outputs
+static const u32 HASHESPERBLOCK = NBLAKES*HASHESPERBLAKE;
 // number of blocks of parallel blake2b calls
 static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
 
   void digit0(const u32 id) {
     htlayout htl(this, 0);
     const u32 hashbytes = hashsize(0);
-    uchar hashes[BLAKESINPARALLEL * 64];
+    uchar hashes[NBLAKES * 64];
     blake2b_state state0 = blake_ctx;  // local copy on stack can be copied faster
     for (u32 block = id; block < NBLOCKS; block += nthreads) {
-#ifdef USE_AVX2
+#if NBLAKES == 4
       blake2bx4_final(&state0, hashes, block);
-#else
+#elif NBLAKES == 8
+      blake2bx8_final(&state0, hashes, block);
+#elif NBLAKES == 1
       blake2b_state state = state0;  // make another copy since blake2b_final modifies it
       u32 leb = htole32(block);
       blake2b_update(&state, (uchar *)&leb, sizeof(u32));
       blake2b_final(&state, hashes, HASHOUT);
+#else
+#error not implemented
 #endif
-      for (u32 i = 0; i<BLAKESINPARALLEL; i++) {
+      for (u32 i = 0; i<NBLAKES; i++) {
         for (u32 j = 0; j<HASHESPERBLAKE; j++) {
           const uchar *ph = hashes + i * 64 + j * WN/8;
           // figure out bucket for this hash by extracting leading BUCKBITS bits
@@ -628,7 +631,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
           // hash should end right before tag
           memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
           // round 0 tags store hash-generating index
-          s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j);
+          s->tag = tree((block * NBLAKES + i) * HASHESPERBLAKE + j);
         }
       }
     }