Merge branch 'master' of github.com:tromp/equihash

merge whatever
2016-11-12 19:12:04 -05:00 · 2016-11-12 19:12:04 -05:00 · dda74a9dd7
parent 74ba1f8e76 525a16a1ff
commit dda74a9dd7
6 changed files with 115 additions and 54 deletions
--- a/21
+++ b/21
@ -10,11 +10,14 @@ equi:	equi.h equi_miner.h equi_miner.cpp Makefile
 equi1:	equi.h equi_miner.h equi_miner.cpp Makefile
 	$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1

-eqavx2:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -mavx2 -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
+equix4:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=4 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4

-eqavx21:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -mavx2 -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21
+equix41:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=4 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41
+
+equix81:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=8 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41

 equi1g:	equi.h equi_miner.h equi_miner.cpp Makefile
 	g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
@ -25,11 +28,11 @@ eq1445:	equi.h equi_miner.h equi_miner.cpp Makefile
 eq14451:	equi.h equi_miner.h equi_miner.cpp Makefile
 	$(GPP) -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o eq14451

-eq1445avx2:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -DATOMIC -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx2
+eq1445x4:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -DATOMIC -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x4

-eq1445avx21:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -mavx2 -DUSE_AVX2 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445avx21
+eq1445x41:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x41

 dev:	equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile
 	$(GPP) -mavx2 -DATOMIC dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev
@ -83,4 +86,4 @@ blake2-asm/asm/zcblake2_avx2.o:
 	make -C blake2-asm

 clean:	
-	make -C blake2b clean && rm -f dev dev1 equi equi1 eqavx2 eqavx21 equi1g eq1445 eq14451 eq1445avx2 eq1445avx21 eqcuda eqcuda1445 verify
+	make -C blake2b clean && rm -f dev dev1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify
--- a/blake2-avx2/blake2bip.c
+++ b/blake2-avx2/blake2bip.c
@ -202,6 +202,28 @@ ALIGN(64) static const uint32_t indices[12][16] = {
  BLAKE2B_G_V4(m, r, 7, v[ 3], v[ 4], v[ 9], v[14]);  \
 } while(0)

+#define BLAKE2B_G_V8(m, r, i, a, b, c, d, e, f, g, h) do { \
+  a = ADD(a, LOAD((uint8_t const *)(m   ) + blake2b_sigma[r][2*i+0]));  \
+  e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+0]));  \
+  a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT32(d); h = ROT32(h); \
+  c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT24(b); f = ROT24(f); \
+  a = ADD(a, LOAD((uint8_t const *)(m   ) + blake2b_sigma[r][2*i+1]));  \
+  e = ADD(e, LOAD((uint8_t const *)(m+16) + blake2b_sigma[r][2*i+1]));  \
+  a = ADD(a, b); e = ADD(e, f); d = XOR(d, a); h = XOR(h, e); d = ROT16(d); h = ROT16(h); \
+  c = ADD(c, d); g = ADD(g, h); b = XOR(b, c); f = XOR(f, g); b = ROT63(b); f = ROT63(f); \
+} while(0)
+
+#define BLAKE2B_ROUND_V8(v, m, r) do {  \
+  BLAKE2B_G_V8(m, r, 0,   v[ 0], v[ 4], v[ 8], v[12],   v[16], v[20], v[24], v[28]);  \
+  BLAKE2B_G_V8(m, r, 1,   v[ 1], v[ 5], v[ 9], v[13],   v[17], v[21], v[25], v[29]);  \
+  BLAKE2B_G_V8(m, r, 2,   v[ 2], v[ 6], v[10], v[14],   v[18], v[22], v[26], v[30]);  \
+  BLAKE2B_G_V8(m, r, 3,   v[ 3], v[ 7], v[11], v[15],   v[19], v[23], v[27], v[31]);  \
+  BLAKE2B_G_V8(m, r, 4,   v[ 0], v[ 5], v[10], v[15],   v[16], v[21], v[26], v[31]);  \
+  BLAKE2B_G_V8(m, r, 5,   v[ 1], v[ 6], v[11], v[12],   v[17], v[22], v[27], v[28]);  \
+  BLAKE2B_G_V8(m, r, 6,   v[ 2], v[ 7], v[ 8], v[13],   v[18], v[23], v[24], v[29]);  \
+  BLAKE2B_G_V8(m, r, 7,   v[ 3], v[ 4], v[ 9], v[14],   v[19], v[20], v[25], v[30]);  \
+} while(0)
+
 #if defined(PERMUTE_WITH_GATHER)
 #define BLAKE2B_LOADMSG_V4(w, m) do {             \
  int i;                                          \
@ -345,3 +367,51 @@ void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) {

  memcpy(out, s, 256);
 }
+
+void blake2bx8_final(const blake2b_state *S, uchar *out, u32 blockidx) {
+#if 0
+  blake2bx4_final(S, out,     2*blockidx);
+  blake2bx4_final(S, out+256, 2*blockidx+1);
+#else
+  __m256i v[32], s[16], iv[8], w[32], counter, flag;
+  uint32_t b, i, r;
+
+  ALIGN(64) uint8_t buffer[8 * BLAKE2B_BLOCKBYTES];
+  memset(buffer, 0, 8 * BLAKE2B_BLOCKBYTES);
+  for (i = 0; i < 8; i++) {
+    memcpy(buffer+128*i, S->buf, S->buflen);
+    b = htole32(8 * blockidx + i);
+    memcpy(buffer+128*i + S->buflen, &b, sizeof(uint32_t));
+  }
+
+  for(i = 0; i < 8; ++i) {
+    v[16+i] = v[i] = iv[i] = _mm256_set1_epi64x(S->h[i]);
+  }
+
+  counter = _mm256_set1_epi64x(128 + S->buflen + sizeof(uint32_t));
+  flag    = _mm256_set1_epi64x(~0);
+
+  v[24] = v[ 8] = _mm256_set1_epi64x(blake2b_IV[0]);
+  v[25] = v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]);
+  v[26] = v[10] = _mm256_set1_epi64x(blake2b_IV[2]);
+  v[27] = v[11] = _mm256_set1_epi64x(blake2b_IV[3]);
+  v[28] = v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter);
+  v[29] = v[13] = _mm256_set1_epi64x(blake2b_IV[5]);
+  v[30] = v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag);
+  v[31] = v[15] = _mm256_set1_epi64x(blake2b_IV[7]);
+  BLAKE2B_LOADMSG_V4(w, buffer);
+  BLAKE2B_LOADMSG_V4((w+16), (buffer+512));
+  for(r = 0; r < 12; ++r) {
+    BLAKE2B_ROUND_V8(v, w, r);
+  }
+  for(i = 0; i < 8; ++i) {
+    v[   i] = XOR(XOR(v[   i], v[ 8+i]), iv[i]);
+    v[16+i] = XOR(XOR(v[16+i], v[24+i]), iv[i]);
+  }
+
+  BLAKE2B_UNPACK_STATE_V4( s   ,  v    );
+  BLAKE2B_UNPACK_STATE_V4((s+8), (v+16));
+
+  memcpy(out, s, 8*64);
+#endif
+}
--- a/dev_miner.h
+++ b/dev_miner.h
@ -347,46 +347,39 @@ struct equi {
    }
    return false;
  }
-  // if dupes != 0, list indices in arbitrary order and return true if dupe found
-  // if dupes == 0, order indices as in Wagner condition
-  bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) {
+  // order indices as in Wagner condition,
+  // and return true if a left and right subtree have identical leftmost leaves
+  bool listindices0(u32 r, const tree t, u32 *indices) {
    if (r == 0) {
-      u32 idx = t.getindex();
-      if (dupes) {
-        u32 bin = idx & (PROOFSIZE-1);
-        if (idx == dupes[bin]) return true;
-        dupes[bin] = idx;
-      }
-      *indices = idx;
+      *indices = t.getindex();
      return false;
    }
    const slot1 *buck = hta.heap1[t.bucketid()];
    const u32 size = 1 << --r;
    u32 tagi = hashwords(hashsize(r));
-    return listindices1(r, buck[t.slotid0()][tagi].tag, indices,      dupes)
-        || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
-        || (!dupes && orderindices(indices, size));
+    return listindices1(r, buck[t.slotid0()][tagi].tag, indices)
+        || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size)
+        || orderindices(indices, size) || indices[0] == indices[size];
  }
-  bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) {
+  bool listindices1(u32 r, const tree t, u32 *indices) {
    const slot0 *buck = hta.heap0[t.bucketid()];
    const u32 size = 1 << --r;
    u32 tagi = hashwords(hashsize(r));
-    return listindices0(r, buck[t.slotid0()][tagi].tag, indices,      dupes)
-        || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
-        || (!dupes && orderindices(indices, size));
+    return listindices0(r, buck[t.slotid0()][tagi].tag, indices)
+        || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size)
+        || orderindices(indices, size) || indices[0] == indices[size];
  }
  void candidate(const tree t) {
-    proof prf, dupes;
-    memset(dupes, 0xffff, sizeof(proof));
-    if (listindices1(WK, t, prf, dupes)) return; // assume WK odd
-    qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
-    for (u32 i=1; i<PROOFSIZE; i++) if (prf[i] <= prf[i-1]) return;
+    proof prf;
+    // listindices combines index tree reconstruction with probably dupe test
+    if (listindices1(WK, t, prf) || duped(prf)) return; // assume WK odd
+    // and now we have ourselves a genuine solution
 #ifdef ATOMIC
    u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
 #else
    u32 soli = nsols++;
 #endif
-    if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); // assume WK odd
+    if (soli < MAXSOLS) memcpy(sols[soli], prf, sizeof(proof));
  }
 #endif
  void showbsizes(u32 r) {
--- a/equi_miner.cpp
+++ b/equi_miner.cpp
@ -62,12 +62,7 @@ int main(int argc, char **argv) {
  thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
  assert(threads);
  equi eq(nthreads);
-  printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
-#ifdef USE_AVX2
-  printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
-#else
-  printf(" and no AVX2\n");
-#endif
+  printf("Using %dMB of memory and %d-way blake2b\n", 1 + eq.hta.alloced / 0x100000, NBLAKES);
  u32 sumnsols = 0;
  char headernonce[HEADERNONCELEN];
  u32 hdrlen = strlen(header);
--- a/equi_miner.cu
+++ b/equi_miner.cu
@ -985,7 +985,6 @@ int main(int argc, char **argv) {
    cudaEventElapsedTime(&duration, start, stop);
      printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f);

-    printf("%d candidate solutions\n", eq.nsols);
    u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols);
    for (s = nsols = 0; s < maxsols; s++) {
      if (duped(sols[s])) {
--- a/equi_miner.h
+++ b/equi_miner.h
@ -394,12 +394,10 @@ struct equi {
    }
    return false;
  }
-  // if dupes != 0, list indices in arbitrary order and return true if dupe found
-  // if dupes == 0, order indices as in Wagner condition
+  // listindices combines index tree reconstruction with probably dupe test
  bool listindices0(u32 r, const tree t, u32 *indices) {
    if (r == 0) {
-      u32 idx = t.getindex();
-      *indices = idx;
+      *indices = t.getindex();
      return false;
    }
    const slot1 *buck = hta.heap1[t.bucketid()];
@ -582,31 +580,34 @@ struct equi {
    }
  };

-#ifdef USE_AVX2
-static const u32 BLAKESINPARALLEL = 4;
-#else
-static const u32 BLAKESINPARALLEL = 1;
+#ifndef NBLAKES
+#define NBLAKES 1
 #endif
-// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
-static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
+
+// number of hashes extracted from NBLAKES blake2b outputs
+static const u32 HASHESPERBLOCK = NBLAKES*HASHESPERBLAKE;
 // number of blocks of parallel blake2b calls
 static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;

  void digit0(const u32 id) {
    htlayout htl(this, 0);
    const u32 hashbytes = hashsize(0);
-    uchar hashes[BLAKESINPARALLEL * 64];
+    uchar hashes[NBLAKES * 64];
    blake2b_state state0 = blake_ctx;  // local copy on stack can be copied faster
    for (u32 block = id; block < NBLOCKS; block += nthreads) {
-#ifdef USE_AVX2
+#if NBLAKES == 4
      blake2bx4_final(&state0, hashes, block);
-#else
+#elif NBLAKES == 8
+      blake2bx8_final(&state0, hashes, block);
+#elif NBLAKES == 1
      blake2b_state state = state0;  // make another copy since blake2b_final modifies it
      u32 leb = htole32(block);
      blake2b_update(&state, (uchar *)&leb, sizeof(u32));
      blake2b_final(&state, hashes, HASHOUT);
+#else
+#error not implemented
 #endif
-      for (u32 i = 0; i<BLAKESINPARALLEL; i++) {
+      for (u32 i = 0; i<NBLAKES; i++) {
        for (u32 j = 0; j<HASHESPERBLAKE; j++) {
          const uchar *ph = hashes + i * 64 + j * WN/8;
          // figure out bucket for this hash by extracting leading BUCKBITS bits
@ -630,7 +631,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
          // hash should end right before tag
          memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
          // round 0 tags store hash-generating index
-          s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j);
+          s->tag = tree((block * NBLAKES + i) * HASHESPERBLAKE + j);
        }
      }
    }