separate make targets for AVX2

2016-10-27 15:55:09 -04:00 · 2016-10-27 15:55:09 -04:00 · d3454d9228
parent a2ccd7a287
commit d3454d9228
7 changed files with 28 additions and 1135 deletions
--- a/14
+++ b/14
@@ -4,11 +4,17 @@ GPP   = g++ -march=native -m64 -std=c++11 $(FLAGS)

 all:	equi equi1 verify test spark test1445

-equi:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi
+equi:	equi.h equi_miner.h equi_miner.cpp Makefile
+	$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi

-equi1:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi1
+equi1:	equi.h equi_miner.h equi_miner.cpp Makefile
+	$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
+
+eqavx2:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
+
+eqavx21:	equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21

 equi1g:	equi.h equi_miner.h equi_miner.cpp Makefile
 	g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
--- a/README.md
+++ b/README.md
@@ -51,10 +51,12 @@ More detailed documentation is available in the equi_miner.h source code.

 Performance summary (on 4GHz i7-4790K and NVidia GTX980):

- equi1:      4.6 Sol/s - 5.9 Sol/s (with AVX2)
- equi -t 8: 16.7 Sol/s
- 8 x equi1: 20.3 Sol/s
- dev1:       6.5 Sol/s (xenoncat's blake)
- 8 x dev1:  20.6 Sol/s
- dev -t 8:  17.2 Sol/s
- eqcuda:    23.6 Sol/s
+- equi1:        4.6 Sol/s
+- eqavx21:      5.9 Sol/s
+- equi -t 8:    4.6 Sol/s
+- eqavx2 -t 8:  TBA Sol/s
+- 8 x equi1:   20.3 Sol/s
+- dev1:         6.5 Sol/s (xenoncat's blake)
+- 8 x dev1:    20.6 Sol/s
+- dev -t 8:    17.2 Sol/s
+- eqcuda:      23.6 Sol/s
--- a/dev_miner.h
+++ b/dev_miner.h
@@ -83,14 +83,6 @@ typedef u32 au32;
 #endif
 #endif

-#ifdef __AVX2__
-#define BLAKESINPARALLEL 4
-#elif defined __AVX__
-#define BLAKESINPARALLEL 2
-#else
-#define BLAKESINPARALLEL 1
-#endif 
-
 // number of buckets
 static const u32 NBUCKETS = 1<<BUCKBITS;
 // bucket mask
@@ -105,10 +97,6 @@ static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
 static const u32 SLOTMASK = SLOTRANGE-1;
 // number of possible values of xhash (rest of n) bits
 static const u32 NRESTS = 1<<RESTBITS;
-// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
-static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
-// number of blocks of parallel blake2b calls
-static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
 // nothing larger found in 100000 runs
 static const u32 MAXSOLS = 8;

@@ -532,6 +520,12 @@ struct equi {
    }
  };

+  static const u32 BLAKESINPARALLEL = 4; // blake2b outputs produced per parallel call
+  // number of hashes extracted from BLAKESINPARALLEL blake2b outputs
+  static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
+  // number of blocks of parallel blake2b calls
+  static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
+
  void digit0(const u32 id) {
    htlayout htl(this, 0);
 #ifndef HASHONLY
--- a/equi_dev_miner.cpp
+++ b/equi_dev_miner.cpp
@@ -1,84 +0,0 @@
-// Wagner's algorithm for Generalized Birthday Paradox, a memory-hard proof-of-work
-// Copyright (c) 2016 John Tromp
-
-#include "equi_dev_miner.h"
-#include <unistd.h>
-
-int main(int argc, char **argv) {
-  int nthreads = 1;
-  int nonce = 0;
-  int range = 1;
-  bool showsol = false;
-  const char *header = "";
-  int c;
-  while ((c = getopt (argc, argv, "h:n:r:t:s")) != -1) {
-    switch (c) {
-      case 'h':
-        header = optarg;
-        break;
-      case 'n':
-        nonce = atoi(optarg);
-        break;
-      case 'r':
-        range = atoi(optarg);
-        break;
-      case 's':
-        showsol = true;
-        break;
-      case 't':
-        nthreads = atoi(optarg);
-        break;
-    }
-  }
-#ifndef XWITHASH
-  if (sizeof(tree) > 4)
-    printf("WARNING: please compile with -DXWITHASH to shrink tree!\n");
-#endif
-#ifdef ATOMIC
-  if (nthreads==1)
-    printf("WARNING: use of atomics hurts single threaded performance!\n");
-#else
-  assert(nthreads==1);
-#endif
-  printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
-  if (range > 1)
-    printf("-%d", nonce+range-1);
-  printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
-  thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
-  assert(threads);
-  equi eq(nthreads);
-  printf("Using %dMB of memory\n", 1 + eq.hta.alloced / 0x100000);
-  u32 sumnsols = 0;
-  char headernonce[HEADERNONCELEN];
-  u32 hdrlen = strlen(header);
-  memcpy(headernonce, header, hdrlen);
-  memset(headernonce+hdrlen, 0, sizeof(headernonce)-hdrlen);
-  for (int r = 0; r < range; r++) {
-    ((u32 *)headernonce)[32] = htole32(nonce+r);
-    eq.setheadernonce(headernonce, sizeof(headernonce));
-    for (int t = 0; t < nthreads; t++) {
-      threads[t].id = t;
-      threads[t].eq = &eq;
-      int err = pthread_create(&threads[t].thread, NULL, worker, (void *)&threads[t]);
-      assert(err == 0);
-    }
-    for (int t = 0; t < nthreads; t++) {
-      int err = pthread_join(threads[t].thread, NULL);
-      assert(err == 0);
-    }
-    u32 nsols = 0;
-    for (unsigned s = 0; s < eq.nsols; s++) {
-      nsols++;
-      if (showsol) {
-        printf("\nSolution");
-        for (u32 i = 0; i < PROOFSIZE; i++)
-          printf(" %jx", (uintmax_t)eq.sols[s][i]);
-      }
-    }
-    printf("\n%d solutions\n", nsols);
-    sumnsols += nsols;
-  }
-  free(threads);
-  printf("%d total solutions\n", sumnsols);
-  return 0;
-}
--- a/equi_dev_miner.h
+++ b/equi_dev_miner.h
--- a/equi_miner.cpp
+++ b/equi_miner.cpp
@@ -48,7 +48,7 @@ int main(int argc, char **argv) {
  assert(threads);
  equi eq(nthreads);
  printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
-#ifdef __AVX2__ 
+#ifdef USE_AVX2
  printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
 #else
  printf("; no AVX2 detected\n");
--- a/equi_miner.h
+++ b/equi_miner.h
@@ -564,7 +564,7 @@ struct equi {
    }
  };

-#ifdef __AVX2__
+#ifdef USE_AVX2
 static const u32 BLAKESINPARALLEL = 4;
 #else
 static const u32 BLAKESINPARALLEL = 1;
@@ -580,7 +580,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
    uchar hashes[BLAKESINPARALLEL * 64];
    blake2b_state state0 = blake_ctx;
    for (u32 block = id; block < NBLOCKS; block += nthreads) {
-#ifdef __AVX2__
+#ifdef USE_AVX2
      blake2bip_final(&state0, hashes, block);
 #else
      blake2b_state state = state0;