separate make targets for AVX2

This commit is contained in:
tromp 2016-10-27 15:55:09 -04:00
parent a2ccd7a287
commit d3454d9228
7 changed files with 28 additions and 1135 deletions

View File

@ -4,11 +4,17 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
all: equi equi1 verify test spark test1445
equi: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi
equi: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi
equi1: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi1
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
eqavx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
eqavx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g

View File

@ -51,10 +51,12 @@ More detailed documentation is available in the equi_miner.h source code.
Performance summary (on 4GHz i7-4790K and NVidia GTX980):
- equi1: 4.6 Sol/s - 5.9 Sol/s (with AVX2)
- equi -t 8: 16.7 Sol/s
- 8 x equi1: 20.3 Sol/s
- dev1: 6.5 Sol/s (xenoncat's blake)
- 8 x dev1: 20.6 Sol/s
- dev -t 8: 17.2 Sol/s
- eqcuda: 23.6 Sol/s
- equi1: 4.6 Sol/s
- eqavx21: 5.9 Sol/s
- equi -t 8: 4.6 Sol/s
- eqavx2 -t 8: TBA Sol/s
- 8 x equi1: 20.3 Sol/s
- dev1: 6.5 Sol/s (xenoncat's blake)
- 8 x dev1: 20.6 Sol/s
- dev -t 8: 17.2 Sol/s
- eqcuda: 23.6 Sol/s

View File

@ -83,14 +83,6 @@ typedef u32 au32;
#endif
#endif
// number of blake2b instances evaluated in parallel, selected at compile time
// from the SIMD feature macros predefined by the compiler:
// 4 with AVX2, 2 with AVX only, 1 otherwise
#ifdef __AVX2__
#define BLAKESINPARALLEL 4
#elif defined __AVX__
#define BLAKESINPARALLEL 2
#else
#define BLAKESINPARALLEL 1
#endif
// number of buckets
static const u32 NBUCKETS = 1<<BUCKBITS;
// bucket mask
@ -105,10 +97,6 @@ static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
static const u32 SLOTMASK = SLOTRANGE-1;
// number of possible values of xhash (rest of n) bits
static const u32 NRESTS = 1<<RESTBITS;
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
// number of blocks of parallel blake2b calls
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
// nothing larger found in 100000 runs
static const u32 MAXSOLS = 8;
@ -532,6 +520,12 @@ struct equi {
}
};
// number of blake2b hashes computed in parallel
// (a stray "#define" token inside this declaration made it ill-formed)
static const u32 BLAKESINPARALLEL = 4;
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
// number of blocks of parallel blake2b calls (rounded up)
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
void digit0(const u32 id) {
htlayout htl(this, 0);
#ifndef HASHONLY

View File

@ -1,84 +0,0 @@
// Wagner's algorithm for Generalized Birthday Paradox, a memory-hard proof-of-work
// Copyright (c) 2016 John Tromp
#include "equi_dev_miner.h"
#include <unistd.h>
int main(int argc, char **argv) {
int nthreads = 1;
int nonce = 0;
int range = 1;
bool showsol = false;
const char *header = "";
int c;
while ((c = getopt (argc, argv, "h:n:r:t:s")) != -1) {
switch (c) {
case 'h':
header = optarg;
break;
case 'n':
nonce = atoi(optarg);
break;
case 'r':
range = atoi(optarg);
break;
case 's':
showsol = true;
break;
case 't':
nthreads = atoi(optarg);
break;
}
}
#ifndef XWITHASH
if (sizeof(tree) > 4)
printf("WARNING: please compile with -DXWITHASH to shrink tree!\n");
#endif
#ifdef ATOMIC
if (nthreads==1)
printf("WARNING: use of atomics hurts single threaded performance!\n");
#else
assert(nthreads==1);
#endif
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
if (range > 1)
printf("-%d", nonce+range-1);
printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory\n", 1 + eq.hta.alloced / 0x100000);
u32 sumnsols = 0;
char headernonce[HEADERNONCELEN];
u32 hdrlen = strlen(header);
memcpy(headernonce, header, hdrlen);
memset(headernonce+hdrlen, 0, sizeof(headernonce)-hdrlen);
for (int r = 0; r < range; r++) {
((u32 *)headernonce)[32] = htole32(nonce+r);
eq.setheadernonce(headernonce, sizeof(headernonce));
for (int t = 0; t < nthreads; t++) {
threads[t].id = t;
threads[t].eq = &eq;
int err = pthread_create(&threads[t].thread, NULL, worker, (void *)&threads[t]);
assert(err == 0);
}
for (int t = 0; t < nthreads; t++) {
int err = pthread_join(threads[t].thread, NULL);
assert(err == 0);
}
u32 nsols = 0;
for (unsigned s = 0; s < eq.nsols; s++) {
nsols++;
if (showsol) {
printf("\nSolution");
for (u32 i = 0; i < PROOFSIZE; i++)
printf(" %jx", (uintmax_t)eq.sols[s][i]);
}
}
printf("\n%d solutions\n", nsols);
sumnsols += nsols;
}
free(threads);
printf("%d total solutions\n", sumnsols);
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -48,7 +48,7 @@ int main(int argc, char **argv) {
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
#ifdef __AVX2__
#ifdef USE_AVX2
printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
#else
printf("; no AVX2 detected\n");

View File

@ -564,7 +564,7 @@ struct equi {
}
};
#ifdef __AVX2__
#ifdef USE_AVX2
static const u32 BLAKESINPARALLEL = 4;
#else
static const u32 BLAKESINPARALLEL = 1;
@ -580,7 +580,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
uchar hashes[BLAKESINPARALLEL * 64];
blake2b_state state0 = blake_ctx;
for (u32 block = id; block < NBLOCKS; block += nthreads) {
#ifdef __AVX2__
#ifdef USE_AVX2
blake2bip_final(&state0, hashes, block);
#else
blake2b_state state = state0;