separate make targets for AVX2
This commit is contained in:
parent
a2ccd7a287
commit
d3454d9228
14
Makefile
14
Makefile
|
@ -4,11 +4,17 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
|
||||||
|
|
||||||
all: equi equi1 verify test spark test1445
|
all: equi equi1 verify test spark test1445
|
||||||
|
|
||||||
equi: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
|
equi: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||||
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi
|
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi
|
||||||
|
|
||||||
equi1: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
|
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||||
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi1
|
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
|
||||||
|
|
||||||
|
eqavx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
|
||||||
|
$(GPP) -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
|
||||||
|
|
||||||
|
eqavx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
|
||||||
|
$(GPP) -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21
|
||||||
|
|
||||||
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
|
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||||
g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
|
g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
|
||||||
|
|
16
README.md
16
README.md
|
@ -51,10 +51,12 @@ More detailed documentation is available in the equi_miner.h source code.
|
||||||
|
|
||||||
Performance summary (on 4GHz i7-4790K and NVidia GTX980):
|
Performance summary (on 4GHz i7-4790K and NVidia GTX980):
|
||||||
|
|
||||||
- equi1: 4.6 Sol/s - 5.9 Sol/s (with AVX2)
|
- equi1: 4.6 Sol/s
|
||||||
- equi -t 8: 16.7 Sol/s
|
- eqavx21: 5.9 Sol/s
|
||||||
- 8 x equi1: 20.3 Sol/s
|
- equi -t 8: 4.6 Sol/s
|
||||||
- dev1: 6.5 Sol/s (xenoncat's blake)
|
- eqavx2 -t 8: TBA Sol/s
|
||||||
- 8 x dev1: 20.6 Sol/s
|
- 8 x equi1: 20.3 Sol/s
|
||||||
- dev -t 8: 17.2 Sol/s
|
- dev1: 6.5 Sol/s (xenoncat's blake)
|
||||||
- eqcuda: 23.6 Sol/s
|
- 8 x dev1: 20.6 Sol/s
|
||||||
|
- dev -t 8: 17.2 Sol/s
|
||||||
|
- eqcuda: 23.6 Sol/s
|
||||||
|
|
18
dev_miner.h
18
dev_miner.h
|
@ -83,14 +83,6 @@ typedef u32 au32;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __AVX2__
|
|
||||||
#define BLAKESINPARALLEL 4
|
|
||||||
#elif defined __AVX__
|
|
||||||
#define BLAKESINPARALLEL 2
|
|
||||||
#else
|
|
||||||
#define BLAKESINPARALLEL 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// number of buckets
|
// number of buckets
|
||||||
static const u32 NBUCKETS = 1<<BUCKBITS;
|
static const u32 NBUCKETS = 1<<BUCKBITS;
|
||||||
// bucket mask
|
// bucket mask
|
||||||
|
@ -105,10 +97,6 @@ static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
|
||||||
static const u32 SLOTMASK = SLOTRANGE-1;
|
static const u32 SLOTMASK = SLOTRANGE-1;
|
||||||
// number of possible values of xhash (rest of n) bits
|
// number of possible values of xhash (rest of n) bits
|
||||||
static const u32 NRESTS = 1<<RESTBITS;
|
static const u32 NRESTS = 1<<RESTBITS;
|
||||||
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
|
|
||||||
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
|
|
||||||
// number of blocks of parallel blake2b calls
|
|
||||||
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
|
|
||||||
// nothing larger found in 100000 runs
|
// nothing larger found in 100000 runs
|
||||||
static const u32 MAXSOLS = 8;
|
static const u32 MAXSOLS = 8;
|
||||||
|
|
||||||
|
@ -532,6 +520,12 @@ struct equi {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// number of blake2b outputs produced by one parallel blake2b call
static const u32 BLAKESINPARALLEL = 4;
|
||||||
|
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
|
||||||
|
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
|
||||||
|
// number of blocks of parallel blake2b calls
|
||||||
|
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
|
||||||
|
|
||||||
void digit0(const u32 id) {
|
void digit0(const u32 id) {
|
||||||
htlayout htl(this, 0);
|
htlayout htl(this, 0);
|
||||||
#ifndef HASHONLY
|
#ifndef HASHONLY
|
||||||
|
|
|
@ -1,84 +0,0 @@
|
||||||
// Wagner's algorithm for Generalized Birthday Paradox, a memory-hard proof-of-work
|
|
||||||
// Copyright (c) 2016 John Tromp
|
|
||||||
|
|
||||||
#include "equi_dev_miner.h"
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
int nthreads = 1;
|
|
||||||
int nonce = 0;
|
|
||||||
int range = 1;
|
|
||||||
bool showsol = false;
|
|
||||||
const char *header = "";
|
|
||||||
int c;
|
|
||||||
while ((c = getopt (argc, argv, "h:n:r:t:s")) != -1) {
|
|
||||||
switch (c) {
|
|
||||||
case 'h':
|
|
||||||
header = optarg;
|
|
||||||
break;
|
|
||||||
case 'n':
|
|
||||||
nonce = atoi(optarg);
|
|
||||||
break;
|
|
||||||
case 'r':
|
|
||||||
range = atoi(optarg);
|
|
||||||
break;
|
|
||||||
case 's':
|
|
||||||
showsol = true;
|
|
||||||
break;
|
|
||||||
case 't':
|
|
||||||
nthreads = atoi(optarg);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifndef XWITHASH
|
|
||||||
if (sizeof(tree) > 4)
|
|
||||||
printf("WARNING: please compile with -DXWITHASH to shrink tree!\n");
|
|
||||||
#endif
|
|
||||||
#ifdef ATOMIC
|
|
||||||
if (nthreads==1)
|
|
||||||
printf("WARNING: use of atomics hurts single threaded performance!\n");
|
|
||||||
#else
|
|
||||||
assert(nthreads==1);
|
|
||||||
#endif
|
|
||||||
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
|
|
||||||
if (range > 1)
|
|
||||||
printf("-%d", nonce+range-1);
|
|
||||||
printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
|
|
||||||
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
|
|
||||||
assert(threads);
|
|
||||||
equi eq(nthreads);
|
|
||||||
printf("Using %dMB of memory\n", 1 + eq.hta.alloced / 0x100000);
|
|
||||||
u32 sumnsols = 0;
|
|
||||||
char headernonce[HEADERNONCELEN];
|
|
||||||
u32 hdrlen = strlen(header);
|
|
||||||
memcpy(headernonce, header, hdrlen);
|
|
||||||
memset(headernonce+hdrlen, 0, sizeof(headernonce)-hdrlen);
|
|
||||||
for (int r = 0; r < range; r++) {
|
|
||||||
((u32 *)headernonce)[32] = htole32(nonce+r);
|
|
||||||
eq.setheadernonce(headernonce, sizeof(headernonce));
|
|
||||||
for (int t = 0; t < nthreads; t++) {
|
|
||||||
threads[t].id = t;
|
|
||||||
threads[t].eq = &eq;
|
|
||||||
int err = pthread_create(&threads[t].thread, NULL, worker, (void *)&threads[t]);
|
|
||||||
assert(err == 0);
|
|
||||||
}
|
|
||||||
for (int t = 0; t < nthreads; t++) {
|
|
||||||
int err = pthread_join(threads[t].thread, NULL);
|
|
||||||
assert(err == 0);
|
|
||||||
}
|
|
||||||
u32 nsols = 0;
|
|
||||||
for (unsigned s = 0; s < eq.nsols; s++) {
|
|
||||||
nsols++;
|
|
||||||
if (showsol) {
|
|
||||||
printf("\nSolution");
|
|
||||||
for (u32 i = 0; i < PROOFSIZE; i++)
|
|
||||||
printf(" %jx", (uintmax_t)eq.sols[s][i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
printf("\n%d solutions\n", nsols);
|
|
||||||
sumnsols += nsols;
|
|
||||||
}
|
|
||||||
free(threads);
|
|
||||||
printf("%d total solutions\n", sumnsols);
|
|
||||||
return 0;
|
|
||||||
}
|
|
1025
equi_dev_miner.h
1025
equi_dev_miner.h
File diff suppressed because it is too large
Load Diff
|
@ -48,7 +48,7 @@ int main(int argc, char **argv) {
|
||||||
assert(threads);
|
assert(threads);
|
||||||
equi eq(nthreads);
|
equi eq(nthreads);
|
||||||
printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
|
printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
|
||||||
#ifdef __AVX2__
|
#ifdef USE_AVX2
|
||||||
printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
|
printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
|
||||||
#else
|
#else
|
||||||
printf("; no AVX2 detected\n");
|
printf("; no AVX2 detected\n");
|
||||||
|
|
|
@ -564,7 +564,7 @@ struct equi {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef USE_AVX2
|
||||||
static const u32 BLAKESINPARALLEL = 4;
|
static const u32 BLAKESINPARALLEL = 4;
|
||||||
#else
|
#else
|
||||||
static const u32 BLAKESINPARALLEL = 1;
|
static const u32 BLAKESINPARALLEL = 1;
|
||||||
|
@ -580,7 +580,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
|
||||||
uchar hashes[BLAKESINPARALLEL * 64];
|
uchar hashes[BLAKESINPARALLEL * 64];
|
||||||
blake2b_state state0 = blake_ctx;
|
blake2b_state state0 = blake_ctx;
|
||||||
for (u32 block = id; block < NBLOCKS; block += nthreads) {
|
for (u32 block = id; block < NBLOCKS; block += nthreads) {
|
||||||
#ifdef __AVX2__
|
#ifdef USE_AVX2
|
||||||
blake2bip_final(&state0, hashes, block);
|
blake2bip_final(&state0, hashes, block);
|
||||||
#else
|
#else
|
||||||
blake2b_state state = state0;
|
blake2b_state state = state0;
|
||||||
|
|
Loading…
Reference in New Issue