obsolete faster by speeding up low-mem versions
This commit is contained in:
parent
c2a84f98b9
commit
f06ff12ed5
|
@ -0,0 +1,11 @@
|
|||
equi
|
||||
equi1
|
||||
equi1g
|
||||
faster
|
||||
faster1
|
||||
equi965
|
||||
equi1445
|
||||
eqcuda
|
||||
eqcuda1445
|
||||
feqcuda
|
||||
verify
|
|
@ -3,9 +3,7 @@ The MIT License (MIT)
|
|||
Copyright (c) 2016 John Tromp
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software, EXCEPT FOR blake2b.cu WHICH ORIGINATES FROM
|
||||
https://github.com/tpruvot/ccminer/blob/windows/sia/sia.cu,
|
||||
and associated documentation files (the "Software"), to deal
|
||||
of this software, and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
|
|
20
Makefile
20
Makefile
|
@ -2,23 +2,17 @@ OPT = -O3
|
|||
FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread
|
||||
GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
|
||||
|
||||
all: equi equi1 faster faster1 verify test spark
|
||||
all: equi equi1 verify test spark
|
||||
|
||||
equi: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi
|
||||
|
||||
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1
|
||||
$(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1
|
||||
|
||||
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
|
||||
|
||||
faster: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster
|
||||
|
||||
faster1: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1
|
||||
|
||||
equi965: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965
|
||||
|
||||
|
@ -37,14 +31,14 @@ feqcuda: equi_miner.cu equi.h blake2b.cu Makefile
|
|||
verify: equi.h equi.c Makefile
|
||||
g++ -g equi.c blake/blake2b.cpp -o verify
|
||||
|
||||
bench: equi
|
||||
time for i in {0..9}; do ./faster -n $$i; done
|
||||
bench: equi1
|
||||
time ./equi1 -n 1000 -r 100
|
||||
|
||||
test: equi verify Makefile
|
||||
time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0
|
||||
|
||||
spark: equi1
|
||||
time ./equi1
|
||||
spark: equi1g
|
||||
time ./equi1g
|
||||
|
||||
clean:
|
||||
rm equi equi1 equi1g faster faster1 equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
|
||||
rm equi equi1 equi1g equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
// Blake2-B CUDA Implementation
|
||||
// tpruvot@github July 2016
|
||||
// permission granted to use under MIT license
|
||||
// modified for use in Zcash by John Tromp September 2016
|
||||
|
||||
/**
|
||||
* uint2 direct ops by c++ operator definitions
|
||||
|
|
|
@ -43,10 +43,11 @@ int main(int argc, char **argv) {
|
|||
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
|
||||
if (range > 1)
|
||||
printf("-%d", nonce+range-1);
|
||||
printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
|
||||
printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
|
||||
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
|
||||
assert(threads);
|
||||
equi eq(nthreads);
|
||||
printf("Using %dMB of memory\n", eq.hta.alloced >> 20);
|
||||
u32 sumnsols = 0;
|
||||
for (int r = 0; r < range; r++) {
|
||||
eq.setnonce(header, nonce+r);
|
||||
|
|
137
equi_miner.h
137
equi_miner.h
|
@ -109,87 +109,52 @@ u32 htunits(u32 bytes) {
|
|||
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
|
||||
}
|
||||
|
||||
#ifdef JOINHT
|
||||
u32 slotsize(const u32 r) {
|
||||
return 1 + htunits(hashsize(r));
|
||||
}
|
||||
// size (in htunits) of bucket in round 0 <= r < WK
|
||||
u32 bucketsize(const u32 r) {
|
||||
return NSLOTS * slotsize(r);
|
||||
}
|
||||
#else
|
||||
u32 slotsize(const u32 r) {
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
// manages hash and tree data
|
||||
struct htalloc {
|
||||
// Defining JOINHT joins each tree with its corresponding hash,
|
||||
// so they may share a cache line. This gives a small speed
|
||||
// advantage but comes at the cost of a big memory increase
|
||||
// as hash-space can no longer be reclaimed
|
||||
#ifdef JOINHT
|
||||
htunit *trees[WK];
|
||||
#else
|
||||
bucket *trees[WK];
|
||||
htunit *hashes[WK];
|
||||
#endif
|
||||
u64 alloced;
|
||||
u32 alloced;
|
||||
htalloc() {
|
||||
alloced = 0;
|
||||
}
|
||||
void alloctrees() {
|
||||
#ifdef JOINHT
|
||||
for (int r=0; r<WK; r++)
|
||||
trees[r] = (htunit *)alloc(NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
|
||||
#else
|
||||
// optimize xenoncat's fixed memory layout, avoiding any waste
|
||||
// digit trees hashes trees
|
||||
// 0 0 A A A A A A . . . . . .
|
||||
// 1 0 A A A A A A B B B B B 1
|
||||
// 2 0 2 C C C C C B B B B B 1
|
||||
// 3 0 2 C C C C C D D D D 3 1
|
||||
// 4 0 2 4 E E E E D D D D 3 1
|
||||
// 5 0 2 4 E E E E F F F 5 3 1
|
||||
// 6 0 2 4 6 . G G F F F 5 3 1
|
||||
// 7 0 2 4 6 . G G H H 7 5 3 1
|
||||
// 8 0 2 4 6 8 . I H H 7 5 3 1
|
||||
// digit trees hashes trees hashes
|
||||
// 0 0 A A A A A A . . . . . .
|
||||
// 1 0 A A A A A A 1 B B B B B
|
||||
// 2 0 2 C C C C C 1 B B B B B
|
||||
// 3 0 2 C C C C C 1 3 D D D D
|
||||
// 4 0 2 4 E E E E 1 3 D D D D
|
||||
// 5 0 2 4 E E E E 1 3 5 F F F
|
||||
// 6 0 2 4 6 . G G 1 3 5 F F F
|
||||
// 7 0 2 4 6 . G G 1 3 5 7 H H
|
||||
// 8 0 2 4 6 8 . I 1 3 5 7 H H
|
||||
assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits
|
||||
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
|
||||
digit *heap = (digit *)alloc(1+units0+units1+1, sizeof(digit));
|
||||
for (int r=0; r<WK; r++) {
|
||||
trees[r] = (bucket *)(heap + (r&1 ? 1+units0+units1-r/2 : r/2));
|
||||
hashes[r] = (htunit *)(heap + (r&1 ? 1+units0 : 1+r/2));
|
||||
}
|
||||
#endif
|
||||
digit *heap[2];
|
||||
for (u32 i =0; i < 2; i++)
|
||||
heap[i] = (digit *)alloc(1 + htunits(hashsize(i)), sizeof(digit));
|
||||
for (int r=0; r<WK; r++)
|
||||
trees[r] = (htunit *)heap[r&1] + r/2;
|
||||
}
|
||||
void dealloctrees() {
|
||||
#ifdef JOINHT
|
||||
for (int r=0; r<WK; r++)
|
||||
dealloc(trees[r], NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
|
||||
#else
|
||||
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
|
||||
dealloc(trees[0], 1+units0+units1+1, sizeof(digit));
|
||||
#endif
|
||||
for (u32 i =0; i < 2; i++)
|
||||
free(trees[i]);
|
||||
}
|
||||
u32 slotsize(const u32 r) const {
|
||||
return 1 + htunits(hashsize(r&1));
|
||||
}
|
||||
// size (in htunits) of bucket in round 0 <= r < WK
|
||||
u32 bucketsize(const u32 r) const {
|
||||
return NSLOTS * slotsize(r);
|
||||
}
|
||||
htunit *getbucket(u32 r, u32 bid) const {
|
||||
#ifdef JOINHT
|
||||
return &trees[r][bid * bucketsize(r)];
|
||||
#else
|
||||
return trees[r][bid];
|
||||
#endif
|
||||
}
|
||||
void *alloc(const u32 n, const u32 sz) {
|
||||
void *mem = calloc(n, sz);
|
||||
assert(mem);
|
||||
alloced += (u64)n * sz;
|
||||
alloced += n * sz;
|
||||
return mem;
|
||||
}
|
||||
void dealloc(void *mem, const u32 n, const u32 sz) {
|
||||
free(mem);
|
||||
alloced -= (u64)n * sz;
|
||||
}
|
||||
};
|
||||
|
||||
typedef au32 bsizes[NBUCKETS];
|
||||
|
@ -249,8 +214,8 @@ struct equi {
|
|||
const htunit *bt = hta.getbucket(--r,t.bucketid);
|
||||
const u32 size = 1 << r;
|
||||
u32 *indices1 = indices + size;
|
||||
listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
|
||||
listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
|
||||
listindices(r, bt[t.slotid0 * hta.slotsize(r)].attr, indices);
|
||||
listindices(r, bt[t.slotid1 * hta.slotsize(r)].attr, indices1);
|
||||
if (*indices > *indices1) {
|
||||
for (u32 i=0; i < size; i++) {
|
||||
const u32 tmp = indices[i];
|
||||
|
@ -292,7 +257,7 @@ struct equi {
|
|||
printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE);
|
||||
#endif
|
||||
}
|
||||
printf(" %ld MB\n", hta.alloced >> 20);
|
||||
printf("\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -300,65 +265,47 @@ struct equi {
|
|||
htalloc hta;
|
||||
u32 prevhtunits;
|
||||
u32 nexthtunits;
|
||||
u32 prevslotunits;
|
||||
u32 nextslotunits;
|
||||
u32 dunits;
|
||||
u32 prevbo;
|
||||
u32 nextbo;
|
||||
htunit *buck;
|
||||
htunit *hashbase;
|
||||
|
||||
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
|
||||
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), prevslotunits(0), dunits(0) {
|
||||
u32 nexthashbytes = hashsize(r);
|
||||
nexthtunits = htunits(nexthashbytes);
|
||||
nextslotunits = 1 + htunits(hashsize(r&1));
|
||||
prevbo = 0;
|
||||
nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
|
||||
if (r) {
|
||||
u32 prevhashbytes = hashsize(r-1);
|
||||
prevhtunits = htunits(prevhashbytes);
|
||||
prevslotunits = 1 + htunits(hashsize((r-1)&1));
|
||||
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
|
||||
dunits = prevhtunits - nexthtunits;
|
||||
}
|
||||
#ifdef JOINHT
|
||||
nexthtunits++;
|
||||
prevhtunits++;
|
||||
#endif
|
||||
}
|
||||
void setbucket(u32 r, u32 bid) {
|
||||
buck = hta.getbucket(r, bid);
|
||||
#ifdef JOINHT
|
||||
hashbase = buck + 1;
|
||||
#else
|
||||
hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits;
|
||||
#endif
|
||||
}
|
||||
u32 getxhash(const u32 slot, const htunit *hash) const {
|
||||
#ifdef XWITHASH
|
||||
return hash->bytes[prevbo] & 0xf;
|
||||
#elif defined JOINHT
|
||||
return buck[slot * prevhtunits].attr.xhash;
|
||||
#else
|
||||
return buck[slot].attr.xhash;
|
||||
#endif
|
||||
}
|
||||
u32 prevhashunits() const {
|
||||
#ifdef JOINHT
|
||||
return prevhtunits - 1;
|
||||
#else
|
||||
return prevhtunits;
|
||||
return buck[slot * prevslotunits].attr.xhash;
|
||||
#endif
|
||||
}
|
||||
bool equal(const htunit *hash0, const htunit *hash1) const {
|
||||
return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
|
||||
return hash0[prevhtunits-1].hash == hash1[prevhtunits-1].hash;
|
||||
}
|
||||
htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
|
||||
htunit *buck = hta.getbucket(r,bid);
|
||||
#ifdef JOINHT
|
||||
htunit *slotree = buck + slot * nexthtunits;
|
||||
htunit *slotree = buck + slot * nextslotunits;
|
||||
slotree->attr = t;
|
||||
return slotree + 1;
|
||||
#else
|
||||
buck[slot].attr = t;
|
||||
return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -467,14 +414,14 @@ struct equi {
|
|||
htl.setbucket(r-1, bucketid);
|
||||
u32 bsize = getnslots(r-1, bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
|
||||
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
|
||||
if (!cd.addslot(s1, htl.getxhash(s1, hash1))) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
|
||||
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
|
||||
if (htl.equal(hash0, hash1)) {
|
||||
hfull++;
|
||||
continue;
|
||||
|
@ -511,7 +458,7 @@ struct equi {
|
|||
xort.xhash = xhash;
|
||||
#endif
|
||||
htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
|
||||
for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
|
||||
for (u32 i=htl.dunits; i < htl.prevhtunits; i++)
|
||||
xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
|
||||
}
|
||||
}
|
||||
|
@ -526,12 +473,12 @@ struct equi {
|
|||
htl.setbucket(WK-1, bucketid);
|
||||
u32 bsize = getnslots(WK-1, bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
|
||||
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
|
||||
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
|
||||
continue;
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
|
||||
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
|
||||
if (htl.equal(hash0, hash1)) {
|
||||
tree xort; xort.bucketid = bucketid;
|
||||
xort.slotid0 = s0; xort.slotid1 = s1;
|
||||
|
|
Loading…
Reference in New Issue