obsolete faster by speeding up low-mem versions

tromp 2016-10-16 18:49:03 -04:00
parent c2a84f98b9
commit f06ff12ed5
6 changed files with 65 additions and 112 deletions

.gitignore

@ -0,0 +1,11 @@
equi
equi1
equi1g
faster
faster1
equi965
equi1445
eqcuda
eqcuda1445
feqcuda
verify

LICENSE

@ -3,9 +3,7 @@ The MIT License (MIT)
Copyright (c) 2016 John Tromp
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software, EXCEPT FOR blake2b.cu WHICH ORIGINATES FROM
https://github.com/tpruvot/ccminer/blob/windows/sia/sia.cu,
and associated documentation files (the "Software"), to deal
of this software, and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is

Makefile

@ -2,23 +2,17 @@ OPT = -O3
FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread
GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
all: equi equi1 faster faster1 verify test spark
all: equi equi1 verify test spark
equi: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1
$(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
faster: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster
faster1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1
equi965: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965
@ -37,14 +31,14 @@ feqcuda: equi_miner.cu equi.h blake2b.cu Makefile
verify: equi.h equi.c Makefile
g++ -g equi.c blake/blake2b.cpp -o verify
bench: equi
time for i in {0..9}; do ./faster -n $$i; done
bench: equi1
time ./equi1 -n 1000 -r 100
test: equi verify Makefile
time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0
spark: equi1
time ./equi1
spark: equi1g
time ./equi1g
clean:
rm equi equi1 equi1g faster faster1 equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
rm equi equi1 equi1g equi965 equi1445 eqcuda eqcuda1445 feqcuda verify

blake2b.cu

@ -1,5 +1,7 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
/**
* uint2 direct ops by c++ operator definitions

equi_miner.cpp

@ -43,10 +43,11 @@ int main(int argc, char **argv) {
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
if (range > 1)
printf("-%d", nonce+range-1);
printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory\n", eq.hta.alloced >> 20);
u32 sumnsols = 0;
for (int r = 0; r < range; r++) {
eq.setnonce(header, nonce+r);

equi_miner.h

@ -109,87 +109,52 @@ u32 htunits(u32 bytes) {
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
}
#ifdef JOINHT
u32 slotsize(const u32 r) {
return 1 + htunits(hashsize(r));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) {
return NSLOTS * slotsize(r);
}
#else
u32 slotsize(const u32 r) {
return 1;
}
#endif
// manages hash and tree data
struct htalloc {
// Defining JOINHT joins each tree with its corresponding hash,
// so they may share a cache line. This gives a small speed
// advantage but comes at the cost of a big memory increase
// as hash-space can no longer be reclaimed
#ifdef JOINHT
htunit *trees[WK];
#else
bucket *trees[WK];
htunit *hashes[WK];
#endif
u64 alloced;
u32 alloced;
htalloc() {
alloced = 0;
}
void alloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
trees[r] = (htunit *)alloc(NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
// optimize xenoncat's fixed memory layout, avoiding any waste
// digit trees hashes trees
// 0 0 A A A A A A . . . . . .
// 1 0 A A A A A A B B B B B 1
// 2 0 2 C C C C C B B B B B 1
// 3 0 2 C C C C C D D D D 3 1
// 4 0 2 4 E E E E D D D D 3 1
// 5 0 2 4 E E E E F F F 5 3 1
// 6 0 2 4 6 . G G F F F 5 3 1
// 7 0 2 4 6 . G G H H 7 5 3 1
// 8 0 2 4 6 8 . I H H 7 5 3 1
// digit trees hashes trees hashes
// 0 0 A A A A A A . . . . . .
// 1 0 A A A A A A 1 B B B B B
// 2 0 2 C C C C C 1 B B B B B
// 3 0 2 C C C C C 1 3 D D D D
// 4 0 2 4 E E E E 1 3 D D D D
// 5 0 2 4 E E E E 1 3 5 F F F
// 6 0 2 4 6 . G G 1 3 5 F F F
// 7 0 2 4 6 . G G 1 3 5 7 H H
// 8 0 2 4 6 8 . I 1 3 5 7 H H
assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
digit *heap = (digit *)alloc(1+units0+units1+1, sizeof(digit));
for (int r=0; r<WK; r++) {
trees[r] = (bucket *)(heap + (r&1 ? 1+units0+units1-r/2 : r/2));
hashes[r] = (htunit *)(heap + (r&1 ? 1+units0 : 1+r/2));
}
#endif
digit *heap[2];
for (u32 i =0; i < 2; i++)
heap[i] = (digit *)alloc(1 + htunits(hashsize(i)), sizeof(digit));
for (int r=0; r<WK; r++)
trees[r] = (htunit *)heap[r&1] + r/2;
}
void dealloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
dealloc(trees[r], NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
dealloc(trees[0], 1+units0+units1+1, sizeof(digit));
#endif
for (u32 i =0; i < 2; i++)
free(trees[i]);
}
u32 slotsize(const u32 r) const {
return 1 + htunits(hashsize(r&1));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) const {
return NSLOTS * slotsize(r);
}
htunit *getbucket(u32 r, u32 bid) const {
#ifdef JOINHT
return &trees[r][bid * bucketsize(r)];
#else
return trees[r][bid];
#endif
}
void *alloc(const u32 n, const u32 sz) {
void *mem = calloc(n, sz);
assert(mem);
alloced += (u64)n * sz;
alloced += n * sz;
return mem;
}
void dealloc(void *mem, const u32 n, const u32 sz) {
free(mem);
alloced -= (u64)n * sz;
}
};
typedef au32 bsizes[NBUCKETS];
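
The comment block and diagrams above describe the new two-heap layout in prose; the short standalone sketch below (not part of the commit) just prints where each round's tree and hash land inside a slot. WK, UNITS0 and UNITS1 are hypothetical stand-ins for the real parameters htunits(hashsize(0)) and htunits(hashsize(1)).

// Sketch only: per-slot offsets under trees[r] = heap[r&1] + r/2.
// Round r's tree occupies slot offset r/2, its hash starts at r/2+1,
// and the trees of rounds r-2, r-4, ... survive at offsets r/2-1, r/2-2, ...
#include <cstdio>

int main() {
  const int WK = 9, UNITS0 = 6, UNITS1 = 5;      // hypothetical sizes
  for (int r = 0; r < WK; r++) {
    const int units  = (r & 1) ? UNITS1 : UNITS0;
    const int stride = 1 + units;                // slotsize(r): constant within a heap
    printf("digit %d: heap %d, slot stride %d htunits, tree at offset %d, hash from offset %d\n",
           r, r & 1, stride, r / 2, r / 2 + 1);
  }
}
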
@ -249,8 +214,8 @@ struct equi {
const htunit *bt = hta.getbucket(--r,t.bucketid);
const u32 size = 1 << r;
u32 *indices1 = indices + size;
listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
listindices(r, bt[t.slotid0 * hta.slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * hta.slotsize(r)].attr, indices1);
if (*indices > *indices1) {
for (u32 i=0; i < size; i++) {
const u32 tmp = indices[i];
@ -292,7 +257,7 @@ struct equi {
printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE);
#endif
}
printf(" %ld MB\n", hta.alloced >> 20);
printf("\n");
#endif
}
@ -300,65 +265,47 @@ struct equi {
htalloc hta;
u32 prevhtunits;
u32 nexthtunits;
u32 prevslotunits;
u32 nextslotunits;
u32 dunits;
u32 prevbo;
u32 nextbo;
htunit *buck;
htunit *hashbase;
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), prevslotunits(0), dunits(0) {
u32 nexthashbytes = hashsize(r);
nexthtunits = htunits(nexthashbytes);
nextslotunits = 1 + htunits(hashsize(r&1));
prevbo = 0;
nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
if (r) {
u32 prevhashbytes = hashsize(r-1);
prevhtunits = htunits(prevhashbytes);
prevslotunits = 1 + htunits(hashsize((r-1)&1));
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
dunits = prevhtunits - nexthtunits;
}
#ifdef JOINHT
nexthtunits++;
prevhtunits++;
#endif
}
void setbucket(u32 r, u32 bid) {
buck = hta.getbucket(r, bid);
#ifdef JOINHT
hashbase = buck + 1;
#else
hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits;
#endif
}
u32 getxhash(const u32 slot, const htunit *hash) const {
#ifdef XWITHASH
return hash->bytes[prevbo] & 0xf;
#elif defined JOINHT
return buck[slot * prevhtunits].attr.xhash;
#else
return buck[slot].attr.xhash;
#endif
}
u32 prevhashunits() const {
#ifdef JOINHT
return prevhtunits - 1;
#else
return prevhtunits;
return buck[slot * prevslotunits].attr.xhash;
#endif
}
bool equal(const htunit *hash0, const htunit *hash1) const {
return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
return hash0[prevhtunits-1].hash == hash1[prevhtunits-1].hash;
}
htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
htunit *buck = hta.getbucket(r,bid);
#ifdef JOINHT
htunit *slotree = buck + slot * nexthtunits;
htunit *slotree = buck + slot * nextslotunits;
slotree->attr = t;
return slotree + 1;
#else
buck[slot].attr = t;
return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits;
#endif
}
};
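
For the constructor above, a minimal standalone sketch (not part of the commit) of the quantities it derives per round: prevslotunits is the slot stride in the shared heap, prevhtunits the number of htunits actually holding round r-1's hash, and prevbo the byte offset of that hash inside its htunits. The hashbytes[] table and the 4-byte htunit width are hypothetical stand-ins for hashsize() and sizeof(htunit).

#include <cstdio>

int main() {
  const int WK = 9, HTUNIT = 4;                  // assumed htunit width in bytes
  const int hashbytes[WK] = { 23, 20, 18, 15, 13, 10, 8, 5, 3 };  // stand-in for hashsize(r)
  for (int r = 1; r < WK; r++) {
    const int prevhtunits   = (hashbytes[r-1] + HTUNIT - 1) / HTUNIT;
    const int prevslotunits = 1 + (hashbytes[(r-1) & 1] + HTUNIT - 1) / HTUNIT;
    const int prevbo        = prevhtunits * HTUNIT - hashbytes[r-1];         // 0-3
    const int nexthtunits   = (hashbytes[r] + HTUNIT - 1) / HTUNIT;
    printf("round %d: prevhtunits %d, prevslotunits %d, prevbo %d, dunits %d\n",
           r, prevhtunits, prevslotunits, prevbo, prevhtunits - nexthtunits);
  }
}
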
@ -467,14 +414,14 @@ struct equi {
htl.setbucket(r-1, bucketid);
u32 bsize = getnslots(r-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1))) {
xfull++;
continue;
}
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
if (htl.equal(hash0, hash1)) {
hfull++;
continue;
@ -511,7 +458,7 @@ struct equi {
xort.xhash = xhash;
#endif
htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
for (u32 i=htl.dunits; i < htl.prevhtunits; i++)
xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
}
}
@ -526,12 +473,12 @@ struct equi {
htl.setbucket(WK-1, bucketid);
u32 bsize = getnslots(WK-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
if (htl.equal(hash0, hash1)) {
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;