From f06ff12ed518ea834fef2502a910ef78549d178c Mon Sep 17 00:00:00 2001 From: tromp Date: Sun, 16 Oct 2016 18:49:03 -0400 Subject: [PATCH] obsolete faster by speeding up low-mem versions --- .gitignore | 11 ++++ LICENSE.txt | 4 +- Makefile | 20 +++----- blake2b.cu | 2 + equi_miner.cpp | 3 +- equi_miner.h | 137 +++++++++++++++---------------------------------- 6 files changed, 65 insertions(+), 112 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bfd7593 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +equi +equi1 +equi1g +faster +faster1 +equi965 +equi1445 +eqcuda +eqcuda1445 +feqcuda +verify diff --git a/LICENSE.txt b/LICENSE.txt index c8f4269..81edbe8 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -3,9 +3,7 @@ The MIT License (MIT) Copyright (c) 2016 John Tromp Permission is hereby granted, free of charge, to any person obtaining a copy -of this software, EXCEPT FOR blake2b.cu WHICH ORIGINATES FROM -https://github.com/tpruvot/ccminer/blob/windows/sia/sia.cu, -and associated documentation files (the "Software"), to deal +of this software, and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is diff --git a/Makefile b/Makefile index cd5fc23..9026d75 100644 --- a/Makefile +++ b/Makefile @@ -2,23 +2,17 @@ OPT = -O3 FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread GPP = g++ -march=native -m64 -std=c++11 $(FLAGS) -all: equi equi1 faster faster1 verify test spark +all: equi equi1 verify test spark equi: equi.h equi_miner.h equi_miner.cpp Makefile $(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi equi1: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1 + $(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1 equi1g: equi.h equi_miner.h equi_miner.cpp Makefile g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g -faster: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster - -faster1: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1 - equi965: equi.h equi_miner.h equi_miner.cpp Makefile $(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965 @@ -37,14 +31,14 @@ feqcuda: equi_miner.cu equi.h blake2b.cu Makefile verify: equi.h equi.c Makefile g++ -g equi.c blake/blake2b.cpp -o verify -bench: equi - time for i in {0..9}; do ./faster -n $$i; done +bench: equi1 + time ./equi1 -n 1000 -r 100 test: equi verify Makefile time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0 -spark: equi1 - time ./equi1 +spark: equi1g + time ./equi1g clean: - rm equi equi1 equi1g faster faster1 equi965 equi1445 eqcuda eqcuda1445 feqcuda verify + rm equi equi1 equi1g equi965 equi1445 eqcuda eqcuda1445 feqcuda verify diff --git a/blake2b.cu b/blake2b.cu index b7a647c..c1b7922 100644 --- a/blake2b.cu +++ b/blake2b.cu @@ -1,5 +1,7 @@ // Blake2-B CUDA Implementation // tpruvot@github July 2016 +// permission granted to use under MIT license +// modified for use in Zcash by John Tromp September 2016 /** * uint2 direct ops by c++ operator definitions diff --git a/equi_miner.cpp b/equi_miner.cpp index 9539304..27786dc 100644 --- a/equi_miner.cpp +++ b/equi_miner.cpp @@ -43,10 +43,11 @@ int main(int argc, char **argv) { printf("Looking for wagner-tree on (\"%s\",%d", header, nonce); if (range > 1) printf("-%d", nonce+range-1); - printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads); + printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads); thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx)); assert(threads); equi eq(nthreads); + printf("Using %dMB of memory\n", eq.hta.alloced >> 20); u32 sumnsols = 0; for (int r = 0; r < range; r++) { eq.setnonce(header, nonce+r); diff --git a/equi_miner.h b/equi_miner.h index 471f621..1ea846f 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -109,87 +109,52 @@ u32 htunits(u32 bytes) { return (bytes + sizeof(htunit) - 1) / sizeof(htunit); } -#ifdef JOINHT -u32 slotsize(const u32 r) { - return 1 + htunits(hashsize(r)); -} -// size (in htunits) of bucket in round 0 <= r < WK -u32 bucketsize(const u32 r) { - return NSLOTS * slotsize(r); -} -#else -u32 slotsize(const u32 r) { - return 1; -} -#endif - // manages hash and tree data struct htalloc { -// Defining JOINHT joins each tree with its corresponding hash, -// so they may share a cache line. This gives a small speed -// advantage but comes at the cost of a big memory increase -// as hash-space can no longer be reclaimed -#ifdef JOINHT htunit *trees[WK]; -#else - bucket *trees[WK]; - htunit *hashes[WK]; -#endif - u64 alloced; + u32 alloced; htalloc() { alloced = 0; } void alloctrees() { -#ifdef JOINHT - for (int r=0; r= 16); // ensures hashes shorten by 1 unit every 2 digits - u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1)); - digit *heap = (digit *)alloc(1+units0+units1+1, sizeof(digit)); - for (int r=0; r *indices1) { for (u32 i=0; i < size; i++) { const u32 tmp = indices[i]; @@ -292,7 +257,7 @@ struct equi { printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE); #endif } - printf(" %ld MB\n", hta.alloced >> 20); + printf("\n"); #endif } @@ -300,65 +265,47 @@ struct equi { htalloc hta; u32 prevhtunits; u32 nexthtunits; + u32 prevslotunits; + u32 nextslotunits; u32 dunits; u32 prevbo; u32 nextbo; htunit *buck; htunit *hashbase; - htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) { + htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), prevslotunits(0), dunits(0) { u32 nexthashbytes = hashsize(r); nexthtunits = htunits(nexthashbytes); + nextslotunits = 1 + htunits(hashsize(r&1)); prevbo = 0; nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3 if (r) { u32 prevhashbytes = hashsize(r-1); prevhtunits = htunits(prevhashbytes); + prevslotunits = 1 + htunits(hashsize((r-1)&1)); prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3 dunits = prevhtunits - nexthtunits; } -#ifdef JOINHT - nexthtunits++; - prevhtunits++; -#endif } void setbucket(u32 r, u32 bid) { buck = hta.getbucket(r, bid); -#ifdef JOINHT hashbase = buck + 1; -#else - hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits; -#endif } u32 getxhash(const u32 slot, const htunit *hash) const { #ifdef XWITHASH return hash->bytes[prevbo] & 0xf; -#elif defined JOINHT - return buck[slot * prevhtunits].attr.xhash; #else - return buck[slot].attr.xhash; -#endif - } - u32 prevhashunits() const { -#ifdef JOINHT - return prevhtunits - 1; -#else - return prevhtunits; + return buck[slot * prevslotunits].attr.xhash; #endif } bool equal(const htunit *hash0, const htunit *hash1) const { - return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash; + return hash0[prevhtunits-1].hash == hash1[prevhtunits-1].hash; } htunit *addtree(u32 r, tree t, u32 bid, u32 slot) { htunit *buck = hta.getbucket(r,bid); -#ifdef JOINHT - htunit *slotree = buck + slot * nexthtunits; + htunit *slotree = buck + slot * nextslotunits; slotree->attr = t; return slotree + 1; -#else - buck[slot].attr = t; - return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits; -#endif } }; @@ -467,14 +414,14 @@ struct equi { htl.setbucket(r-1, bucketid); u32 bsize = getnslots(r-1, bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits; + const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits; if (!cd.addslot(s1, htl.getxhash(s1, hash1))) { xfull++; continue; } for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); - const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits; + const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits; if (htl.equal(hash0, hash1)) { hfull++; continue; @@ -511,7 +458,7 @@ struct equi { xort.xhash = xhash; #endif htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot); - for (u32 i=htl.dunits; i < htl.prevhashunits(); i++) + for (u32 i=htl.dunits; i < htl.prevhtunits; i++) xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash; } } @@ -526,12 +473,12 @@ struct equi { htl.setbucket(WK-1, bucketid); u32 bsize = getnslots(WK-1, bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits; + const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits; if (!cd.addslot(s1, htl.getxhash(s1, hash1))) continue; for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); - const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits; + const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits; if (htl.equal(hash0, hash1)) { tree xort; xort.bucketid = bucketid; xort.slotid0 = s0; xort.slotid1 = s1;