From 33fed1c9d5e03881f1ec73366e5d2fca124d3169 Mon Sep 17 00:00:00 2001 From: John Tromp Date: Fri, 18 Nov 2016 23:50:23 -0500 Subject: [PATCH] change equi_miner to 2^10 buckets; obsolete dev_miner --- Makefile | 39 +++++---------- equi_dev_miner.h | 4 +- equi_miner.cpp | 3 ++ equi_miner.h | 122 +++++++++++++++++++++++++++++++++-------------- 4 files changed, 105 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index b7c2a18..fdce171 100644 --- a/Makefile +++ b/Makefile @@ -5,19 +5,19 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS) all: equi equi1 verify test spark test1445 equi: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi + $(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi equi1: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1 + $(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1 equix4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DNBLAKES=4 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4 + $(GPP) -mavx2 -DNBLAKES=4 -DATOMIC equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix4 equix41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DNBLAKES=4 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 + $(GPP) -mavx2 -DNBLAKES=4 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 equix81: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -mavx2 -DNBLAKES=8 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 + $(GPP) -mavx2 -DNBLAKES=8 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equix41 equi1g: equi.h equi_miner.h equi_miner.cpp Makefile g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g @@ -34,32 +34,17 @@ eq1445x4: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile eq1445x41: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile $(GPP) -mavx2 -DNBLAKES=4 -DRESTBITS=4 -DWN=144 -DWK=5 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eq1445x41 -dev: equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile - $(GPP) -mavx2 -DATOMIC dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev +eqasm: equi.h equi_miner.h equi_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile + $(GPP) -mavx2 -DASM_BLAKE -DATOMIC equi_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o eqasm -dev1: equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile - $(GPP) -mavx2 dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o dev1 - -hash1: equi.h dev_miner.h dev_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile - $(GPP) -DHASHONLY dev_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o hash1 - -cantor: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o cantor - -cantor1: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -DCANTOR -DRESTBITS=10 equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o cantor1 - -equidev1: equi.h equi_dev_miner.h equi_dev_miner.cpp blake2-avx2/blake2bip.c Makefile - $(GPP) -DCANTOR -DRESTBITS=10 -DUNROLL equi_dev_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equidev1 +eqasm1: equi.h equi_miner.h equi_miner.cpp blake2-asm/asm/zcblake2_avx2.o Makefile + $(GPP) -mavx2 -DASM_BLAKE equi_miner.cpp blake/blake2b.cpp blake2-asm/asm/zcblake2_avx2.o -o eqasm1 eqcuda: equi_miner.cu equi.h blake2b.cu Makefile - nvcc -DXINTREE -DUNROLL -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcuda + nvcc -DXINTREE -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcuda eqcudah: equi_miner.cu equi.h blake2b.cu Makefile - nvcc -DHIST -DXINTREE -DUNROLL -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcudah - -devcuda: dev_miner.cu equi.h blake2b.cu Makefile - nvcc -DXINTREE -DUNROLL -arch sm_35 dev_miner.cu blake/blake2b.cpp -o devcuda + nvcc -DHIST -DXINTREE -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcudah eqcuda1445: equi_miner.cu equi.h blake2b.cu Makefile nvcc -DWN=144 -DWK=5 -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcuda1445 @@ -89,4 +74,4 @@ blake2-asm/asm/zcblake2_avx2.o: make -C blake2-asm clean: - make -C blake2b clean && rm -f dev dev1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify + make -C blake2b clean && rm -f eqasm eqasm1 equi equi1 equix4 equix41 equi1g eq1445 eq14451 eq1445x4 eq1445x41 eqcuda eqcuda1445 verify diff --git a/equi_dev_miner.h b/equi_dev_miner.h index 0721b50..e5af420 100644 --- a/equi_dev_miner.h +++ b/equi_dev_miner.h @@ -62,7 +62,8 @@ typedef u32 au32; #endif #ifndef RESTBITS -#define RESTBITS 8 +#define CANTOR +#define RESTBITS 10 #endif // 2_log of number of buckets @@ -78,6 +79,7 @@ typedef u32 au32; // an expected size of at least 512 has such relatively small // standard deviation that we can reduce capacity with negligible discarding // this value reduces (200,9) memory to under 144MB +// must be under sqrt(2)/2 with -DCANTOR #define SAVEMEM 9/14 #endif #endif diff --git a/equi_miner.cpp b/equi_miner.cpp index 0613cee..6748833 100644 --- a/equi_miner.cpp +++ b/equi_miner.cpp @@ -63,6 +63,9 @@ int main(int argc, char **argv) { assert(threads); equi eq(nthreads); printf("Using %dMB of memory and %d-way blake2b\n", 1 + eq.hta.alloced / 0x100000, NBLAKES); +#ifdef ASM_BLAKE + printf("Using xenoncat's assembly blake code\n"); +#endif u32 sumnsols = 0; char headernonce[HEADERNONCELEN]; u32 hdrlen = strlen(header); diff --git a/equi_miner.h b/equi_miner.h index c76a11c..0617c90 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -41,6 +41,38 @@ #include "blake2-avx2/blake2bip.h" +#ifdef ASM_BLAKE +#ifdef NBLAKES +#if NBLAKES != 4 +#error only 4-way assembly blake +#endif +#else +#define NBLAKES 4 +#endif +#ifdef __cplusplus +extern "C" { +#endif +void Blake2PrepareMidstate4(void *midstate, uchar *input); +#ifdef __cplusplus +} +#endif +//midstate: 256 bytes of buffer for output midstate, aligned by 32 +//input: 140 bytes header, preferably aligned by 8 + +#ifdef __cplusplus +extern "C" { +#endif +void Blake2Run4(uchar *hashout, void *midstate, u32 indexctr); +#ifdef __cplusplus +} +#endif +struct blake_state { + alignas(32) uchar state[256]; +}; +#else +typedef blake2b_state blake_state; +#endif + #if defined __builtin_bswap32 && defined __LITTLE_ENDIAN #undef htobe32 #define htobe32(x) __builtin_bswap32(x) @@ -62,7 +94,8 @@ typedef u32 au32; #endif #ifndef RESTBITS -#define RESTBITS 8 +#define CANTOR +#define RESTBITS 10 #endif // 2_log of number of buckets @@ -78,6 +111,7 @@ typedef u32 au32; // an expected size of at least 512 has such relatively small // standard deviation that we can reduce capacity with negligible discarding // this value reduces (200,9) memory to under 144MB +// must be under sqrt(2)/2 with -DCANTOR #define SAVEMEM 9/14 #endif #endif @@ -107,6 +141,8 @@ struct tree { static const u32 CANTORBITS = 2*SLOTBITS-2; static const u32 CANTORMASK = (1<word) >> 20 & 0xff); + cd.addslot(s1, htobe32(slot1->word) >> 20 & 0x3ff); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; @@ -735,7 +781,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; hfull++; continue; } - u32 xorbucketid = htobe32(slot0->word ^ slot1->word) >> 8 & BUCKMASK; + u32 xorbucketid = htobe32(slot0->word ^ slot1->word) >> 10 & BUCKMASK; const u32 xorslot = getslot1(xorbucketid); if (xorslot >= NSLOTS) { bfull++; @@ -760,7 +806,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots1(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, slot1->bytes[3]); + cd.addslot(s1, htobe32(slot1->word) & 0x3ff); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; @@ -768,14 +814,15 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; hfull++; continue; } - u32 xorbucketid = htobe32(slot0[1].word ^ slot1[1].word) >> 20; + u32 xor1 = slot0[1].word ^ slot1[1].word; + u32 xorbucketid = htobe32(xor1) >> 22; const u32 xorslot = getslot0(xorbucketid); if (xorslot >= NSLOTS) { bfull++; continue; } htunit *xs = heaps.heap0[xorbucketid][xorslot]; - xs++->word = slot0[1].word ^ slot1[1].word; + xs++->word = xor1; u64 *x = (u64 *)xs, *x0 = (u64 *)slot0, *x1 = (u64 *)slot1; *x++ = x0[1] ^ x1[1]; *x++ = x0[2] ^ x1[2]; @@ -793,7 +840,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots0(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff); + cd.addslot(s1, htobe32(slot1->word) >> 12 & 0x3ff); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; @@ -801,14 +848,16 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; hfull++; continue; } - u32 xorbucketid = htobe32(slot0[0].word ^ slot1[0].word) & BUCKMASK; + u32 xor0 = slot0->word ^ slot1->word; + u32 xorbucketid = htobe32(xor0) >> 2 & BUCKMASK; const u32 xorslot = getslot1(xorbucketid); if (xorslot >= NSLOTS) { bfull++; continue; } - u64 *x = (u64 *)heaps.heap1[xorbucketid][xorslot]; - u64 *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); + htunit *xs = heaps.heap1[xorbucketid][xorslot]; + xs++->word = xor0; + u64 *x = (u64 *)xs, *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); *x++ = x0[0] ^ x1[0]; *x++ = x0[1] ^ x1[1]; ((htunit *)x)->tag = tree(bucketid, s0, s1); @@ -825,22 +874,22 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots1(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, slot1->bytes[0]); + cd.addslot(s1, (slot1->bytes[3] & 0x3) << 8 | slot1->bytes[4]); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; - if (slot0[3].word == slot1[3].word) { + if (slot0[4].word == slot1[4].word) { hfull++; continue; } - u32 xorbucketid = htobe32(slot0[0].word ^ slot1[0].word) >> 12 & BUCKMASK; + u32 xorbucketid = htobe32(slot0[1].word ^ slot1[1].word) >> 14 & BUCKMASK; const u32 xorslot = getslot0(xorbucketid); if (xorslot >= NSLOTS) { bfull++; continue; } u64 *x = (u64 *)heaps.heap0[xorbucketid][xorslot]; - u64 *x0 = (u64 *)slot0, *x1 = (u64 *)slot1; + u64 *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); *x++ = x0[0] ^ x1[0]; *x++ = x0[1] ^ x1[1]; ((htunit *)x)->tag = tree(bucketid, s0, s1); @@ -857,7 +906,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots0(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff); + cd.addslot(s1, htobe32(slot1->word) >> 4 & 0x3ff); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; @@ -867,7 +916,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; } u32 xor1 = slot0[1].word ^ slot1[1].word; u32 xorbucketid = (((u32)(slot0->bytes[3] ^ slot1->bytes[3]) & 0xf) - << 8) | (xor1 & 0xff); + << 6) | (xor1 >> 2 & 0x3f); const u32 xorslot = getslot1(xorbucketid); if (xorslot >= NSLOTS) { bfull++; @@ -891,7 +940,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots1(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, slot1->bytes[1]); + cd.addslot(s1, htobe32(slot1->word) >> 16 & 0x3ff); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; @@ -899,14 +948,15 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; hfull++; continue; } - u32 xorbucketid = htobe32(slot0[0].word ^ slot1[0].word) >> 4 & BUCKMASK; + u32 xor0 = slot0->word ^ slot1->word; + u32 xorbucketid = htobe32(xor0) >> 6 & BUCKMASK; const u32 xorslot = getslot0(xorbucketid); if (xorslot >= NSLOTS) { bfull++; continue; } htunit *xs = heaps.heap0[xorbucketid][xorslot]; - xs++->word = slot0[0].word ^ slot1[0].word; + xs++->word = xor0; u64 *x = (u64 *)xs, *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); *x++ = x0[0] ^ x1[0]; ((htunit *)x)->tag = tree(bucketid, s0, s1); @@ -923,24 +973,26 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots0(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4); + cd.addslot(s1, (slot1->bytes[3] & 0x3f) << 4 | slot1->bytes[4] >> 4); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; - if (slot0[2].word == slot1[2].word) { + u32 xor2 = slot0[2].word ^ slot1[2].word; + if (!xor2) { hfull++; continue; } - u32 xorbucketid = htobe32(slot0[1].word ^ slot1[1].word) >> 16 & BUCKMASK; + u32 xor1 = slot0[1].word ^ slot1[1].word; + u32 xorbucketid = htobe32(xor1) >> 18 & BUCKMASK; const u32 xorslot = getslot1(xorbucketid); if (xorslot >= NSLOTS) { bfull++; continue; } - u64 *x = (u64 *)heaps.heap1[xorbucketid][xorslot]; - u64 *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); - *x++ = x0[0] ^ x1[0]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); + htunit *xs = heaps.heap1[xorbucketid][xorslot]; + xs++->word = xor1; + xs++->word = xor2; + xs->tag = tree(bucketid, s0, s1); } } } @@ -954,7 +1006,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; u32 bsize = getnslots1(bucketid); for (u32 s1 = 0; s1 < bsize; s1++) { const htunit *slot1 = buck[s1]; - cd.addslot(s1, slot1->bytes[2]); + cd.addslot(s1, htobe32(slot1->word) >> 8 & 0x3ff); for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; @@ -963,8 +1015,8 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; hfull++; continue; } - u32 xorbucketid = ((u32)(slot0->bytes[3] ^ slot1->bytes[3]) << 4) - | (xor1 >> 4 & 0xf); + u32 xorbucketid = ((u32)(slot0->bytes[3] ^ slot1->bytes[3]) << 2) + | (xor1 >> 6 & 0x3); const u32 xorslot = getslot0(xorbucketid); if (xorslot >= NSLOTS) { bfull++; @@ -1029,7 +1081,7 @@ void *worker(void *vp) { barrier(&eq->barry); if (tp->id == 0) eq->showbsizes(0); barrier(&eq->barry); -#if WN == 200 && WK == 9 && RESTBITS == 8 && defined UNROLL +#if WN == 200 && WK == 9 && RESTBITS == 10 eq->digit1(tp->id); barrier(&eq->barry); if (tp->id == 0) eq->showbsizes(1);