change 2nd stage bucketsort to slot linking
This commit is contained in:
parent
7f5063ba72
commit
fc72754ded
8
Makefile
8
Makefile
|
@ -4,11 +4,11 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
|
|||
|
||||
all: equi equi1 verify test spark test1445
|
||||
|
||||
equi: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi
|
||||
equi: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
|
||||
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi
|
||||
|
||||
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
|
||||
equi1: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
|
||||
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi1
|
||||
|
||||
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
|
||||
g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
|
||||
|
|
|
@ -298,32 +298,23 @@ ALIGN(64) static const uint32_t indices[12][16] = {
|
|||
} while(0)
|
||||
|
||||
void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
|
||||
__m256i v[16], s[8], iv[8], w[16]; // 16 * 32, 8 * 32
|
||||
union {
|
||||
__m256i v;
|
||||
uint64_t w[4];
|
||||
} counter;
|
||||
uint32_t b;
|
||||
size_t inlen;
|
||||
int i, r;
|
||||
__m256i v[16], s[8], iv[8], w[16], counter, flag;
|
||||
uint32_t b, i, r;
|
||||
|
||||
ALIGN(64) uint8_t buffer[4 * BLAKE2B_BLOCKBYTES]; // 4 * 128
|
||||
memset(buffer, 0, 4 * BLAKE2B_BLOCKBYTES); // zero whole buffer
|
||||
memset(buffer, 0, 4 * BLAKE2B_BLOCKBYTES);
|
||||
for (i = 0; i < 4; i++) {
|
||||
memcpy(buffer+128*i, S->buf, S->buflen);
|
||||
b = htole32(4 * blockidx + i);
|
||||
memcpy(buffer+128*i + S->buflen, &b, 4);
|
||||
}
|
||||
inlen = S->buflen + 4;
|
||||
|
||||
for(i = 0; i < 8; ++i) {
|
||||
v[i] = _mm256_set1_epi64x(S->h[i]); // initialize all 256/64 = 4 lanes
|
||||
v[i] = _mm256_set1_epi64x(S->h[i]);
|
||||
}
|
||||
|
||||
__m256i x4 = _mm256_set1_epi64x(128+inlen);
|
||||
__m256i f0 = _mm256_set1_epi64x(~0);
|
||||
|
||||
counter.v = _mm256_add_epi64(counter.v, x4);
|
||||
counter = _mm256_set1_epi64x(128 + S->buflen + 4);
|
||||
flag = _mm256_set1_epi64x(~0);
|
||||
|
||||
for(i = 0; i < 8; ++i) {
|
||||
iv[i] = v[i];
|
||||
|
@ -332,9 +323,9 @@ void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
|
|||
v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]);
|
||||
v[10] = _mm256_set1_epi64x(blake2b_IV[2]);
|
||||
v[11] = _mm256_set1_epi64x(blake2b_IV[3]);
|
||||
v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter.v);
|
||||
v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter);
|
||||
v[13] = _mm256_set1_epi64x(blake2b_IV[5]);
|
||||
v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), f0);
|
||||
v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag);
|
||||
v[15] = _mm256_set1_epi64x(blake2b_IV[7]);
|
||||
BLAKE2B_LOADMSG_V4(w, buffer);
|
||||
for(r = 0; r < 12; ++r) {
|
||||
|
@ -346,7 +337,5 @@ void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
|
|||
|
||||
BLAKE2B_UNPACK_STATE_V4(s, v);
|
||||
|
||||
// big loop obsoleted as it would if(!inlen) break
|
||||
|
||||
memcpy(out, s, 256); // instead of return blake2b_root(out, (void *)s);
|
||||
memcpy(out, s, 256);
|
||||
}
|
||||
|
|
93
dev_miner.h
93
dev_miner.h
|
@ -101,8 +101,6 @@ static const u32 SLOTRANGE = 1<<SLOTBITS;
|
|||
static const u32 SLOTMSB = 1<<(SLOTBITS-1);
|
||||
// number of slots per bucket
|
||||
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
|
||||
// number of per-xhash slots
|
||||
static const u32 XFULL = 16;
|
||||
// SLOTBITS mask
|
||||
static const u32 SLOTMASK = SLOTRANGE-1;
|
||||
// number of possible values of xhash (rest of n) bits
|
||||
|
@ -239,7 +237,6 @@ struct equi {
|
|||
proof *sols;
|
||||
au32 nsols;
|
||||
u32 nthreads;
|
||||
u32 xfull;
|
||||
u32 bfull;
|
||||
u32 hfull;
|
||||
pthread_barrier_t barry;
|
||||
|
@ -266,7 +263,7 @@ struct equi {
|
|||
Blake2PrepareMidstate4(midstate, alignheader);
|
||||
memcpy(&blake_ctx, midstate, 256);
|
||||
memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
|
||||
nsols = xfull = bfull = hfull = 0;
|
||||
nsols = bfull = hfull = 0;
|
||||
}
|
||||
u32 getslot0(const u32 bucketi) {
|
||||
#ifdef ATOMIC
|
||||
|
@ -405,8 +402,8 @@ struct equi {
|
|||
}
|
||||
#endif
|
||||
void showbsizes(u32 r) {
|
||||
printf(" x%d b%d h%d\n", xfull, bfull, hfull);
|
||||
xfull = bfull = hfull = 0;
|
||||
printf(" b%d h%d\n", bfull, hfull);
|
||||
bfull = hfull = 0;
|
||||
#if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
|
||||
u32 binsizes[65];
|
||||
memset(binsizes, 0, 65 * sizeof(u32));
|
||||
|
@ -491,11 +488,10 @@ struct equi {
|
|||
#else
|
||||
typedef u16 xslot;
|
||||
#endif
|
||||
xslot nxhashslots[NRESTS];
|
||||
xslot xhashslots[NRESTS][XFULL];
|
||||
xslot *xx;
|
||||
u32 n0;
|
||||
u32 n1;
|
||||
static const xslot xnil = ~0;
|
||||
xslot xhashslots[NRESTS];
|
||||
xslot nextxhashslot[NSLOTS];
|
||||
xslot nextslot;
|
||||
#endif
|
||||
u32 s0;
|
||||
|
||||
|
@ -503,40 +499,36 @@ struct equi {
|
|||
#ifdef XBITMAP
|
||||
memset(xhashmap, 0, NRESTS * sizeof(u64));
|
||||
#else
|
||||
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
|
||||
memset(xhashslots, xnil, NRESTS * sizeof(xslot));
|
||||
memset(nextxhashslot, xnil, NSLOTS * sizeof(xslot));
|
||||
#endif
|
||||
}
|
||||
bool addslot(u32 s1, u32 xh) {
|
||||
void addslot(u32 s1, u32 xh) {
|
||||
#ifdef XBITMAP
|
||||
xmap = xhashmap[xh];
|
||||
xhashmap[xh] |= (u64)1 << s1;
|
||||
s0 = -1;
|
||||
return true;
|
||||
#else
|
||||
n1 = (u32)nxhashslots[xh]++;
|
||||
if (n1 >= XFULL)
|
||||
return false;
|
||||
xx = xhashslots[xh];
|
||||
xx[n1] = s1;
|
||||
n0 = 0;
|
||||
return true;
|
||||
nextslot = xhashslots[xh];
|
||||
nextxhashslot[s1] = nextslot;
|
||||
xhashslots[xh] = s1;
|
||||
#endif
|
||||
}
|
||||
bool nextcollision() const {
|
||||
#ifdef XBITMAP
|
||||
return xmap != 0;
|
||||
#else
|
||||
return n0 < n1;
|
||||
return nextslot != xnil;
|
||||
#endif
|
||||
}
|
||||
u32 slot() {
|
||||
#ifdef XBITMAP
|
||||
const u32 ffs = __builtin_ffsll(xmap);
|
||||
s0 += ffs; xmap >>= ffs;
|
||||
return s0;
|
||||
#else
|
||||
return (u32)xx[n0++];
|
||||
nextslot = nextxhashslot[s0 = nextslot];
|
||||
#endif
|
||||
return s0;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -578,10 +570,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htl.getxhash0(slot1))) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htl.getxhash0(slot1));
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -627,10 +616,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htl.getxhash1(slot1))) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htl.getxhash1(slot1));
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -676,10 +662,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -712,10 +695,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[3])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[3]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -748,10 +728,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -783,10 +760,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[0])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[0]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -818,10 +792,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -855,10 +826,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[1])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[1]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -890,10 +858,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -924,10 +889,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[2])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[2]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -961,8 +923,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htl.getxhash0(slot1))) // assume WK odd
|
||||
continue;
|
||||
cd.addslot(s1, htl.getxhash0(slot1)); // assume WK odd
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE
|
||||
|
|
|
@ -70,7 +70,7 @@ typedef u32 au32;
|
|||
#define BLAKESINPARALLEL 2
|
||||
#else
|
||||
#define BLAKESINPARALLEL 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// number of buckets
|
||||
static const u32 NBUCKETS = 1<<BUCKBITS;
|
||||
|
|
|
@ -47,7 +47,12 @@ int main(int argc, char **argv) {
|
|||
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
|
||||
assert(threads);
|
||||
equi eq(nthreads);
|
||||
printf("Using %dMB of memory\n", 1 + eq.hta.alloced / 0x100000);
|
||||
printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
|
||||
#ifdef __AVX2__
|
||||
printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
|
||||
#else
|
||||
printf("; no AVX2 detected\n");
|
||||
#endif
|
||||
u32 sumnsols = 0;
|
||||
char headernonce[HEADERNONCELEN];
|
||||
u32 hdrlen = strlen(header);
|
||||
|
|
|
@ -45,8 +45,6 @@ static const u32 SLOTBITS = RESTBITS+1+1;
|
|||
static const u32 SLOTRANGE = 1<<SLOTBITS;
|
||||
// number of slots per bucket
|
||||
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
|
||||
// number of per-xhash slots
|
||||
static const u32 XFULL = 16;
|
||||
// SLOTBITS mask
|
||||
static const u32 SLOTMASK = SLOTRANGE-1;
|
||||
// number of possible values of xhash (rest of n) bits
|
||||
|
@ -399,11 +397,10 @@ struct equi {
|
|||
#else
|
||||
typedef u16 xslot;
|
||||
#endif
|
||||
xslot nxhashslots[NRESTS];
|
||||
xslot xhashslots[NRESTS][XFULL];
|
||||
xslot *xx;
|
||||
u32 n0;
|
||||
u32 n1;
|
||||
static const xslot xnil = ~0;
|
||||
xslot xhashslots[NRESTS];
|
||||
xslot nextxhashslot[NSLOTS];
|
||||
xslot nextslot;
|
||||
#endif
|
||||
u32 s0;
|
||||
|
||||
|
@ -411,7 +408,8 @@ struct equi {
|
|||
#ifdef XBITMAP
|
||||
memset(xhashmap, 0, NRESTS * sizeof(u64));
|
||||
#else
|
||||
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
|
||||
memset(xhashslots, xnil, NRESTS * sizeof(xslot));
|
||||
memset(nextxhashslot, xnil, NSLOTS * sizeof(xslot));
|
||||
#endif
|
||||
}
|
||||
__device__ bool addslot(u32 s1, u32 xh) {
|
||||
|
@ -421,12 +419,9 @@ struct equi {
|
|||
s0 = ~0;
|
||||
return true;
|
||||
#else
|
||||
n1 = (u32)nxhashslots[xh]++;
|
||||
if (n1 >= XFULL)
|
||||
return false;
|
||||
xx = xhashslots[xh];
|
||||
xx[n1] = s1;
|
||||
n0 = 0;
|
||||
nextslot = xhashslots[xh];
|
||||
nextxhashslot[s1] = nextslot;
|
||||
xhashslots[xh] = s1;
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
@ -434,17 +429,17 @@ struct equi {
|
|||
#ifdef XBITMAP
|
||||
return xmap != 0;
|
||||
#else
|
||||
return n0 < n1;
|
||||
return nextslot != xnil;
|
||||
#endif
|
||||
}
|
||||
__device__ u32 slot() {
|
||||
#ifdef XBITMAP
|
||||
const u32 ffs = __ffsll(xmap);
|
||||
s0 += ffs; xmap >>= ffs;
|
||||
return s0;
|
||||
#else
|
||||
return (u32)xx[n0++];
|
||||
nextslot = nextxhashslot[s0 = nextslot];
|
||||
#endif
|
||||
return s0;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
|
150
equi_miner.h
150
equi_miner.h
|
@ -39,6 +39,8 @@
|
|||
#include <pthread.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "blake2-avx2/blake2bip.h"
|
||||
|
||||
#if defined __builtin_bswap32 && defined __LITTLE_ENDIAN
|
||||
#undef htobe32
|
||||
#define htobe32(x) __builtin_bswap32(x)
|
||||
|
@ -51,7 +53,7 @@
|
|||
typedef uint16_t u16;
|
||||
typedef uint64_t u64;
|
||||
|
||||
// rquired for avoiding multio-threading race conflicts
|
||||
// required for avoiding multio-threading race conflicts
|
||||
#ifdef ATOMIC
|
||||
#include <atomic>
|
||||
typedef std::atomic<u32> au32;
|
||||
|
@ -94,12 +96,8 @@ static const u32 SLOTMASK = SLOTRANGE-1;
|
|||
static const u32 SLOTMSB = 1<<(SLOTBITS-1);
|
||||
// number of slots per bucket
|
||||
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
|
||||
// number of per-xhash slots
|
||||
static const u32 XFULL = 16;
|
||||
// number of possible values of RESTBITS bits
|
||||
static const u32 NRESTS = 1<<RESTBITS;
|
||||
// number of blocks of hashes extracted from single 512 bit blake2b output
|
||||
static const u32 NBLOCKS = (NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE;
|
||||
// more than 8 solutions are rare (less than one in 100000 runs)
|
||||
static const u32 MAXSOLS = 8;
|
||||
|
||||
|
@ -276,7 +274,6 @@ struct equi {
|
|||
proof *sols;
|
||||
au32 nsols;
|
||||
u32 nthreads;
|
||||
u32 xfull;
|
||||
u32 bfull;
|
||||
u32 hfull;
|
||||
pthread_barrier_t barry;
|
||||
|
@ -298,7 +295,7 @@ struct equi {
|
|||
void setheadernonce(const char *headernonce, const u32 len) {
|
||||
setheader(&blake_ctx, headernonce);
|
||||
memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
|
||||
nsols = xfull = bfull = hfull = 0;
|
||||
nsols = bfull = hfull = 0;
|
||||
}
|
||||
u32 getslot0(const u32 bucketi) {
|
||||
#ifdef ATOMIC
|
||||
|
@ -437,8 +434,8 @@ struct equi {
|
|||
}
|
||||
#endif
|
||||
void showbsizes(u32 r) {
|
||||
printf(" x%d b%d h%d\n", xfull, bfull, hfull);
|
||||
xfull = bfull = hfull = 0;
|
||||
printf(" b%d h%d\n", bfull, hfull);
|
||||
bfull = hfull = 0;
|
||||
#if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
|
||||
u32 binsizes[65];
|
||||
memset(binsizes, 0, 65 * sizeof(u32));
|
||||
|
@ -523,11 +520,10 @@ struct equi {
|
|||
#else
|
||||
typedef u16 xslot;
|
||||
#endif
|
||||
xslot nxhashslots[NRESTS];
|
||||
xslot xhashslots[NRESTS][XFULL];
|
||||
xslot *xx;
|
||||
u32 n0;
|
||||
u32 n1;
|
||||
static const xslot xnil = ~0;
|
||||
xslot xhashslots[NRESTS];
|
||||
xslot nextxhashslot[NSLOTS];
|
||||
xslot nextslot;
|
||||
#endif
|
||||
u32 s0;
|
||||
|
||||
|
@ -535,75 +531,88 @@ struct equi {
|
|||
#ifdef XBITMAP
|
||||
memset(xhashmap, 0, NRESTS * sizeof(u64));
|
||||
#else
|
||||
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
|
||||
memset(xhashslots, xnil, NRESTS * sizeof(xslot));
|
||||
memset(nextxhashslot, xnil, NSLOTS * sizeof(xslot));
|
||||
#endif
|
||||
}
|
||||
bool addslot(u32 s1, u32 xh) {
|
||||
void addslot(u32 s1, u32 xh) {
|
||||
#ifdef XBITMAP
|
||||
xmap = xhashmap[xh];
|
||||
xhashmap[xh] |= (u64)1 << s1;
|
||||
s0 = -1;
|
||||
return true;
|
||||
#else
|
||||
n1 = (u32)nxhashslots[xh]++;
|
||||
if (n1 >= XFULL)
|
||||
return false;
|
||||
xx = xhashslots[xh];
|
||||
xx[n1] = s1;
|
||||
n0 = 0;
|
||||
return true;
|
||||
nextslot = xhashslots[xh];
|
||||
nextxhashslot[s1] = nextslot;
|
||||
xhashslots[xh] = s1;
|
||||
#endif
|
||||
}
|
||||
bool nextcollision() const {
|
||||
#ifdef XBITMAP
|
||||
return xmap != 0;
|
||||
#else
|
||||
return n0 < n1;
|
||||
return nextslot != xnil;
|
||||
#endif
|
||||
}
|
||||
u32 slot() {
|
||||
#ifdef XBITMAP
|
||||
const u32 ffs = __builtin_ffsll(xmap);
|
||||
s0 += ffs; xmap >>= ffs;
|
||||
return s0;
|
||||
#else
|
||||
return (u32)xx[n0++];
|
||||
nextslot = nextxhashslot[s0 = nextslot];
|
||||
#endif
|
||||
return s0;
|
||||
}
|
||||
};
|
||||
|
||||
#undef __AVX2__
|
||||
#ifdef __AVX2__
|
||||
static const u32 BLAKESINPARALLEL = 4;
|
||||
#else
|
||||
static const u32 BLAKESINPARALLEL = 1;
|
||||
#endif
|
||||
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
|
||||
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
|
||||
// number of blocks of parallel blake2b calls
|
||||
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
|
||||
|
||||
void digit0(const u32 id) {
|
||||
uchar hash[HASHOUT];
|
||||
blake2b_state state0 = blake_ctx;
|
||||
htlayout htl(this, 0);
|
||||
const u32 hashbytes = hashsize(0);
|
||||
uchar hashes[BLAKESINPARALLEL * 64];
|
||||
blake2b_state state0 = blake_ctx;
|
||||
for (u32 block = id; block < NBLOCKS; block += nthreads) {
|
||||
#ifdef __AVX2__
|
||||
blake2bip_final(&state0, hashes, block);
|
||||
#else
|
||||
blake2b_state state = state0;
|
||||
u32 leb = htole32(block);
|
||||
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
|
||||
blake2b_final(&state, hash, HASHOUT);
|
||||
for (u32 i = 0; i<HASHESPERBLAKE; i++) {
|
||||
const uchar *ph = hash + i * WN/8;
|
||||
blake2b_final(&state, hashes, HASHOUT);
|
||||
#endif
|
||||
for (u32 i = 0; i<BLAKESINPARALLEL; i++) {
|
||||
for (u32 j = 0; j<HASHESPERBLAKE; j++) {
|
||||
const uchar *ph = hashes+ i * 64 + j * WN/8;
|
||||
#if BUCKBITS == 12 && RESTBITS == 8
|
||||
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
|
||||
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
|
||||
#elif BUCKBITS == 16 && RESTBITS == 4
|
||||
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
|
||||
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
|
||||
#elif BUCKBITS == 20 && RESTBITS == 4
|
||||
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
|
||||
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
|
||||
#elif BUCKBITS == 12 && RESTBITS == 4
|
||||
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
|
||||
const u32 xhash = ph[1] & 0xf;
|
||||
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
|
||||
const u32 xhash = ph[1] & 0xf;
|
||||
#else
|
||||
#error not implemented
|
||||
#endif
|
||||
const u32 slot = getslot0(bucketid);
|
||||
if (slot >= NSLOTS) {
|
||||
bfull++;
|
||||
continue;
|
||||
const u32 slot = getslot0(bucketid);
|
||||
if (slot >= NSLOTS) {
|
||||
bfull++;
|
||||
continue;
|
||||
}
|
||||
htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits;
|
||||
memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
|
||||
s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j);
|
||||
}
|
||||
htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits;
|
||||
memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
|
||||
s->tag = tree(block * HASHESPERBLAKE + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -617,10 +626,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htl.getxhash0(slot1))) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htl.getxhash0(slot1));
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -666,10 +672,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htl.getxhash1(slot1))) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htl.getxhash1(slot1));
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -715,10 +718,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -751,10 +751,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[3])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[3]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -787,10 +784,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -822,10 +816,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[0])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[0]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -857,10 +848,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -894,10 +882,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[1])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[1]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -929,10 +914,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4)) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -963,10 +945,7 @@ struct equi {
|
|||
u32 bsize = getnslots1(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, slot1->bytes[2])) {
|
||||
xfull++;
|
||||
continue;
|
||||
}
|
||||
cd.addslot(s1, slot1->bytes[2]);
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
const htunit *slot0 = buck[s0];
|
||||
|
@ -1000,8 +979,7 @@ struct equi {
|
|||
u32 bsize = getnslots0(bucketid);
|
||||
for (u32 s1 = 0; s1 < bsize; s1++) {
|
||||
const htunit *slot1 = buck[s1];
|
||||
if (!cd.addslot(s1, htl.getxhash0(slot1))) // assume WK odd
|
||||
continue;
|
||||
cd.addslot(s1, htl.getxhash0(slot1)); // assume WK odd
|
||||
for (; cd.nextcollision(); ) {
|
||||
const u32 s0 = cd.slot();
|
||||
if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE
|
||||
|
|
Loading…
Reference in New Issue