obsolete faster by speeding up low-mem versions

tromp 2016-10-16 18:49:03 -04:00
parent c2a84f98b9
commit f06ff12ed5
6 changed files with 65 additions and 112 deletions

.gitignore

@ -0,0 +1,11 @@
equi
equi1
equi1g
faster
faster1
equi965
equi1445
eqcuda
eqcuda1445
feqcuda
verify

LICENSE

@ -3,9 +3,7 @@ The MIT License (MIT)
Copyright (c) 2016 John Tromp
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software, EXCEPT FOR blake2b.cu WHICH ORIGINATES FROM
https://github.com/tpruvot/ccminer/blob/windows/sia/sia.cu,
and associated documentation files (the "Software"), to deal
of this software, and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is

Makefile

@ -2,23 +2,17 @@ OPT = -O3
FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread
GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
all: equi equi1 faster faster1 verify test spark
all: equi equi1 verify test spark
equi: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1
$(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
faster: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster
faster1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1
equi965: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965
@ -37,14 +31,14 @@ feqcuda: equi_miner.cu equi.h blake2b.cu Makefile
verify: equi.h equi.c Makefile
g++ -g equi.c blake/blake2b.cpp -o verify
bench: equi
time for i in {0..9}; do ./faster -n $$i; done
bench: equi1
time ./equi1 -n 1000 -r 100
test: equi verify Makefile
time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0
spark: equi1
time ./equi1
spark: equi1g
time ./equi1g
clean:
rm equi equi1 equi1g faster faster1 equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
rm equi equi1 equi1g equi965 equi1445 eqcuda eqcuda1445 feqcuda verify

blake2b.cu

@ -1,5 +1,7 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
/**
* uint2 direct ops by c++ operator definitions

equi_miner.cpp

@ -43,10 +43,11 @@ int main(int argc, char **argv) {
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
if (range > 1)
printf("-%d", nonce+range-1);
printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory\n", eq.hta.alloced >> 20);
u32 sumnsols = 0;
for (int r = 0; r < range; r++) {
eq.setnonce(header, nonce+r);

equi_miner.h

@ -109,87 +109,52 @@ u32 htunits(u32 bytes) {
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
}
#ifdef JOINHT
u32 slotsize(const u32 r) {
return 1 + htunits(hashsize(r));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) {
return NSLOTS * slotsize(r);
}
#else
u32 slotsize(const u32 r) {
return 1;
}
#endif
// manages hash and tree data
struct htalloc {
// Defining JOINHT joins each tree with its corresponding hash,
// so they may share a cache line. This gives a small speed
// advantage but comes at the cost of a big memory increase
// as hash-space can no longer be reclaimed
#ifdef JOINHT
htunit *trees[WK];
#else
bucket *trees[WK];
htunit *hashes[WK];
#endif
u64 alloced;
u32 alloced;
htalloc() {
alloced = 0;
}
void alloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
trees[r] = (htunit *)alloc(NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
// optimize xenoncat's fixed memory layout, avoiding any waste
// digit trees hashes trees
// 0 0 A A A A A A . . . . . .
// 1 0 A A A A A A B B B B B 1
// 2 0 2 C C C C C B B B B B 1
// 3 0 2 C C C C C D D D D 3 1
// 4 0 2 4 E E E E D D D D 3 1
// 5 0 2 4 E E E E F F F 5 3 1
// 6 0 2 4 6 . G G F F F 5 3 1
// 7 0 2 4 6 . G G H H 7 5 3 1
// 8 0 2 4 6 8 . I H H 7 5 3 1
// digit trees hashes trees hashes
// 0 0 A A A A A A . . . . . .
// 1 0 A A A A A A 1 B B B B B
// 2 0 2 C C C C C 1 B B B B B
// 3 0 2 C C C C C 1 3 D D D D
// 4 0 2 4 E E E E 1 3 D D D D
// 5 0 2 4 E E E E 1 3 5 F F F
// 6 0 2 4 6 . G G 1 3 5 F F F
// 7 0 2 4 6 . G G 1 3 5 7 H H
// 8 0 2 4 6 8 . I 1 3 5 7 H H
assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
digit *heap = (digit *)alloc(1+units0+units1+1, sizeof(digit));
for (int r=0; r<WK; r++) {
trees[r] = (bucket *)(heap + (r&1 ? 1+units0+units1-r/2 : r/2));
hashes[r] = (htunit *)(heap + (r&1 ? 1+units0 : 1+r/2));
}
#endif
digit *heap[2];
for (u32 i =0; i < 2; i++)
heap[i] = (digit *)alloc(1 + htunits(hashsize(i)), sizeof(digit));
for (int r=0; r<WK; r++)
trees[r] = (htunit *)heap[r&1] + r/2;
}
void dealloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
dealloc(trees[r], NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
dealloc(trees[0], 1+units0+units1+1, sizeof(digit));
#endif
for (u32 i =0; i < 2; i++)
free(trees[i]);
}
u32 slotsize(const u32 r) const {
return 1 + htunits(hashsize(r&1));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) const {
return NSLOTS * slotsize(r);
}
htunit *getbucket(u32 r, u32 bid) const {
#ifdef JOINHT
return &trees[r][bid * bucketsize(r)];
#else
return trees[r][bid];
#endif
}
void *alloc(const u32 n, const u32 sz) {
void *mem = calloc(n, sz);
assert(mem);
alloced += (u64)n * sz;
alloced += n * sz;
return mem;
}
void dealloc(void *mem, const u32 n, const u32 sz) {
free(mem);
alloced -= (u64)n * sz;
}
};
typedef au32 bsizes[NBUCKETS];
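
The comment block and diagrams above describe the new two-heap layout in prose; the short standalone sketch below (not part of the commit) just prints where each round's tree and hash land inside a slot. WK, UNITS0 and UNITS1 are hypothetical stand-ins for the real parameters htunits(hashsize(0)) and htunits(hashsize(1)).

// Sketch only: per-slot offsets under trees[r] = heap[r&1] + r/2.
// Round r's tree occupies slot offset r/2, its hash starts at r/2+1,
// and the trees of rounds r-2, r-4, ... survive at offsets r/2-1, r/2-2, ...
#include <cstdio>

int main() {
  const int WK = 9, UNITS0 = 6, UNITS1 = 5;      // hypothetical sizes
  for (int r = 0; r < WK; r++) {
    const int units  = (r & 1) ? UNITS1 : UNITS0;
    const int stride = 1 + units;                // slotsize(r): constant within a heap
    printf("digit %d: heap %d, slot stride %d htunits, tree at offset %d, hash from offset %d\n",
           r, r & 1, stride, r / 2, r / 2 + 1);
  }
}
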
@ -249,8 +214,8 @@ struct equi {
const htunit *bt = hta.getbucket(--r,t.bucketid);
const u32 size = 1 << r;
u32 *indices1 = indices + size;
listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
listindices(r, bt[t.slotid0 * hta.slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * hta.slotsize(r)].attr, indices1);
if (*indices > *indices1) {
for (u32 i=0; i < size; i++) {
const u32 tmp = indices[i];
@ -292,7 +257,7 @@ struct equi {
printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE);
#endif
}
printf(" %ld MB\n", hta.alloced >> 20);
printf("\n");
#endif
}
@ -300,65 +265,47 @@ struct equi {
htalloc hta;
u32 prevhtunits;
u32 nexthtunits;
u32 prevslotunits;
u32 nextslotunits;
u32 dunits;
u32 prevbo;
u32 nextbo;
htunit *buck;
htunit *hashbase;
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), prevslotunits(0), dunits(0) {
u32 nexthashbytes = hashsize(r);
nexthtunits = htunits(nexthashbytes);
nextslotunits = 1 + htunits(hashsize(r&1));
prevbo = 0;
nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
if (r) {
u32 prevhashbytes = hashsize(r-1);
prevhtunits = htunits(prevhashbytes);
prevslotunits = 1 + htunits(hashsize((r-1)&1));
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
dunits = prevhtunits - nexthtunits;
}
#ifdef JOINHT
nexthtunits++;
prevhtunits++;
#endif
}
void setbucket(u32 r, u32 bid) {
buck = hta.getbucket(r, bid);
#ifdef JOINHT
hashbase = buck + 1;
#else
hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits;
#endif
}
u32 getxhash(const u32 slot, const htunit *hash) const {
#ifdef XWITHASH
return hash->bytes[prevbo] & 0xf;
#elif defined JOINHT
return buck[slot * prevhtunits].attr.xhash;
#else
return buck[slot].attr.xhash;
#endif
}
u32 prevhashunits() const {
#ifdef JOINHT
return prevhtunits - 1;
#else
return prevhtunits;
return buck[slot * prevslotunits].attr.xhash;
#endif
}
bool equal(const htunit *hash0, const htunit *hash1) const {
return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
return hash0[prevhtunits-1].hash == hash1[prevhtunits-1].hash;
}
htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
htunit *buck = hta.getbucket(r,bid);
#ifdef JOINHT
htunit *slotree = buck + slot * nexthtunits;
htunit *slotree = buck + slot * nextslotunits;
slotree->attr = t;
return slotree + 1;
#else
buck[slot].attr = t;
return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits;
#endif
}
};
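
For the constructor above, a minimal standalone sketch (not part of the commit) of the quantities it derives per round: prevslotunits is the slot stride in the shared heap, prevhtunits the number of htunits actually holding round r-1's hash, and prevbo the byte offset of that hash inside its htunits. The hashbytes[] table and the 4-byte htunit width are hypothetical stand-ins for hashsize() and sizeof(htunit).

#include <cstdio>

int main() {
  const int WK = 9, HTUNIT = 4;                  // assumed htunit width in bytes
  const int hashbytes[WK] = { 23, 20, 18, 15, 13, 10, 8, 5, 3 };  // stand-in for hashsize(r)
  for (int r = 1; r < WK; r++) {
    const int prevhtunits   = (hashbytes[r-1] + HTUNIT - 1) / HTUNIT;
    const int prevslotunits = 1 + (hashbytes[(r-1) & 1] + HTUNIT - 1) / HTUNIT;
    const int prevbo        = prevhtunits * HTUNIT - hashbytes[r-1];         // 0-3
    const int nexthtunits   = (hashbytes[r] + HTUNIT - 1) / HTUNIT;
    printf("round %d: prevhtunits %d, prevslotunits %d, prevbo %d, dunits %d\n",
           r, prevhtunits, prevslotunits, prevbo, prevhtunits - nexthtunits);
  }
}
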
@ -467,14 +414,14 @@ struct equi {
htl.setbucket(r-1, bucketid);
u32 bsize = getnslots(r-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1))) {
xfull++;
continue;
}
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
if (htl.equal(hash0, hash1)) {
hfull++;
continue;
@ -511,7 +458,7 @@ struct equi {
xort.xhash = xhash;
#endif
htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
for (u32 i=htl.dunits; i < htl.prevhtunits; i++)
xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
}
}
@ -526,12 +473,12 @@ struct equi {
htl.setbucket(WK-1, bucketid);
u32 bsize = getnslots(WK-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
if (htl.equal(hash0, hash1)) {
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;