diff --git a/README.md b/README.md
index e56b79a..a3d76b8 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ Performance summary (on 4GHz i7-4790K and NVidia GTX980):
 - 8 x eqavx21: 20.3 Sol/s
 - 8 x dev1:    20.6 Sol/s
 
-- eqcuda:      23.6 Sol/s
+- eqcuda:      24.7 Sol/s
 
 And now, for something completely different: (144,5) taking 2.6 GB of memory
 
diff --git a/blake2-avx2/blake2bip.c b/blake2-avx2/blake2bip.c
index 2abb7ea..e8288de 100644
--- a/blake2-avx2/blake2bip.c
+++ b/blake2-avx2/blake2bip.c
@@ -303,7 +303,7 @@ ALIGN(64) static const uint32_t indices[12][16] = {
   }                                                                     \
 } while(0)
 
-void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
+void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) {
   __m256i v[16], s[8], iv[8], w[16], counter, flag;
   uint32_t b, i, r;
 
diff --git a/blake2-avx2/blake2bip.h b/blake2-avx2/blake2bip.h
index 11799c6..a046a5e 100644
--- a/blake2-avx2/blake2bip.h
+++ b/blake2-avx2/blake2bip.h
@@ -6,6 +6,7 @@
 typedef uint32_t u32;
 typedef unsigned char uchar;
 
-void blake2bip_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx);
+void blake2bx4_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx);
+void blake2bx8_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx);
 
 #endif
diff --git a/equi_miner.cu b/equi_miner.cu
index 96b9d26..60ed138 100644
--- a/equi_miner.cu
+++ b/equi_miner.cu
@@ -194,7 +194,7 @@ struct equi {
     nslot = 0;
     return n;
   }
-  __device__ void orderindices(u32 *indices, u32 size) {
+  __device__ bool orderindices(u32 *indices, u32 size) {
     if (indices[0] > indices[size]) {
       for (u32 i=0; i < size; i++) {
         const u32 tmp = indices[i];
@@ -202,84 +202,84 @@ struct equi {
         indices[size+i] = tmp;
       }
     }
+    return false;
   }
-  __device__ void listindices1(const tree t, u32 *indices) {
+  __device__ bool listindices1(const tree t, u32 *indices) {
     const bucket0 &buck = hta.trees0[0][t.bucketid()];
     const u32 size = 1 << 0;
     indices[0]    = buck[t.slotid0()].attr.getindex();
     indices[size] = buck[t.slotid1()].attr.getindex();
     orderindices(indices, size);
+    return false;
   }
-  __device__ void listindices2(const tree t, u32 *indices) {
+  __device__ bool listindices2(const tree t, u32 *indices) {
     const bucket1 &buck = hta.trees1[0][t.bucketid()];
     const u32 size = 1 << 1;
-    listindices1(buck[t.slotid0()].attr, indices);
-    listindices1(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices1(buck[t.slotid0()].attr, indices) || 
+           listindices1(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
-  __device__ void listindices3(const tree t, u32 *indices) {
+  __device__ bool listindices3(const tree t, u32 *indices) {
     const bucket0 &buck = hta.trees0[1][t.bucketid()];
     const u32 size = 1 << 2;
-    listindices2(buck[t.slotid0()].attr, indices);
-    listindices2(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices2(buck[t.slotid0()].attr, indices) || 
+           listindices2(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
-  __device__ void listindices4(const tree t, u32 *indices) {
+  __device__ bool listindices4(const tree t, u32 *indices) {
     const bucket1 &buck = hta.trees1[1][t.bucketid()];
     const u32 size = 1 << 3;
-    listindices3(buck[t.slotid0()].attr, indices);
-    listindices3(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices3(buck[t.slotid0()].attr, indices) || 
+           listindices3(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
-  __device__ void listindices5(const tree t, u32 *indices) {
+  __device__ bool listindices5(const tree t, u32 *indices) {
     const bucket0 &buck = hta.trees0[2][t.bucketid()];
     const u32 size = 1 << 4;
-    listindices4(buck[t.slotid0()].attr, indices);
-    listindices4(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices4(buck[t.slotid0()].attr, indices) || 
+           listindices4(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
 
 #if WK == 9
-  __device__ void listindices6(const tree t, u32 *indices) {
+  __device__ bool listindices6(const tree t, u32 *indices) {
     const bucket1 &buck = hta.trees1[2][t.bucketid()];
     const u32 size = 1 << 5;
-    listindices5(buck[t.slotid0()].attr, indices);
-    listindices5(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices5(buck[t.slotid0()].attr, indices) || 
+           listindices5(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
-  __device__ void listindices7(const tree t, u32 *indices) {
+  __device__ bool listindices7(const tree t, u32 *indices) {
     const bucket0 &buck = hta.trees0[3][t.bucketid()];
     const u32 size = 1 << 6;
-    listindices6(buck[t.slotid0()].attr, indices);
-    listindices6(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices6(buck[t.slotid0()].attr, indices) || 
+           listindices6(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
-  __device__ void listindices8(const tree t, u32 *indices) {
+  __device__ bool listindices8(const tree t, u32 *indices) {
     const bucket1 &buck = hta.trees1[3][t.bucketid()];
     const u32 size = 1 << 7;
-    listindices7(buck[t.slotid0()].attr, indices);
-    listindices7(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices7(buck[t.slotid0()].attr, indices) || 
+           listindices7(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
-  __device__ void listindices9(const tree t, u32 *indices) {
+  __device__ bool listindices9(const tree t, u32 *indices) {
     const bucket0 &buck = hta.trees0[4][t.bucketid()];
     const u32 size = 1 << 8;
-    listindices8(buck[t.slotid0()].attr, indices);
-    listindices8(buck[t.slotid1()].attr, indices+size);
-    orderindices(indices, size);
+    return listindices8(buck[t.slotid0()].attr, indices) || 
+           listindices8(buck[t.slotid1()].attr, indices+size) ||
+           orderindices(indices, size) || indices[0] == indices[size];
   }
 #endif
   __device__ void candidate(const tree t) {
     proof prf;
 #if WK==9
-    listindices9(t, prf);
+    if (listindices9(t, prf)) return;
 #elif WK==5
-    listindices5(t, prf);
+    if (listindices5(t, prf)) return;
 #else
 #error not implemented
 #endif
-    if (probdupe(prf))
-      return;
     u32 soli = atomicAdd(&nsols, 1);
     if (soli < MAXSOLS)
 #if WK==9
@@ -317,19 +317,6 @@ struct equi {
     printf("\n");
 #endif
   }
-  // proper dupe test is a little costly on GPU, so allow false negatives
-  __device__ bool probdupe(u32 *prf) {
-    unsigned short susp[PROOFSIZE];
-    memset(susp, 0xffff, PROOFSIZE * sizeof(unsigned short));
-    for (u32 i=0; i<PROOFSIZE; i++) {
-      u32 bin = prf[i] & (PROOFSIZE-1);
-      unsigned short msb = prf[i]>>WK;
-      if (msb == susp[bin])
-        return true;
-      susp[bin] = msb;
-    }
-    return false;
-  }
   struct htlayout {
     htalloc hta;
     u32 prevhashunits;
@@ -998,6 +985,7 @@ int main(int argc, char **argv) {
     cudaEventElapsedTime(&duration, start, stop);
       printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f);
 
+    printf("%d candidate solutions\n", eq.nsols);
     u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols);
     for (s = nsols = 0; s < maxsols; s++) {
       if (duped(sols[s])) {
diff --git a/equi_miner.h b/equi_miner.h
index a11c39f..21eda46 100644
--- a/equi_miner.h
+++ b/equi_miner.h
@@ -396,52 +396,42 @@ struct equi {
   }
   // if dupes != 0, list indices in arbitrary order and return true if dupe found
   // if dupes == 0, order indices as in Wagner condition
-  bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) {
+  bool listindices0(u32 r, const tree t, u32 *indices) {
     if (r == 0) {
       u32 idx = t.getindex();
-      if (dupes) {
-      // recognize most dupes by storing last seen index
-      // with same K least significant bits in array dupes
-        u32 bin = idx & (PROOFSIZE-1);
-        if (idx == dupes[bin]) return true;
-        dupes[bin] = idx;
-      }
       *indices = idx;
       return false;
     }
     const slot1 *buck = hta.heap1[t.bucketid()];
     const u32 size = 1 << --r;
     u32 tagi = hashwords(hashsize(r));
-    return listindices1(r, buck[t.slotid0()][tagi].tag, indices,      dupes)
-        || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
-        || (!dupes && orderindices(indices, size));
+    return listindices1(r, buck[t.slotid0()][tagi].tag, indices)
+        || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size)
+        || orderindices(indices, size) || indices[0] == indices[size];
   }
   // need separate instance for accessing (differently typed) heap1
-  bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) {
+  bool listindices1(u32 r, const tree t, u32 *indices) {
     const slot0 *buck = hta.heap0[t.bucketid()];
     const u32 size = 1 << --r;
     u32 tagi = hashwords(hashsize(r));
-    return listindices0(r, buck[t.slotid0()][tagi].tag, indices,      dupes)
-        || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
-        || (!dupes && orderindices(indices, size));
+    return listindices0(r, buck[t.slotid0()][tagi].tag, indices)
+        || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size)
+        || orderindices(indices, size) || indices[0] == indices[size];
   }
   // check a candidate that resulted in 0 xor
   // add as solution, with proper subtree ordering, if it has unique indices
   void candidate(const tree t) {
-    proof prf, dupes;
-    memset(dupes, 0xffff, sizeof(proof));
-    if (listindices1(WK, t, prf, dupes)) return; // assume WK odd
-    // it survived the probable dupe test, now check fully
-    qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
-    for (u32 i=1; i<PROOFSIZE; i++) if (prf[i] <= prf[i-1]) return;
-    // and now we have ourselves a genuine solution, not yet properly ordered
+    proof prf;
+    // listindices combines index tree reconstruction with probably dupe test
+    if (listindices1(WK, t, prf) || duped(prf)) return; // assume WK odd
+    // and now we have ourselves a genuine solution
 #ifdef ATOMIC
     u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
 #else
     u32 soli = nsols++;
 #endif
-    // retrieve solution indices in correct order
-    if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); // assume WK odd
+    // copy solution into final place
+    if (soli < MAXSOLS) memcpy(sols[soli], prf, sizeof(proof));
   }
 #endif
   // show bucket stats and, if desired, size distribution
@@ -609,7 +599,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
     blake2b_state state0 = blake_ctx;  // local copy on stack can be copied faster
     for (u32 block = id; block < NBLOCKS; block += nthreads) {
 #ifdef USE_AVX2
-      blake2bip_final(&state0, hashes, block);
+      blake2bx4_final(&state0, hashes, block);
 #else
       blake2b_state state = state0;  // make another copy since blake2b_final modifies it
       u32 leb = htole32(block);