fix Linux build, one cmake file (TODO make shared lib all in one build)

2017-01-12 21:26:59 +01:00 · 2017-01-12 21:26:59 +01:00 · af3a5c48ac
parent d56e5e8e26
commit af3a5c48ac
50 changed files with 2147 additions and 1575 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,198 @@
+project(nheqminer)
+cmake_minimum_required(VERSION 3.5)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") #  -Wall
+
+## Enable solvers here
+#### older slower
+option(USE_CPU_TROMP "USE CPU_TROMP" OFF)
+option(USE_CUDA_TROMP "USE CUDA_TROMP" OFF)
+#### faster
+option(USE_CPU_XENONCAT "USE CPU_XENONCAT" ON)
+option(USE_CUDA_DJEZO "USE CUDA_DJEZO" ON)
+
+## Add solvers here
+if (USE_CPU_TROMP)
+    add_definitions(-DUSE_CPU_TROMP)
+    message("-- USE_CPU_TROMP DEFINED")
+endif()
+if (USE_CPU_XENONCAT)
+    add_definitions(-DUSE_CPU_XENONCAT)
+    message("-- USE_CPU_XENONCAT DEFINED")
+endif()
+if (USE_CUDA_TROMP)
+    add_definitions(-DUSE_CUDA_TROMP)
+    message("-- USE_CUDA_TROMP DEFINED")
+endif()
+if (USE_CUDA_DJEZO)
+    add_definitions(-DUSE_CUDA_DJEZO)
+    message("-- USE_CUDA_DJEZO DEFINED")
+endif()
+
+
+########
+# LINUX
+if(CMAKE_COMPILER_IS_GNUCXX)
+#    # use native cpu features
+#    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -fPIC")
+#    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fPIC")
+
+#    # optimizations
+#    add_definitions(-O3)
+
+    # use
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse2")
+    # optimizations
+    add_definitions(-O2)
+endif()
+
+# Common
+include_directories(${nheqminer_SOURCE_DIR}/nheqminer)
+
+# BOOST
+#find_package(Threads REQUIRED COMPONENTS)
+# compile boost staticaly
+set(Boost_USE_STATIC_LIBS ON)
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+#set(BUILD_SHARED_LIBRARIES OFF)
+#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
+find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
+
+if (Boost_FOUND)
+  # From the offical documentation:
+  # Add include directories to the build. [...] If the SYSTEM option is given,
+  # the compiler will be told the directories are meant as system include
+  # directories on some platforms (signalling this setting might achieve effects
+  # such as the compiler skipping warnings [...])."
+  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
+
+  # From the offical documentation:
+  # "Specify directories in which the linker will look for libraries. [...] Note
+  # that this command is rarely necessary. Library locations returned by
+  # find_package() and find_library() are absolute paths. Pass these absolute
+  # library file paths directly to the target_link_libraries() command. CMake
+  # will ensure the linker finds them."
+  link_directories (${Boost_LIBRARY_DIRS})
+else()
+    message("Boost_FOUND NOT FOUND")
+endif ()
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/../)
+
+set(SOURCE_FILES
+    # sources
+    nheqminer/amount.cpp
+    nheqminer/api.cpp
+    nheqminer/arith_uint256.cpp
+    nheqminer/crypto/sha256.cpp
+    nheqminer/json/json_spirit_reader.cpp
+    nheqminer/json/json_spirit_value.cpp
+    nheqminer/json/json_spirit_writer.cpp
+    nheqminer/libstratum/ZcashStratum.cpp
+    nheqminer/main.cpp
+    nheqminer/primitives/block.cpp
+    nheqminer/speed.cpp
+    nheqminer/uint256.cpp
+    nheqminer/utilstrencodings.cpp
+    # headers
+    nheqminer/amount.h
+    nheqminer/api.hpp
+    nheqminer/arith_uint256.h
+    nheqminer/crypto/sha256.h
+    nheqminer/hash.h
+    nheqminer/json/json_spirit.h
+    nheqminer/json/json_spirit_error_position.h
+    nheqminer/json/json_spirit_reader.h
+    nheqminer/json/json_spirit_reader_template.h
+    nheqminer/json/json_spirit_stream_reader.h
+    nheqminer/json/json_spirit_utils.h
+    nheqminer/json/json_spirit_value.h
+    nheqminer/json/json_spirit_writer.h
+    nheqminer/json/json_spirit_writer_template.h
+    nheqminer/libstratum/StratumClient.cpp
+    nheqminer/libstratum/StratumClient.h
+    nheqminer/libstratum/ZcashStratum.cpp
+    nheqminer/libstratum/ZcashStratum.h
+    nheqminer/primitives/block.h
+    nheqminer/primitives/transaction.h
+    nheqminer/script/script.h
+    nheqminer/serialize.h
+    nheqminer/speed.hpp
+    nheqminer/streams.h
+    nheqminer/support/allocators/zeroafterfree.h
+    nheqminer/tinyformat.h
+    nheqminer/uint252.h
+    nheqminer/uint256.h
+    nheqminer/utilstrencodings.h
+    nheqminer/version.h
+    nheqminer/zcash/JoinSplit.hpp
+    nheqminer/zcash/NoteEncryption.hpp
+    nheqminer/zcash/Proof.hpp
+    nheqminer/zcash/Zcash.h
+    nheqminer/SolverStub.h # just a stub
+
+    nheqminer/AvailableSolvers.h
+    nheqminer/ISolver.h
+    nheqminer/Solver.h
+    nheqminer/MinerFactory.h
+    nheqminer/MinerFactory.cpp
+
+    # make same path on windows
+    #blake shared
+    # src
+    blake2/blake2bx.cpp
+    # headers
+    blake2/blake2.h
+    blake2/blake2b-load-sse2.h
+    blake2/blake2b-load-sse41.h
+    blake2/blake2b-round.h
+    blake2/blake2-config.h
+    blake2/blake2-impl.h
+    blake2/blake2-round.h
+    )
+
+#set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES})
+set(LIBS ${LIBS} ${Boost_LIBRARIES})
+
+message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
+message("-- LIBS: ${LIBS}")
+
+if (USE_CPU_TROMP)
+    add_subdirectory(cpu_tromp)
+endif()
+if (USE_CPU_XENONCAT)
+    add_subdirectory(cpu_xenoncat)
+endif()
+if (USE_CUDA_TROMP)
+    add_subdirectory(cuda_tromp)
+endif()
+if (USE_CUDA_DJEZO)
+    add_subdirectory(cuda_djezo)
+endif()
+
+#add_subdirectory(cpu_xenoncat)
+
+ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES})
+
+#target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} )
+target_link_libraries(${PROJECT_NAME} ${CMAKE_THREAD_LIBS_INIT} ${LIBS} )
+
+# link libs
+if (USE_CPU_TROMP)
+    target_link_libraries(${PROJECT_NAME} cpu_tromp)
+endif()
+if (USE_CPU_XENONCAT)
+    add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
+    set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx1.o" )
+    add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
+    set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx2.o" )
+    target_link_libraries(${PROJECT_NAME} cpu_xenoncat xenoncat_avx1 xenoncat_avx2)
+endif()
+if (USE_CUDA_TROMP)
+    target_link_libraries(${PROJECT_NAME} cuda_tromp)
+endif()
+if (USE_CUDA_DJEZO)
+    target_link_libraries(${PROJECT_NAME} cuda_djezo)
+endif()
+
--- a/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt
+++ b/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt
@ -17,34 +17,7 @@ endif()
 # Common
 include_directories(${nheqminer_SOURCE_DIR})

-#add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
-# BOOST
-#find_package(Threads REQUIRED COMPONENTS)
-# compile boost staticaly
-set(Boost_USE_STATIC_LIBS ON)
-set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
-#set(BUILD_SHARED_LIBRARIES OFF)
-#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
-find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
-
-if (Boost_FOUND)
-  # From the offical documentation:
-  # Add include directories to the build. [...] If the SYSTEM option is given,
-  # the compiler will be told the directories are meant as system include
-  # directories on some platforms (signalling this setting might achieve effects
-  # such as the compiler skipping warnings [...])."
-  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
-
-  # From the offical documentation:
-  # "Specify directories in which the linker will look for libraries. [...] Note
-  # that this command is rarely necessary. Library locations returned by
-  # find_package() and find_library() are absolute paths. Pass these absolute
-  # library file paths directly to the target_link_libraries() command. CMake
-  # will ensure the linker finds them."
-  link_directories (${Boost_LIBRARY_DIRS})
-else()
-    message("Boost_FOUND NOT FOUND")
-endif ()
+add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)

 #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)

@ -75,6 +48,25 @@ else()
 message("CUDA NOT FOUND")
 endif()

+if (Boost_FOUND)
+  # From the offical documentation:
+  # Add include directories to the build. [...] If the SYSTEM option is given,
+  # the compiler will be told the directories are meant as system include
+  # directories on some platforms (signalling this setting might achieve effects
+  # such as the compiler skipping warnings [...])."
+  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
+
+  # From the offical documentation:
+  # "Specify directories in which the linker will look for libraries. [...] Note
+  # that this command is rarely necessary. Library locations returned by
+  # find_package() and find_library() are absolute paths. Pass these absolute
+  # library file paths directly to the target_link_libraries() command. CMake
+  # will ensure the linker finds them."
+  link_directories (${Boost_LIBRARY_DIRS})
+else()
+    message("Boost_FOUND NOT FOUND")
+endif ()
+
 ## Add solvers here
 #add_definitions(-DUSE_CPU_XENONCAT)
 #add_definitions(-DUSE_CPU_TROMP)
@ -141,11 +133,11 @@ set(SOURCE_FILES
    ../../nheqminer/SolverStub.h # just a stub


-    ../../nheqminer/AvailableSolvers.h
-    ../../nheqminer/ISolver.h
-    ../../nheqminer/Solver.h
-    ../../nheqminer/MinerFactory.h
-    ../../nheqminer/MinerFactory.cpp
+        ../../nheqminer/AvailableSolvers.h
+        ../../nheqminer/ISolver.h
+        ../../nheqminer/Solver.h
+        ../../nheqminer/MinerFactory.h
+        ../../nheqminer/MinerFactory.cpp

 #    # cpu tromp
 #    ../../cpu_tromp/blake2/blake2bx.cpp
--- a/blake2/blake2-config.h
+++ b/blake2/blake2-config.h
@ -0,0 +1,72 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_CONFIG_H__
+#define __BLAKE2_CONFIG_H__
+
+// These don't work everywhere
+#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64))
+#define HAVE_SSE2
+#endif
+
+#if defined(__SSSE3__)
+#define HAVE_SSSE3
+#endif
+
+#if defined(__SSE4_1__)
+#define HAVE_SSE41
+#endif
+
+#if defined(__AVX__)
+#define HAVE_AVX
+#endif
+
+#if defined(__XOP__)
+#define HAVE_XOP
+#endif
+
+
+#ifdef HAVE_AVX2
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_XOP
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_AVX
+#ifndef HAVE_SSE41
+#define HAVE_SSE41
+#endif
+#endif
+
+#ifdef HAVE_SSE41
+#ifndef HAVE_SSSE3
+#define HAVE_SSSE3
+#endif
+#endif
+
+#ifdef HAVE_SSSE3
+#define HAVE_SSE2
+#endif
+
+#if !defined(HAVE_SSE2)
+#error "This code requires at least SSE2."
+#endif
+
+#endif
+
--- a/blake2/blake2-impl.h
+++ b/blake2/blake2-impl.h
@ -0,0 +1,136 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_IMPL_H__
+#define __BLAKE2_IMPL_H__
+
+#include <stdint.h>
+
+static inline uint32_t load32( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  uint32_t w = *p++;
+  w |= ( uint32_t )( *p++ ) <<  8;
+  w |= ( uint32_t )( *p++ ) << 16;
+  w |= ( uint32_t )( *p++ ) << 24;
+  return w;
+#endif
+}
+
+static inline uint64_t load64( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  uint64_t w = *p++;
+  w |= ( uint64_t )( *p++ ) <<  8;
+  w |= ( uint64_t )( *p++ ) << 16;
+  w |= ( uint64_t )( *p++ ) << 24;
+  w |= ( uint64_t )( *p++ ) << 32;
+  w |= ( uint64_t )( *p++ ) << 40;
+  w |= ( uint64_t )( *p++ ) << 48;
+  w |= ( uint64_t )( *p++ ) << 56;
+  return w;
+#endif
+}
+
+static inline void store32( void *dst, uint32_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static inline void store64( void *dst, uint64_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static inline uint64_t load48( const void *src )
+{
+  const uint8_t *p = ( const uint8_t * )src;
+  uint64_t w = *p++;
+  w |= ( uint64_t )( *p++ ) <<  8;
+  w |= ( uint64_t )( *p++ ) << 16;
+  w |= ( uint64_t )( *p++ ) << 24;
+  w |= ( uint64_t )( *p++ ) << 32;
+  w |= ( uint64_t )( *p++ ) << 40;
+  return w;
+}
+
+static inline void store48( void *dst, uint64_t w )
+{
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+}
+
+static inline uint32_t rotl32( const uint32_t w, const unsigned c )
+{
+  return ( w << c ) | ( w >> ( 32 - c ) );
+}
+
+static inline uint64_t rotl64( const uint64_t w, const unsigned c )
+{
+  return ( w << c ) | ( w >> ( 64 - c ) );
+}
+
+static inline uint32_t rotr32( const uint32_t w, const unsigned c )
+{
+  return ( w >> c ) | ( w << ( 32 - c ) );
+}
+
+static inline uint64_t rotr64( const uint64_t w, const unsigned c )
+{
+  return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
+/* prevents compiler optimizing out memset() */
+static inline void secure_zero_memory( void *v, size_t n )
+{
+  volatile uint8_t *p = ( volatile uint8_t * )v;
+  while( n-- ) *p++ = 0;
+}
+
+#endif
+
--- a/blake2/blake2-round.h
+++ b/blake2/blake2-round.h
@ -0,0 +1,85 @@
+#define _mm_roti_epi64(x, c) \
+	(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+	: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+	: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+	: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+	: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	row1l = _mm_add_epi64(row1l, row2l); \
+	row1h = _mm_add_epi64(row1h, row2h); \
+	\
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	\
+	row4l = _mm_roti_epi64(row4l, -32); \
+	row4h = _mm_roti_epi64(row4h, -32); \
+	\
+	row3l = _mm_add_epi64(row3l, row4l); \
+	row3h = _mm_add_epi64(row3h, row4h); \
+	\
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	\
+	row2l = _mm_roti_epi64(row2l, -24); \
+	row2h = _mm_roti_epi64(row2h, -24); \
+ 
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	row1l = _mm_add_epi64(row1l, row2l); \
+	row1h = _mm_add_epi64(row1h, row2h); \
+	\
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	\
+	row4l = _mm_roti_epi64(row4l, -16); \
+	row4h = _mm_roti_epi64(row4h, -16); \
+	\
+	row3l = _mm_add_epi64(row3l, row4l); \
+	row3h = _mm_add_epi64(row3h, row4h); \
+	\
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	\
+	row2l = _mm_roti_epi64(row2l, -63); \
+	row2h = _mm_roti_epi64(row2h, -63); \
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+	t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	\
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0;    \
+	\
+	t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+	t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+	t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	\
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0; \
+	\
+	t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+	t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	\
+	DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	\
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	\
+	UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
--- a/blake2/blake2.h
+++ b/blake2/blake2.h
@ -0,0 +1,156 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_H__
+#define __BLAKE2_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__ ((__aligned__(x)))
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+  enum blake2s_constant
+  {
+    BLAKE2S_BLOCKBYTES = 64,
+    BLAKE2S_OUTBYTES   = 32,
+    BLAKE2S_KEYBYTES   = 32,
+    BLAKE2S_SALTBYTES  = 8,
+    BLAKE2S_PERSONALBYTES = 8
+  };
+
+  enum blake2b_constant
+  {
+    BLAKE2B_BLOCKBYTES = 128,
+    BLAKE2B_OUTBYTES   = 64,
+    BLAKE2B_KEYBYTES   = 64,
+    BLAKE2B_SALTBYTES  = 16,
+    BLAKE2B_PERSONALBYTES = 16
+  };
+
+#pragma pack(push, 1)
+  typedef struct __blake2s_param
+  {
+    uint8_t  digest_length; // 1
+    uint8_t  key_length;    // 2
+    uint8_t  fanout;        // 3
+    uint8_t  depth;         // 4
+    uint32_t leaf_length;   // 8
+    uint8_t  node_offset[6];// 14
+    uint8_t  node_depth;    // 15
+    uint8_t  inner_length;  // 16
+    // uint8_t  reserved[0];
+    uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+    uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+  } blake2s_param;
+
+  ALIGN( 64 ) typedef struct __blake2s_state
+  {
+    uint32_t h[8];
+    uint32_t t[2];
+    uint32_t f[2];
+    uint8_t  buf[2 * BLAKE2S_BLOCKBYTES];
+    size_t   buflen;
+    uint8_t  last_node;
+  } blake2s_state;
+
+  typedef struct __blake2b_param
+  {
+    uint8_t  digest_length; // 1
+    uint8_t  key_length;    // 2
+    uint8_t  fanout;        // 3
+    uint8_t  depth;         // 4
+    uint32_t leaf_length;   // 8
+    uint64_t node_offset;   // 16
+    uint8_t  node_depth;    // 17
+    uint8_t  inner_length;  // 18
+    uint8_t  reserved[14];  // 32
+    uint8_t  salt[BLAKE2B_SALTBYTES]; // 48
+    uint8_t  personal[BLAKE2B_PERSONALBYTES];  // 64
+  } blake2b_param;
+
+  ALIGN( 64 ) typedef struct __blake2b_state
+  {
+    uint64_t h[8];
+    uint8_t  buf[BLAKE2B_BLOCKBYTES];
+    uint16_t counter;
+    uint8_t  buflen;
+    uint8_t  lastblock;
+  } blake2b_state;
+
+  ALIGN( 64 ) typedef struct __blake2sp_state
+  {
+    blake2s_state S[8][1];
+    blake2s_state R[1];
+    uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
+    size_t  buflen;
+  } blake2sp_state;
+
+  ALIGN( 64 ) typedef struct __blake2bp_state
+  {
+    blake2b_state S[4][1];
+    blake2b_state R[1];
+    uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
+    size_t  buflen;
+  } blake2bp_state;
+#pragma pack(pop)
+
+  // Streaming API
+  int blake2s_init( blake2s_state *S, const uint8_t outlen );
+  int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+  int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2b_init( blake2b_state *S, const uint8_t outlen );
+  int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
+  int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
+  int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
+  int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
+
+  // Simple API
+  int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen);
+
+  int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+  static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+  {
+    return blake2b( out, in, key, outlen, inlen, keylen );
+  }
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
--- a/blake2/blake2b-load-sse2.h
+++ b/blake2/blake2b-load-sse2.h
@ -0,0 +1,68 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_LOAD_SSE2_H__
+#define __BLAKE2B_LOAD_SSE2_H__
+
+#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
+#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
+#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
+#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
+#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
+#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
+#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
+#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
+#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
+#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
+#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
+#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
+#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
+#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
+#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
+#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
+#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
+#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
+#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
+#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
+#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
+#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
+#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
+#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
+#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
+#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
+#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
+#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
+#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
+
+
+#endif
+
--- a/blake2/blake2b-load-sse41.h
+++ b/blake2/blake2b-load-sse41.h
@ -0,0 +1,402 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_LOAD_SSE41_H__
+#define __BLAKE2B_LOAD_SSE41_H__
+
+#define LOAD_MSG_0_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_0_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_1_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_1_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_1_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_1_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#define LOAD_MSG_2_1(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m5, 8); \
+b1 = _mm_unpackhi_epi64(m2, m7); \
+} while(0)
+
+
+#define LOAD_MSG_2_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m0); \
+b1 = _mm_blend_epi16(m1, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_2_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m5, m1, 0xF0); \
+b1 = _mm_unpackhi_epi64(m3, m4); \
+} while(0)
+
+
+#define LOAD_MSG_2_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m3); \
+b1 = _mm_alignr_epi8(m2, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_3_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_unpackhi_epi64(m6, m5); \
+} while(0)
+
+
+#define LOAD_MSG_3_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m0); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_3_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m2, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_3_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m5); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_4_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m2); \
+b1 = _mm_unpacklo_epi64(m1, m5); \
+} while(0)
+
+
+#define LOAD_MSG_4_2(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m0, m3, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m7, m5, 0xF0); \
+b1 = _mm_blend_epi16(m3, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m0, 8); \
+b1 = _mm_blend_epi16(m4, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_5_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m3); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_5_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m5); \
+b1 = _mm_unpackhi_epi64(m5, m1); \
+} while(0)
+
+
+#define LOAD_MSG_5_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m2, m3, 0xF0); \
+b1 = _mm_unpackhi_epi64(m7, m0); \
+} while(0)
+
+
+#define LOAD_MSG_5_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m2); \
+b1 = _mm_blend_epi16(m7, m4, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_6_1(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m6, m0, 0xF0); \
+b1 = _mm_unpacklo_epi64(m7, m2); \
+} while(0)
+
+
+#define LOAD_MSG_6_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_alignr_epi8(m5, m6, 8); \
+} while(0)
+
+
+#define LOAD_MSG_6_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m3); \
+b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
+} while(0)
+
+
+#define LOAD_MSG_6_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_blend_epi16(m1, m5, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m3); \
+b1 = _mm_blend_epi16(m6, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_2(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpackhi_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_7_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_unpacklo_epi64(m4, m1); \
+} while(0)
+
+
+#define LOAD_MSG_7_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m2); \
+b1 = _mm_unpacklo_epi64(m3, m5); \
+} while(0)
+
+
+#define LOAD_MSG_8_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m7); \
+b1 = _mm_alignr_epi8(m0, m5, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_alignr_epi8(m4, m1, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_3(b0, b1) \
+do \
+{ \
+b0 = m6; \
+b1 = _mm_alignr_epi8(m5, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_4(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m3, 0xF0); \
+b1 = m2; \
+} while(0)
+
+
+#define LOAD_MSG_9_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_unpackhi_epi64(m3, m0); \
+} while(0)
+
+
+#define LOAD_MSG_9_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m2); \
+b1 = _mm_blend_epi16(m3, m2, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_9_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_unpackhi_epi64(m1, m6); \
+} while(0)
+
+
+#define LOAD_MSG_9_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpacklo_epi64(m6, m0); \
+} while(0)
+
+
+#define LOAD_MSG_10_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_10_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_11_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_11_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_11_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_11_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#endif
+
--- a/blake2/blake2b-round.h
+++ b/blake2/blake2b-round.h
@ -0,0 +1,170 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_ROUND_H__
+#define __BLAKE2B_ROUND_H__
+
+#define LOAD(p)  _mm_load_si128( (const __m128i *)(p) )
+#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
+
+#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
+
+#define TOF(reg) _mm_castsi128_ps((reg))
+#define TOI(reg) _mm_castps_si128((reg))
+
+#define LIKELY(x) __builtin_expect((x),1)
+
+
+/* Microarchitecture-specific macros */
+#ifndef HAVE_XOP
+#ifdef HAVE_SSSE3
+#define _mm_roti_epi64(x, c) \
+    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
+#else
+#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))
+#endif
+#else
+/* ... */
+#endif
+
+
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, (-32)); \
+  row4h = _mm_roti_epi64(row4h, (-32)); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, (-24)); \
+  row2h = _mm_roti_epi64(row2h, (-24)); \
+ 
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, (-16)); \
+  row4h = _mm_roti_epi64(row4h, (-16)); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, (-63)); \
+  row2h = _mm_roti_epi64(row2h, (-63)); \
+ 
+#if defined(HAVE_SSSE3)
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0;    \
+  \
+  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+  row4l = t1; \
+  row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0; \
+  \
+  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+  row4l = t1; \
+  row4h = t0;
+#else
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row4l;\
+  t1 = row2l;\
+  row4l = row3l;\
+  row3l = row3h;\
+  row3h = row4l;\
+  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
+  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
+  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
+  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row3l;\
+  row3l = row3h;\
+  row3h = t0;\
+  t0 = row2l;\
+  t1 = row4l;\
+  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
+  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
+  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
+  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
+
+#endif
+
+#if defined(HAVE_SSE41)
+#include "blake2b-load-sse41.h"
+#else
+#include "blake2b-load-sse2.h"
+#endif
+
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
+
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
+	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
--- a/blake2/blake2bx.cpp
+++ b/blake2/blake2bx.cpp
@ -0,0 +1,346 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2-config.h"
+
+#ifdef WIN32
+#include <intrin.h>
+#endif
+
+#include <emmintrin.h>
+#if defined(HAVE_SSSE3)
+#include <tmmintrin.h>
+#endif
+#if defined(HAVE_SSE41)
+#include <smmintrin.h>
+#endif
+#if defined(HAVE_AVX)
+#include <immintrin.h>
+#endif
+#if defined(HAVE_XOP)
+#include <x86intrin.h>
+#endif
+
+#include "blake2b-round.h"
+
+
+
+ALIGN(64) static const uint64_t blake2b_IV[8] =
+{
+	0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+	0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+	0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+	0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/* init xors IV with input parameter block */
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P)
+{
+	//blake2b_init0( S );
+	const uint8_t * v = (const uint8_t *)(blake2b_IV);
+	const uint8_t * p = (const uint8_t *)(P);
+	uint8_t * h = (uint8_t *)(S->h);
+	/* IV XOR ParamBlock */
+	memset(S, 0, sizeof(blake2b_state));
+
+	for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i];
+
+	return 0;
+}
+
+/* Some sort of default parameter block initialization, for sequential blake2b */
+int blake2b_init(blake2b_state *S, const uint8_t outlen)
+{
+	if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
+
+	const blake2b_param P =
+	{
+		outlen,
+		0,
+		1,
+		1,
+		0,
+		0,
+		0,
+		0,
+		{ 0 },
+		{ 0 },
+		{ 0 }
+	};
+	return blake2b_init_param(S, &P);
+}
+
+int blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen)
+{
+	if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
+
+	if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1;
+
+	const blake2b_param P =
+	{
+		outlen,
+		keylen,
+		1,
+		1,
+		0,
+		0,
+		0,
+		0,
+		{ 0 },
+		{ 0 },
+		{ 0 }
+	};
+
+	if (blake2b_init_param(S, &P) < 0)
+		return 0;
+
+	{
+		uint8_t block[BLAKE2B_BLOCKBYTES];
+		memset(block, 0, BLAKE2B_BLOCKBYTES);
+		memcpy(block, key, keylen);
+		blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
+		secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */
+	}
+	return 0;
+}
+
+static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
+{
+	__m128i row1l, row1h;
+	__m128i row2l, row2h;
+	__m128i row3l, row3h;
+	__m128i row4l, row4h;
+	__m128i b0, b1;
+	__m128i t0, t1;
+#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
+	const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
+	const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
+#endif
+#if defined(HAVE_SSE41)
+	const __m128i m0 = LOADU(block + 00);
+	const __m128i m1 = LOADU(block + 16);
+	const __m128i m2 = LOADU(block + 32);
+	const __m128i m3 = LOADU(block + 48);
+	const __m128i m4 = LOADU(block + 64);
+	const __m128i m5 = LOADU(block + 80);
+	const __m128i m6 = LOADU(block + 96);
+	const __m128i m7 = LOADU(block + 112);
+#else
+	const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
+	const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
+	const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
+	const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
+	const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
+	const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
+	const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
+	const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
+	const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
+	const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
+	const uint64_t m10 = ( ( uint64_t * )block )[10];
+	const uint64_t m11 = ( ( uint64_t * )block )[11];
+	const uint64_t m12 = ( ( uint64_t * )block )[12];
+	const uint64_t m13 = ( ( uint64_t * )block )[13];
+	const uint64_t m14 = ( ( uint64_t * )block )[14];
+	const uint64_t m15 = ( ( uint64_t * )block )[15];
+#endif
+	row1l = LOADU(&S->h[0]);
+	row1h = LOADU(&S->h[2]);
+	row2l = LOADU(&S->h[4]);
+	row2h = LOADU(&S->h[6]);
+	row3l = LOADU(&blake2b_IV[0]);
+	row3h = LOADU(&blake2b_IV[2]);
+	row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter));
+	row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock));
+	ROUND(0);
+	ROUND(1);
+	ROUND(2);
+	ROUND(3);
+	ROUND(4);
+	ROUND(5);
+	ROUND(6);
+	ROUND(7);
+	ROUND(8);
+	ROUND(9);
+	ROUND(10);
+	ROUND(11);
+	row1l = _mm_xor_si128(row3l, row1l);
+	row1h = _mm_xor_si128(row3h, row1h);
+	STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
+	STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
+	row2l = _mm_xor_si128(row4l, row2l);
+	row2h = _mm_xor_si128(row4h, row2h);
+	STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
+	STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
+	return 0;
+}
+
+
+int blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen)
+{
+	while (inlen > 0)
+	{
+		size_t left = S->buflen;
+		size_t fill = BLAKE2B_BLOCKBYTES - left;
+
+		if (inlen > fill)
+		{
+			memcpy(S->buf + left, in, fill); // Fill buffer
+			in += fill;
+			inlen -= fill;
+			S->counter += BLAKE2B_BLOCKBYTES;
+			blake2b_compress(S, S->buf); // Compress
+			S->buflen = 0;
+		}
+		else // inlen <= fill
+		{
+			memcpy(S->buf + left, in, inlen);
+			S->buflen += inlen; // not enough to compress
+			in += inlen;
+			inlen = 0;
+		}
+	}
+
+	return 0;
+}
+
+
+int blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen)
+{
+	if (outlen > BLAKE2B_OUTBYTES)
+		return -1;
+
+	if (S->buflen > BLAKE2B_BLOCKBYTES)
+	{
+		S->counter += BLAKE2B_BLOCKBYTES;
+		blake2b_compress(S, S->buf);
+		S->buflen -= BLAKE2B_BLOCKBYTES;
+		memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen);
+	}
+
+	S->counter += S->buflen;
+	S->lastblock = 1;
+	memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
+	blake2b_compress(S, S->buf);
+	memcpy(out, &S->h[0], outlen);
+	S->lastblock = 0;
+	return 0;
+}
+
+
+int blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen)
+{
+	blake2b_state S[1];
+
+	/* Verify parameters */
+	if (NULL == in) return -1;
+
+	if (NULL == out) return -1;
+
+	if (NULL == key) keylen = 0;
+
+	if (keylen)
+	{
+		if (blake2b_init_key(S, outlen, key, keylen) < 0) return -1;
+	}
+	else
+	{
+		if (blake2b_init(S, outlen) < 0) return -1;
+	}
+
+	blake2b_update(S, (const uint8_t *)in, inlen);
+	blake2b_final(S, out, outlen);
+	return 0;
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+	return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
+}
+#endif
+
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( int argc, char **argv )
+{
+	uint8_t key[BLAKE2B_KEYBYTES];
+	uint8_t buf[KAT_LENGTH];
+
+	for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
+		key[i] = ( uint8_t )i;
+
+	for( size_t i = 0; i < KAT_LENGTH; ++i )
+		buf[i] = ( uint8_t )i;
+
+	for( size_t i = 0; i < KAT_LENGTH; ++i )
+	{
+		uint8_t hash[BLAKE2B_OUTBYTES];
+		blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
+
+		if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
+		{
+			puts( "error" );
+			return -1;
+		}
+	}
+
+	puts( "ok" );
+	return 0;
+}
+#endif
+
+int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
+{
+	blake2b_state blake_state;
+	if (outlen <= BLAKE2B_OUTBYTES)
+	{
+		blake2b_init(&blake_state, outlen);
+		blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
+		blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		blake2b_final(&blake_state, out, outlen);
+	}
+	else
+	{
+		uint8_t out_buffer[BLAKE2B_OUTBYTES];
+		uint8_t in_buffer[BLAKE2B_OUTBYTES];
+		blake2b_init(&blake_state, BLAKE2B_OUTBYTES);
+		blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
+		blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES);
+		memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+		out += BLAKE2B_OUTBYTES / 2;
+		uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
+		while (toproduce > BLAKE2B_OUTBYTES)
+		{
+			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+			blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0);
+			memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+			out += BLAKE2B_OUTBYTES / 2;
+			toproduce -= BLAKE2B_OUTBYTES / 2;
+		}
+		memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+		blake2b(out_buffer, in_buffer, NULL, toproduce, BLAKE2B_OUTBYTES, 0);
+		memcpy(out, out_buffer, toproduce);
+
+	}
+	return 0;
+}
--- a/cpu_tromp/CMakeLists.txt
+++ b/cpu_tromp/CMakeLists.txt
@ -0,0 +1,19 @@
+set(EXECUTABLE cpu_tromp)
+
+#cpu_tromp/
+file(GLOB SRC_LIST
+    cpu_tromp.cpp )
+file(GLOB HEADERS
+    cpu_tromp.hpp
+	equi.h
+	equi_miner.h
+    )
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} )
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
--- a/cpu_tromp/equi.h
+++ b/cpu_tromp/equi.h
@ -2,6 +2,7 @@
 // Equihash solver
 // Copyright (c) 2016-2016 John Tromp

+
 #include "blake2/blake2.h"
 #ifdef __APPLE__
 #include "osx_barrier.h"
@ -131,3 +132,4 @@ int verify(u32 indices[PROOFSIZE], const char *header, const u32 headerlen, cons
  uchar hash[WN/8];
  return verifyrec(&ctx, indices, hash, WK);
 }
+
--- a/cpu_xenoncat/CMakeLists.txt
+++ b/cpu_xenoncat/CMakeLists.txt
@ -0,0 +1,17 @@
+set(EXECUTABLE cpu_xenoncat)
+
+#cpu_xenoncat/
+file(GLOB SRC_LIST
+    xenoncat.cpp )
+file(GLOB HEADERS
+    cpu_xenoncat.hpp
+    )
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} )
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
--- a/cpu_xenoncat/Linux/asm/t2.bin
+++ b/cpu_xenoncat/Linux/asm/t2.bin
--- a/cpu_xenoncat/Linux/blake2b/asm/assemble.sh
+++ b/cpu_xenoncat/Linux/blake2b/asm/assemble.sh
@ -1,2 +0,0 @@
-fasm zcblake2_avx1.asm
-fasm zcblake2_avx2.asm
--- a/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm
@ -1,36 +0,0 @@
-xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10
-xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9
-xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8
-xctrinc dd 0,2, 0,2
-
-align 32
-iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
-dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
-dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
-dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-
-s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b	;0x32=50 bytes output
-s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
-s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
-s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a	;Personalization
-s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8	;n=200, k=9
-
-iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f
-dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f
-iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179
-
-align 32
-yctrinit dd 0,0, 0,1, 0,2, 0,3
-yctrinc dd 0,4, 0,4, 0,4, 0,4
-
-blake2sigma db 0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15
-db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3
-db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4
-db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8
-db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13
-db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9
-db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11
-db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10
-db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5
-db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0
--- a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm
@ -1,349 +0,0 @@
-macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq xmm0,xmm0,xmm4
-vpaddq xmm1,xmm1,xmm5
-vpaddq xmm2,xmm2,xmm6
-vpaddq xmm3,xmm3,xmm7
-if m0<lim
-vpaddq xmm0,xmm0, xword [src+m0*16]
-end if
-if m1<lim
-vpaddq xmm1,xmm1, xword [src+m1*16]
-end if
-if m2<lim
-vpaddq xmm2,xmm2, xword [src+m2*16]
-end if
-if m3<lim
-vpaddq xmm3,xmm3, xword [src+m3*16]
-end if
-vpxor xmm12,xmm12,xmm0
-vpxor xmm13,xmm13,xmm1
-vpxor xmm14,xmm14,xmm2
-vpxor xmm15,xmm15,xmm3
-vpshufd xmm12,xmm12,0xB1
-vpshufd xmm13,xmm13,0xB1
-vpshufd xmm14,xmm14,0xB1
-vpshufd xmm15,xmm15,0xB1
-vpaddq xmm8,xmm8,xmm12
-vpaddq xmm9,xmm9,xmm13
-vpaddq xmm10,xmm10,xmm14
-vpaddq xmm11,xmm11,xmm15
-vpxor xmm4,xmm4,xmm8
-vpxor xmm5,xmm5,xmm9
-vpxor xmm6,xmm6,xmm10
-vpxor xmm7,xmm7,xmm11
-vmovdqa [rsp], xmm8
-vmovdqa xmm8, xword [xshufb_ror24]
-vpshufb xmm4,xmm4,xmm8
-vpshufb xmm5,xmm5,xmm8
-vpshufb xmm6,xmm6,xmm8
-vpshufb xmm7,xmm7,xmm8
-vmovdqa xmm8, [rsp]
-
-vpaddq xmm0,xmm0,xmm4
-vpaddq xmm1,xmm1,xmm5
-vpaddq xmm2,xmm2,xmm6
-vpaddq xmm3,xmm3,xmm7
-if m4<lim
-vpaddq xmm0,xmm0, xword [src+m4*16]
-end if
-if m5<lim
-vpaddq xmm1,xmm1, xword [src+m5*16]
-end if
-if m6<lim
-vpaddq xmm2,xmm2, xword [src+m6*16]
-end if
-if m7<lim
-vpaddq xmm3,xmm3, xword [src+m7*16]
-end if
-vpxor xmm12,xmm12,xmm0
-vpxor xmm13,xmm13,xmm1
-vpxor xmm14,xmm14,xmm2
-vpxor xmm15,xmm15,xmm3
-vmovdqa [rsp], xmm0
-vmovdqa xmm0, xword [xshufb_ror16]
-vpshufb xmm12,xmm12,xmm0
-vpshufb xmm13,xmm13,xmm0
-vpshufb xmm14,xmm14,xmm0
-vpshufb xmm15,xmm15,xmm0
-vpaddq xmm8,xmm8,xmm12
-vpaddq xmm9,xmm9,xmm13
-vpaddq xmm10,xmm10,xmm14
-vpaddq xmm11,xmm11,xmm15
-vpxor xmm4,xmm4,xmm8
-vpxor xmm5,xmm5,xmm9
-vpxor xmm6,xmm6,xmm10
-vpxor xmm7,xmm7,xmm11
-
-vpaddq xmm0,xmm4,xmm4
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm0
-vpaddq xmm0,xmm5,xmm5
-vpsrlq xmm5,xmm5,63
-vpor xmm5,xmm5,xmm0
-vpaddq xmm0,xmm6,xmm6
-vpsrlq xmm6,xmm6,63
-vpor xmm6,xmm6,xmm0
-vpaddq xmm0,xmm7,xmm7
-vpsrlq xmm7,xmm7,63
-vpor xmm7,xmm7,xmm0
-
-vmovdqa xmm0, [rsp]
-}
-
-macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq xmm0,xmm0,xmm5
-vpaddq xmm1,xmm1,xmm6
-vpaddq xmm2,xmm2,xmm7
-vpaddq xmm3,xmm3,xmm4
-if m0<lim
-vpaddq xmm0,xmm0, xword [src+m0*16]
-end if
-if m1<lim
-vpaddq xmm1,xmm1, xword [src+m1*16]
-end if
-if m2<lim
-vpaddq xmm2,xmm2, xword [src+m2*16]
-end if
-if m3<lim
-vpaddq xmm3,xmm3, xword [src+m3*16]
-end if
-vpxor xmm15,xmm15,xmm0
-vpxor xmm12,xmm12,xmm1
-vpxor xmm13,xmm13,xmm2
-vpxor xmm14,xmm14,xmm3
-vpshufd xmm15,xmm15,0xB1
-vpshufd xmm12,xmm12,0xB1
-vpshufd xmm13,xmm13,0xB1
-vpshufd xmm14,xmm14,0xB1
-vpaddq xmm10,xmm10,xmm15
-vpaddq xmm11,xmm11,xmm12
-vpaddq xmm8,xmm8,xmm13
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm6,xmm6,xmm11
-vpxor xmm7,xmm7,xmm8
-vpxor xmm4,xmm4,xmm9
-vmovdqa [rsp], xmm10
-vmovdqa xmm10, xword [xshufb_ror24]
-vpshufb xmm5,xmm5,xmm10
-vpshufb xmm6,xmm6,xmm10
-vpshufb xmm7,xmm7,xmm10
-vpshufb xmm4,xmm4,xmm10
-vmovdqa xmm10, [rsp]
-
-vpaddq xmm0,xmm0,xmm5
-vpaddq xmm1,xmm1,xmm6
-vpaddq xmm2,xmm2,xmm7
-vpaddq xmm3,xmm3,xmm4
-if m4<lim
-vpaddq xmm0,xmm0, xword [src+m4*16]
-end if
-if m5<lim
-vpaddq xmm1,xmm1, xword [src+m5*16]
-end if
-if m6<lim
-vpaddq xmm2,xmm2, xword [src+m6*16]
-end if
-if m7<lim
-vpaddq xmm3,xmm3, xword [src+m7*16]
-end if
-vpxor xmm15,xmm15,xmm0
-vpxor xmm12,xmm12,xmm1
-vpxor xmm13,xmm13,xmm2
-vpxor xmm14,xmm14,xmm3
-vmovdqa [rsp], xmm0
-vmovdqa xmm0, xword [xshufb_ror16]
-vpshufb xmm15,xmm15,xmm0
-vpshufb xmm12,xmm12,xmm0
-vpshufb xmm13,xmm13,xmm0
-vpshufb xmm14,xmm14,xmm0
-vpaddq xmm10,xmm10,xmm15
-vpaddq xmm11,xmm11,xmm12
-vpaddq xmm8,xmm8,xmm13
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm6,xmm6,xmm11
-vpxor xmm7,xmm7,xmm8
-vpxor xmm4,xmm4,xmm9
-
-vpaddq xmm0,xmm5,xmm5
-vpsrlq xmm5,xmm5,63
-vpor xmm5,xmm5,xmm0
-vpaddq xmm0,xmm6,xmm6
-vpsrlq xmm6,xmm6,63
-vpor xmm6,xmm6,xmm0
-vpaddq xmm0,xmm7,xmm7
-vpsrlq xmm7,xmm7,63
-vpor xmm7,xmm7,xmm0
-vpaddq xmm0,xmm4,xmm4
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm0
-
-vmovdqa xmm0, [rsp]
-}
-
-macro Blake2bRounds2 lim,src
-{
-;ROUND 0
-;hR0 0,2,4,6,1,3,5,7,lim,src
-;hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 1
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-
-;ROUND 2
-hR0 11,12,5,15,8,0,2,13,lim,src
-hR1 10,3,7,9,14,6,1,4,lim,src
-
-;ROUND 3
-hR0 7,3,13,11,9,1,12,14,lim,src
-hR1 2,5,4,15,6,10,0,8,lim,src
-
-;ROUND 4
-hR0 9,5,2,10,0,7,4,15,lim,src
-hR1 14,11,6,3,1,12,8,13,lim,src
-
-;ROUND 5
-hR0 2,6,0,8,12,10,11,3,lim,src
-hR1 4,7,15,1,13,5,14,9,lim,src
-
-;ROUND 6
-hR0 12,1,14,4,5,15,13,10,lim,src
-hR1 0,6,9,8,7,3,2,11,lim,src
-
-;ROUND 7
-hR0 13,7,12,3,11,14,1,9,lim,src
-hR1 5,15,8,2,0,4,6,10,lim,src
-
-;ROUND 8
-hR0 6,14,11,0,15,9,3,8,lim,src
-hR1 12,13,1,10,2,7,4,5,lim,src
-
-;ROUND 9
-hR0 10,8,7,1,2,4,6,5,lim,src
-hR1 15,9,3,13,11,14,12,0,lim,src
-
-;ROUND 10
-hR0 0,2,4,6,1,3,5,7,lim,src
-hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 11
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-}
-
-macro Blake2beq2of2 mids, src
-{
-vmovddup xmm0, qword [mids]
-vpaddq xmm0,xmm0, xword [src+1*16]
-vmovddup xmm12, qword [mids+0x08]
-vpxor xmm12,xmm12,xmm0
-vpshufb xmm12,xmm12, xword [xshufb_ror16]
-vmovddup xmm8, qword [mids+0x10]
-vpaddq xmm8,xmm8,xmm12
-vmovddup xmm4, qword [mids+0x18]
-vpxor xmm4,xmm4,xmm8
-vpaddq xmm2,xmm4,xmm4	;xmm2 is temp
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm2
-
-vmovddup xmm5, qword [mids+0x20]
-vpaddq xmm0,xmm0,xmm5
-vmovddup xmm1, qword [mids+0x30]
-vpxor xmm12,xmm12,xmm1
-vpshufd xmm12,xmm12,0xB1
-vmovddup xmm13, qword [mids+0x38]
-vpaddq xmm8,xmm8,xmm13
-vmovddup xmm3, qword [mids+0x60]
-vpaddq xmm3,xmm3,xmm4
-vmovddup xmm15, qword [mids+0x48]
-vpxor xmm15,xmm15,xmm0
-vpshufd xmm15,xmm15,0xB1
-vmovddup xmm11, qword [mids+0x58]
-vpaddq xmm11,xmm11,xmm12
-vmovddup xmm7, qword [mids+0x68]
-vpxor xmm7,xmm7,xmm8
-vmovddup xmm14, qword [mids+0x40]
-vpxor xmm14,xmm14,xmm3
-vpshufd xmm14,xmm14,0xB1
-vmovddup xmm10, qword [mids+0x50]
-vpaddq xmm10,xmm10,xmm15
-vmovddup xmm6, qword [mids+0x28]
-vpxor xmm6,xmm6,xmm11
-vmovddup xmm9, qword [mids+0x70]
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm4,xmm4,xmm9
-vmovdqa xmm2, xword [xshufb_ror24]	;xmm2 is temp
-vpshufb xmm5,xmm5,xmm2
-vpshufb xmm6,xmm6,xmm2
-vpshufb xmm7,xmm7,xmm2
-vpshufb xmm4,xmm4,xmm2
-vmovddup xmm2, qword [mids+0x78]
-
-vpaddq xmm0,xmm0,xmm5
-vpaddq xmm1,xmm1,xmm6
-vpaddq xmm2,xmm2,xmm7
-vpaddq xmm3,xmm3,xmm4
-vpxor xmm15,xmm15,xmm0
-vpxor xmm12,xmm12,xmm1
-vpxor xmm13,xmm13,xmm2
-vpxor xmm14,xmm14,xmm3
-vmovdqa [rsp], xmm0
-vmovdqa xmm0, xword [xshufb_ror16]
-vpshufb xmm15,xmm15,xmm0
-vpshufb xmm12,xmm12,xmm0
-vpshufb xmm13,xmm13,xmm0
-vpshufb xmm14,xmm14,xmm0
-vpaddq xmm10,xmm10,xmm15
-vpaddq xmm11,xmm11,xmm12
-vpaddq xmm8,xmm8,xmm13
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm6,xmm6,xmm11
-vpxor xmm7,xmm7,xmm8
-vpxor xmm4,xmm4,xmm9
-vpaddq xmm0,xmm5,xmm5
-vpsrlq xmm5,xmm5,63
-vpor xmm5,xmm5,xmm0
-vpaddq xmm0,xmm6,xmm6
-vpsrlq xmm6,xmm6,63
-vpor xmm6,xmm6,xmm0
-vpaddq xmm0,xmm7,xmm7
-vpsrlq xmm7,xmm7,63
-vpor xmm7,xmm7,xmm0
-vpaddq xmm0,xmm4,xmm4
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm0
-vmovdqa xmm0, [rsp]
-
-Blake2bRounds2 2,src
-
-vpxor xmm0, xmm0, xmm8
-vpxor xmm1, xmm1, xmm9
-vpxor xmm2, xmm2, xmm10
-vpxor xmm3, xmm3, xmm11
-vpxor xmm4, xmm4, xmm12
-vpxor xmm5, xmm5, xmm13
-vpxor xmm6, xmm6, xmm14
-;vpxor xmm7, xmm7, xmm15
-vmovddup xmm8, qword [mids+0x80]
-vmovddup xmm9, qword [mids+0x88]
-vmovddup xmm10, qword [mids+0x90]
-vmovddup xmm11, qword [mids+0x98]
-vmovddup xmm12, qword [mids+0xa0]
-vmovddup xmm13, qword [mids+0xa8]
-vmovddup xmm14, qword [mids+0xb0]
-;vmovddup xmm15, qword [mids+0xb8]
-vpxor xmm0, xmm0, xmm8
-vpxor xmm1, xmm1, xmm9
-vpxor xmm2, xmm2, xmm10
-vpxor xmm3, xmm3, xmm11
-vpxor xmm4, xmm4, xmm12
-vpxor xmm5, xmm5, xmm13
-vpxor xmm6, xmm6, xmm14
-;vpxor xmm7, xmm7, xmm15
-}
--- a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx2.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx2.asm
@ -1,350 +0,0 @@
-macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq ymm0,ymm0,ymm4
-vpaddq ymm1,ymm1,ymm5
-vpaddq ymm2,ymm2,ymm6
-vpaddq ymm3,ymm3,ymm7
-if m0<lim
-vpaddq ymm0,ymm0, yword [src+m0*32]
-end if
-if m1<lim
-vpaddq ymm1,ymm1, yword [src+m1*32]
-end if
-if m2<lim
-vpaddq ymm2,ymm2, yword [src+m2*32]
-end if
-if m3<lim
-vpaddq ymm3,ymm3, yword [src+m3*32]
-end if
-vpxor ymm12,ymm12,ymm0
-vpxor ymm13,ymm13,ymm1
-vpxor ymm14,ymm14,ymm2
-vpxor ymm15,ymm15,ymm3
-vpshufd ymm12,ymm12,0xB1
-vpshufd ymm13,ymm13,0xB1
-vpshufd ymm14,ymm14,0xB1
-vpshufd ymm15,ymm15,0xB1
-vpaddq ymm8,ymm8,ymm12
-vpaddq ymm9,ymm9,ymm13
-vpaddq ymm10,ymm10,ymm14
-vpaddq ymm11,ymm11,ymm15
-vpxor ymm4,ymm4,ymm8
-vpxor ymm5,ymm5,ymm9
-vpxor ymm6,ymm6,ymm10
-vpxor ymm7,ymm7,ymm11
-vmovdqa [rsp], ymm8
-vbroadcasti128 ymm8, xword [xshufb_ror24]
-vpshufb ymm4,ymm4,ymm8
-vpshufb ymm5,ymm5,ymm8
-vpshufb ymm6,ymm6,ymm8
-vpshufb ymm7,ymm7,ymm8
-vmovdqa ymm8, [rsp]
-
-vpaddq ymm0,ymm0,ymm4
-vpaddq ymm1,ymm1,ymm5
-vpaddq ymm2,ymm2,ymm6
-vpaddq ymm3,ymm3,ymm7
-if m4<lim
-vpaddq ymm0,ymm0, yword [src+m4*32]
-end if
-if m5<lim
-vpaddq ymm1,ymm1, yword [src+m5*32]
-end if
-if m6<lim
-vpaddq ymm2,ymm2, yword [src+m6*32]
-end if
-if m7<lim
-vpaddq ymm3,ymm3, yword [src+m7*32]
-end if
-vpxor ymm12,ymm12,ymm0
-vpxor ymm13,ymm13,ymm1
-vpxor ymm14,ymm14,ymm2
-vpxor ymm15,ymm15,ymm3
-vmovdqa [rsp], ymm0
-vbroadcasti128 ymm0, xword [xshufb_ror16]
-vpshufb ymm12,ymm12,ymm0
-vpshufb ymm13,ymm13,ymm0
-vpshufb ymm14,ymm14,ymm0
-vpshufb ymm15,ymm15,ymm0
-vpaddq ymm8,ymm8,ymm12
-vpaddq ymm9,ymm9,ymm13
-vpaddq ymm10,ymm10,ymm14
-vpaddq ymm11,ymm11,ymm15
-vpxor ymm4,ymm4,ymm8
-vpxor ymm5,ymm5,ymm9
-vpxor ymm6,ymm6,ymm10
-vpxor ymm7,ymm7,ymm11
-
-vpaddq ymm0,ymm4,ymm4
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm0
-vpaddq ymm0,ymm5,ymm5
-vpsrlq ymm5,ymm5,63
-vpor ymm5,ymm5,ymm0
-vpaddq ymm0,ymm6,ymm6
-vpsrlq ymm6,ymm6,63
-vpor ymm6,ymm6,ymm0
-vpaddq ymm0,ymm7,ymm7
-vpsrlq ymm7,ymm7,63
-vpor ymm7,ymm7,ymm0
-
-vmovdqa ymm0, [rsp]
-}
-
-macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq ymm0,ymm0,ymm5
-vpaddq ymm1,ymm1,ymm6
-vpaddq ymm2,ymm2,ymm7
-vpaddq ymm3,ymm3,ymm4
-if m0<lim
-vpaddq ymm0,ymm0, yword [src+m0*32]
-end if
-if m1<lim
-vpaddq ymm1,ymm1, yword [src+m1*32]
-end if
-if m2<lim
-vpaddq ymm2,ymm2, yword [src+m2*32]
-end if
-if m3<lim
-vpaddq ymm3,ymm3, yword [src+m3*32]
-end if
-vpxor ymm15,ymm15,ymm0
-vpxor ymm12,ymm12,ymm1
-vpxor ymm13,ymm13,ymm2
-vpxor ymm14,ymm14,ymm3
-vpshufd ymm15,ymm15,0xB1
-vpshufd ymm12,ymm12,0xB1
-vpshufd ymm13,ymm13,0xB1
-vpshufd ymm14,ymm14,0xB1
-vpaddq ymm10,ymm10,ymm15
-vpaddq ymm11,ymm11,ymm12
-vpaddq ymm8,ymm8,ymm13
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm6,ymm6,ymm11
-vpxor ymm7,ymm7,ymm8
-vpxor ymm4,ymm4,ymm9
-vmovdqa [rsp], ymm10
-vbroadcasti128 ymm10, xword [xshufb_ror24]
-vpshufb ymm5,ymm5,ymm10
-vpshufb ymm6,ymm6,ymm10
-vpshufb ymm7,ymm7,ymm10
-vpshufb ymm4,ymm4,ymm10
-vmovdqa ymm10, [rsp]
-
-vpaddq ymm0,ymm0,ymm5
-vpaddq ymm1,ymm1,ymm6
-vpaddq ymm2,ymm2,ymm7
-vpaddq ymm3,ymm3,ymm4
-if m4<lim
-vpaddq ymm0,ymm0, yword [src+m4*32]
-end if
-if m5<lim
-vpaddq ymm1,ymm1, yword [src+m5*32]
-end if
-if m6<lim
-vpaddq ymm2,ymm2, yword [src+m6*32]
-end if
-if m7<lim
-vpaddq ymm3,ymm3, yword [src+m7*32]
-end if
-vpxor ymm15,ymm15,ymm0
-vpxor ymm12,ymm12,ymm1
-vpxor ymm13,ymm13,ymm2
-vpxor ymm14,ymm14,ymm3
-vmovdqa [rsp], ymm0
-vbroadcasti128 ymm0, xword [xshufb_ror16]
-vpshufb ymm15,ymm15,ymm0
-vpshufb ymm12,ymm12,ymm0
-vpshufb ymm13,ymm13,ymm0
-vpshufb ymm14,ymm14,ymm0
-vpaddq ymm10,ymm10,ymm15
-vpaddq ymm11,ymm11,ymm12
-vpaddq ymm8,ymm8,ymm13
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm6,ymm6,ymm11
-vpxor ymm7,ymm7,ymm8
-vpxor ymm4,ymm4,ymm9
-
-vpaddq ymm0,ymm5,ymm5
-vpsrlq ymm5,ymm5,63
-vpor ymm5,ymm5,ymm0
-vpaddq ymm0,ymm6,ymm6
-vpsrlq ymm6,ymm6,63
-vpor ymm6,ymm6,ymm0
-vpaddq ymm0,ymm7,ymm7
-vpsrlq ymm7,ymm7,63
-vpor ymm7,ymm7,ymm0
-vpaddq ymm0,ymm4,ymm4
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm0
-
-vmovdqa ymm0, [rsp]
-}
-
-macro Blake2bRounds2 lim,src
-{
-;ROUND 0
-;hR0 0,2,4,6,1,3,5,7,lim,src
-;hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 1
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-
-;ROUND 2
-hR0 11,12,5,15,8,0,2,13,lim,src
-hR1 10,3,7,9,14,6,1,4,lim,src
-
-;ROUND 3
-hR0 7,3,13,11,9,1,12,14,lim,src
-hR1 2,5,4,15,6,10,0,8,lim,src
-
-;ROUND 4
-hR0 9,5,2,10,0,7,4,15,lim,src
-hR1 14,11,6,3,1,12,8,13,lim,src
-
-;ROUND 5
-hR0 2,6,0,8,12,10,11,3,lim,src
-hR1 4,7,15,1,13,5,14,9,lim,src
-
-;ROUND 6
-hR0 12,1,14,4,5,15,13,10,lim,src
-hR1 0,6,9,8,7,3,2,11,lim,src
-
-;ROUND 7
-hR0 13,7,12,3,11,14,1,9,lim,src
-hR1 5,15,8,2,0,4,6,10,lim,src
-
-;ROUND 8
-hR0 6,14,11,0,15,9,3,8,lim,src
-hR1 12,13,1,10,2,7,4,5,lim,src
-
-;ROUND 9
-hR0 10,8,7,1,2,4,6,5,lim,src
-hR1 15,9,3,13,11,14,12,0,lim,src
-
-;ROUND 10
-hR0 0,2,4,6,1,3,5,7,lim,src
-hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 11
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-}
-
-macro Blake2beq2of2 mids, src
-{
-vpbroadcastq ymm0, qword [mids]
-vpaddq ymm0,ymm0, yword [src+1*32]
-vpbroadcastq ymm12, qword [mids+0x08]
-vpxor ymm12,ymm12,ymm0
-vbroadcasti128 ymm2, xword [xshufb_ror16]	;ymm2 is temp
-vpshufb ymm12,ymm12,ymm2
-vpbroadcastq ymm8, qword [mids+0x10]
-vpaddq ymm8,ymm8,ymm12
-vpbroadcastq ymm4, qword [mids+0x18]
-vpxor ymm4,ymm4,ymm8
-vpaddq ymm2,ymm4,ymm4	;ymm2 is temp
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm2
-
-vpbroadcastq ymm5, qword [mids+0x20]
-vpaddq ymm0,ymm0,ymm5
-vpbroadcastq ymm1, qword [mids+0x30]
-vpxor ymm12,ymm12,ymm1
-vpshufd ymm12,ymm12,0xB1
-vpbroadcastq ymm13, qword [mids+0x38]
-vpaddq ymm8,ymm8,ymm13
-vpbroadcastq ymm3, qword [mids+0x60]
-vpaddq ymm3,ymm3,ymm4
-vpbroadcastq ymm15, qword [mids+0x48]
-vpxor ymm15,ymm15,ymm0
-vpshufd ymm15,ymm15,0xB1
-vpbroadcastq ymm11, qword [mids+0x58]
-vpaddq ymm11,ymm11,ymm12
-vpbroadcastq ymm7, qword [mids+0x68]
-vpxor ymm7,ymm7,ymm8
-vpbroadcastq ymm14, qword [mids+0x40]
-vpxor ymm14,ymm14,ymm3
-vpshufd ymm14,ymm14,0xB1
-vpbroadcastq ymm10, qword [mids+0x50]
-vpaddq ymm10,ymm10,ymm15
-vpbroadcastq ymm6, qword [mids+0x28]
-vpxor ymm6,ymm6,ymm11
-vpbroadcastq ymm9, qword [mids+0x70]
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm4,ymm4,ymm9
-vbroadcasti128 ymm2, xword [xshufb_ror24]	;ymm2 is temp
-vpshufb ymm5,ymm5,ymm2
-vpshufb ymm6,ymm6,ymm2
-vpshufb ymm7,ymm7,ymm2
-vpshufb ymm4,ymm4,ymm2
-vpbroadcastq ymm2, qword [mids+0x78]
-
-vpaddq ymm0,ymm0,ymm5
-vpaddq ymm1,ymm1,ymm6
-vpaddq ymm2,ymm2,ymm7
-vpaddq ymm3,ymm3,ymm4
-vpxor ymm15,ymm15,ymm0
-vpxor ymm12,ymm12,ymm1
-vpxor ymm13,ymm13,ymm2
-vpxor ymm14,ymm14,ymm3
-vmovdqa [rsp], ymm0
-vbroadcasti128 ymm0, xword [xshufb_ror16]
-vpshufb ymm15,ymm15,ymm0
-vpshufb ymm12,ymm12,ymm0
-vpshufb ymm13,ymm13,ymm0
-vpshufb ymm14,ymm14,ymm0
-vpaddq ymm10,ymm10,ymm15
-vpaddq ymm11,ymm11,ymm12
-vpaddq ymm8,ymm8,ymm13
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm6,ymm6,ymm11
-vpxor ymm7,ymm7,ymm8
-vpxor ymm4,ymm4,ymm9
-vpaddq ymm0,ymm5,ymm5
-vpsrlq ymm5,ymm5,63
-vpor ymm5,ymm5,ymm0
-vpaddq ymm0,ymm6,ymm6
-vpsrlq ymm6,ymm6,63
-vpor ymm6,ymm6,ymm0
-vpaddq ymm0,ymm7,ymm7
-vpsrlq ymm7,ymm7,63
-vpor ymm7,ymm7,ymm0
-vpaddq ymm0,ymm4,ymm4
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm0
-vmovdqa ymm0, [rsp]
-
-Blake2bRounds2 2,src
-
-vpxor ymm0, ymm0, ymm8
-vpxor ymm1, ymm1, ymm9
-vpxor ymm2, ymm2, ymm10
-vpxor ymm3, ymm3, ymm11
-vpxor ymm4, ymm4, ymm12
-vpxor ymm5, ymm5, ymm13
-vpxor ymm6, ymm6, ymm14
-;vpxor ymm7, ymm7, ymm15
-vpbroadcastq ymm8, qword [mids+0x80]
-vpbroadcastq ymm9, qword [mids+0x88]
-vpbroadcastq ymm10, qword [mids+0x90]
-vpbroadcastq ymm11, qword [mids+0x98]
-vpbroadcastq ymm12, qword [mids+0xa0]
-vpbroadcastq ymm13, qword [mids+0xa8]
-vpbroadcastq ymm14, qword [mids+0xb0]
-;vpbroadcastq ymm15, qword [mids+0xb8]
-vpxor ymm0, ymm0, ymm8
-vpxor ymm1, ymm1, ymm9
-vpxor ymm2, ymm2, ymm10
-vpxor ymm3, ymm3, ymm11
-vpxor ymm4, ymm4, ymm12
-vpxor ymm5, ymm5, ymm13
-vpxor ymm6, ymm6, ymm14
-;vpxor ymm7, ymm7, ymm15
-}
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx1.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx1.asm
@ -1,39 +0,0 @@
-;void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
-;hashout: hash output buffer: 2*64 bytes
-;midstate: 256 bytes from Blake2PrepareMidstate2
-;indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
-
-include "macro_blake2b_avx1.asm"
-
-Blake2Run2:
-mov rax, rsp
-sub rsp, 0x28
-and rsp, -32
-mov [rsp+0x20], rax
-
-mov [rsi+0xd4], edx
-add edx, 1
-mov [rsi+0xdc], edx
-
-Blake2beq2of2 rsi, rsi+0xc0
-
-vpunpcklqdq xmm8, xmm0, xmm1
-vpunpckhqdq xmm1, xmm0, xmm1
-vpunpcklqdq xmm10, xmm2, xmm3
-vpunpckhqdq xmm3, xmm2, xmm3
-vpunpcklqdq xmm12, xmm4, xmm5
-vpunpckhqdq xmm5, xmm4, xmm5
-vpunpcklqdq xmm14, xmm6, xmm7
-vpunpckhqdq xmm7, xmm6, xmm7
-
-vmovdqa [rdi], xmm8
-vmovdqa [rdi+0x10], xmm10
-vmovdqa [rdi+0x20], xmm12
-vmovdqa [rdi+0x30], xmm14
-vmovdqa [rdi+0x40], xmm1
-vmovdqa [rdi+0x50], xmm3
-vmovdqa [rdi+0x60], xmm5
-vmovdqa [rdi+0x70], xmm7
-
-mov rsp, [rsp+0x20]
-ret
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx2.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx2.asm
@ -1,49 +0,0 @@
-;void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
-;hashout: hash output buffer: 4*64 bytes
-;midstate: 256 bytes from Blake2PrepareMidstate4
-;indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
-
-include "macro_blake2b_avx2.asm"
-
-Blake2Run4:
-mov rax, rsp
-sub rsp, 0x28
-and rsp, -32
-mov [rsp+0x20], rax
-
-vmovd xmm0, edx		;indexctr
-vpbroadcastd ymm0, xmm0
-vpaddd ymm0, ymm0, yword [yctrinit]
-vpblendd ymm0, ymm0, yword [rsi+0xe0], 0x55
-vmovdqa yword [rsi+0xe0], ymm0
-
-Blake2beq2of2 rsi, rsi+0xc0
-
-vpunpcklqdq ymm8, ymm0, ymm1
-vpunpckhqdq ymm9, ymm0, ymm1
-vpunpcklqdq ymm10, ymm2, ymm3
-vpunpckhqdq ymm11, ymm2, ymm3
-vpunpcklqdq ymm12, ymm4, ymm5
-vpunpckhqdq ymm13, ymm4, ymm5
-vpunpcklqdq ymm14, ymm6, ymm7
-vpunpckhqdq ymm15, ymm6, ymm7
-vperm2i128 ymm0, ymm8, ymm10, 0x20
-vperm2i128 ymm1, ymm12, ymm14, 0x20
-vperm2i128 ymm2, ymm9, ymm11, 0x20
-vperm2i128 ymm3, ymm13, ymm15, 0x20
-vperm2i128 ymm4, ymm8, ymm10, 0x31
-vperm2i128 ymm5, ymm12, ymm14, 0x31
-vperm2i128 ymm6, ymm9, ymm11, 0x31
-vperm2i128 ymm7, ymm13, ymm15, 0x31
-
-vmovdqa [rdi], ymm0
-vmovdqa [rdi+0x20], ymm1
-vmovdqa [rdi+0x40], ymm2
-vmovdqa [rdi+0x60], ymm3
-vmovdqa [rdi+0x80], ymm4
-vmovdqa [rdi+0xa0], ymm5
-vmovdqa [rdi+0xc0], ymm6
-vmovdqa [rdi+0xe0], ymm7
-
-mov rsp, [rsp+0x20]
-ret
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx1.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx1.asm
@ -1,212 +0,0 @@
-;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
-;midstate: 256 bytes of buffer for output midstate, aligned by 32
-;input: 140 bytes header, preferably aligned by 8
-
-Blake2PrepareMidstate2:
-sub rsp, 0x188
-
-vmovdqa xmm10, xword [xshufb_ror24]
-vmovdqa xmm11, xword [xshufb_ror16]
-
-vmovdqa xmm0, xword [s0]
-vmovdqa xmm1, xword [s2]
-vmovdqa xmm2, xword [s4]
-vmovdqa xmm3, xword [s6]
-vmovdqa xmm4, xword [iv]
-vmovdqa xmm5, xword [iv+0x10]
-vmovdqa xmm6, xword [iv4xor128]
-vmovdqa xmm7, xword [iv4xor128+0x10]
-
-mov r8, rsp
-lea r9, [blake2sigma]
-lea r11, [blake2sigma+160]
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-_LoopEhPrepare1:
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r9, 16
-cmp r9, r11
-jb _LoopEhPrepare1
-mov r8, rsp
-call _ProcBlakeRound
-add r8, 0x80
-call _ProcBlakeRound
-
-vpxor xmm0, xmm0, xmm4
-vpxor xmm1, xmm1, xmm5
-vpxor xmm2, xmm2, xmm6
-vpxor xmm3, xmm3, xmm7
-vpxor xmm0, xmm0, xword [s0]
-vpxor xmm1, xmm1, xword [s2]
-vpxor xmm2, xmm2, xword [s4]
-vpxor xmm3, xmm3, xword [s6]
-vmovdqa xword [rdi+0x80], xmm0
-vmovdqa xword [rdi+0x90], xmm1
-vmovdqa xword [rdi+0xa0], xmm2
-vmovdqa xword [rdi+0xb0], xmm3
-vmovq xmm8, [rsi+0x80]
-vpshufd xmm4, xmm8, 0x44
-vmovdqa xword [rdi+0xc0], xmm4
-vmovd xmm4, [rsi+0x88]
-vpshufd xmm4, xmm4, 0x44
-vmovdqa xword [rdi+0xd0], xmm4
-
-;Begin second message block
-vmovdqa xmm4, xword [iv]
-vmovdqa xmm5, xword [iv+0x10]
-vmovdqa xmm6, xword [iv4xor144]
-vmovdqa xmm7, xword [iv6inverted]
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, xmm8		;xmm8[63:0]=message
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm6, xmm6, 0xb1
-	vmovq [rdi+0x08], xmm6	;v12
-vpshufd xmm7, xmm7, 0xb1
-vpaddq xmm4, xmm4, xmm6
-	vmovq [rdi+0x10], xmm4	;v8
-vpaddq xmm5, xmm5, xmm7
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpshufb xmm2, xmm2, xmm10
-	vmovq [rdi+0x18], xmm2	;v4
-vpshufb xmm3, xmm3, xmm10
-
-vpaddq xmm0, xmm0, xmm2
-	vmovq [rdi], xmm0	;v0
-vpaddq xmm1, xmm1, xmm3
-	vpextrq [rdi+0x60], xmm1, 1	;v3
-;add message (nonce, index) to xmm0 here, but we don't have
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufb xmm6, xmm6, xmm11
-vpshufb xmm7, xmm7, xmm11
-	vmovdqa xword [rdi+0x40], xmm7	;v14,15
-vpaddq xmm4, xmm4, xmm6
-	vpextrq [rdi+0x70], xmm4, 1	;v9
-vpaddq xmm5, xmm5, xmm7
-	vmovdqa xword [rdi+0x50], xmm5	;v10,11
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpaddq xmm8, xmm2, xmm2
-vpsrlq xmm2, xmm2, 63
-vpor xmm8, xmm2, xmm8		;xmm8 takes xmm2
-vpaddq xmm2, xmm3, xmm3		;xmm2 is temp
-vpsrlq xmm3, xmm3, 63
-vpor xmm3, xmm3, xmm2
-
-vpalignr xmm2, xmm3, xmm8, 8	;xmm2 resume
-	vmovdqa xword [rdi+0x20], xmm2	;v5,6
-vpsrldq xmm3, xmm3, 8
-	vmovq [rdi+0x68], xmm3		;v7
-vpsrldq xmm7, xmm6, 8
-vpaddq xmm0, xmm0, xmm2
-	vpextrq [rdi+0x30], xmm0, 1	;v1
-vpaddq xmm1, xmm1, xmm3
-	vmovq [rdi+0x78], xmm1		;v2
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm7, xmm7, 0xb1
-	vmovq [rdi+0x38], xmm7		;v13
-
-add rsp, 0x188
-ret
-
-align 16
-_ProcBlakeMsgSched:
-;rsi=src
-;r8=dst
-;r9=sigma table
-xor r10d, r10d
-_LoopBlakeMsgSched:
-movzx eax, byte [r9+r10]
-mov rax, [rsi+rax*8]
-mov [r8+r10*8], rax
-add r10d, 1
-cmp r10d, 16
-jb _LoopBlakeMsgSched
-ret
-
-align 16
-_ProcBlakeRound:
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8]
-vpaddq xmm1, xmm1, [r8+0x10]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm6, xmm6, 0xb1
-vpshufd xmm7, xmm7, 0xb1
-vpaddq xmm4, xmm4, xmm6
-vpaddq xmm5, xmm5, xmm7
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpshufb xmm2, xmm2, xmm10
-vpshufb xmm3, xmm3, xmm10
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8+0x20]
-vpaddq xmm1, xmm1, [r8+0x30]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufb xmm9, xmm6, xmm11	;xmm9 takes xmm6
-vpshufb xmm7, xmm7, xmm11
-vpaddq xmm4, xmm4, xmm9
-vpaddq xmm5, xmm5, xmm7
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpaddq xmm8, xmm2, xmm2
-vpsrlq xmm2, xmm2, 63
-vpor xmm8, xmm2, xmm8		;xmm8 takes xmm2
-vpaddq xmm2, xmm3, xmm3		;xmm2 is temp
-vpsrlq xmm3, xmm3, 63
-vpor xmm3, xmm3, xmm2
-
-vpalignr xmm2, xmm3, xmm8, 8	;xmm2 resume
-vpalignr xmm3, xmm8, xmm3, 8
-vpalignr xmm6, xmm9, xmm7, 8	;xmm6 resume
-vpalignr xmm7, xmm7, xmm9, 8
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8+0x40]
-vpaddq xmm1, xmm1, [r8+0x50]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm6, xmm6, 0xb1
-vpshufd xmm7, xmm7, 0xb1
-vpaddq xmm5, xmm5, xmm6
-vpaddq xmm4, xmm4, xmm7
-vpxor xmm2, xmm2, xmm5
-vpxor xmm3, xmm3, xmm4
-vpshufb xmm2, xmm2, xmm10
-vpshufb xmm3, xmm3, xmm10
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8+0x60]
-vpaddq xmm1, xmm1, [r8+0x70]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufb xmm9, xmm6, xmm11	;xmm9 takes xmm6
-vpshufb xmm7, xmm7, xmm11
-vpaddq xmm5, xmm5, xmm9
-vpaddq xmm4, xmm4, xmm7
-vpxor xmm2, xmm2, xmm5
-vpxor xmm3, xmm3, xmm4
-vpaddq xmm8, xmm2, xmm2
-vpsrlq xmm2, xmm2, 63
-vpor xmm8, xmm2, xmm8		;xmm8 takes xmm2
-vpaddq xmm2, xmm3, xmm3		;xmm2 is temp
-vpsrlq xmm3, xmm3, 63
-vpor xmm3, xmm3, xmm2
-vpalignr xmm2, xmm8, xmm3, 8	;xmm2 resume
-vpalignr xmm3, xmm3, xmm8, 8
-vpalignr xmm6, xmm7, xmm9, 8	;xmm6 resume
-vpalignr xmm7, xmm9, xmm7, 8
-ret
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx2.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx2.asm
@ -1,166 +0,0 @@
-;void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
-;midstate: 256 bytes of buffer for output midstate, aligned by 32
-;input: 140 bytes header, preferably aligned by 8
-
-Blake2PrepareMidstate4:
-sub rsp, 0x188
-vbroadcasti128 ymm6, xword [xshufb_ror24]
-vbroadcasti128 ymm7, xword [xshufb_ror16]
-
-vmovdqa ymm0, yword [s0]
-vmovdqa ymm1, yword [s4]
-vmovdqa ymm2, yword [iv]
-vmovdqa ymm3, yword [iv4xor128]
-
-mov r8, rsp
-lea r9, [blake2sigma]
-lea r11, [blake2sigma+160]
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-_LoopEhPrepare1:
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r9, 16
-cmp r9, r11
-jb _LoopEhPrepare1
-mov r8, rsp
-call _ProcBlakeRound
-add r8, 0x80
-call _ProcBlakeRound
-
-vpxor ymm0, ymm0, ymm2
-vpxor ymm1, ymm1, ymm3
-vpxor ymm0, ymm0, yword [s0]
-vpxor ymm1, ymm1, yword [s4]
-vmovdqa yword [rdi+0x80], ymm0
-vmovdqa yword [rdi+0xa0], ymm1
-vmovq xmm5, [rsi+0x80]
-vpbroadcastq ymm4, xmm5
-vmovdqa yword [rdi+0xc0], ymm4
-vmovd xmm4, [rsi+0x88]
-vpbroadcastq ymm4, xmm4
-vmovdqa yword [rdi+0xe0], ymm4
-
-;Begin second message block
-vmovdqa ymm2, yword [iv]
-vmovdqa ymm3, yword [iv4xor144]	;also loads iv6inverted
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, ymm5		;ymm5[63:0]=message
-vpxor ymm3, ymm3, ymm0
-vpshufd ymm3, ymm3, 0xb1
-	vmovq [rdi+0x08], xmm3	;v12
-vpaddq ymm2, ymm2, ymm3
-	vmovq [rdi+0x10], xmm2	;v8
-vpxor ymm1, ymm1, ymm2
-vpshufb ymm1, ymm1, ymm6
-	vmovq [rdi+0x18], xmm1	;v4
-
-vpaddq ymm0, ymm0, ymm1
-	vmovq [rdi], xmm0	;v0, v3 ready
-;add message (nonce, index) to xmm0 here, but we don't have
-vpxor ymm3, ymm3, ymm0
-vpshufb ymm3, ymm3, ymm7
-vextracti128 xmm4, ymm3, 1
-	vmovdqa xword [rdi+0x40], xmm4	;v14,15
-vpaddq ymm2, ymm2, ymm3
-	vpextrq [rdi+0x70], xmm2, 1	;v9
-vextracti128 xmm5, ymm2, 1
-	vmovdqa xword [rdi+0x50], xmm5	;v10,11
-vpxor ymm1, ymm1, ymm2
-vpaddq ymm4, ymm1, ymm1
-vpsrlq ymm1, ymm1, 63
-vpor ymm1, ymm1, ymm4
-;Valid:
-;    v1  v2  v3
-;    v5  v6  v7
-;    v9  v10 v11
-;    v13 v14 v15
-;
-;v1 v2 <- v6 v7
-;v13 <- v2
-
-vpermq ymm1, ymm1, 0x39
-	vmovdqa xword [rdi+0x20], xmm1	;v5,6
-
-vextracti128 xmm4, ymm0, 1
-vextracti128 xmm5, ymm1, 1
-	vpextrq [rdi+0x60], xmm4, 1	;v3
-	vmovq [rdi+0x68], xmm5		;v7
-
-vpsrldq xmm3, xmm3, 8
-vpaddq xmm0, xmm0, xmm1
-	vpextrq [rdi+0x30], xmm0, 1	;v1
-vpaddq xmm4, xmm4, xmm5
-	vmovq [rdi+0x78], xmm4		;v2
-vpxor xmm3, xmm3, xmm4
-vpshufd xmm3, xmm3, 0xb1
-	vmovq [rdi+0x38], xmm3		;v13
-
-add rsp, 0x188
-ret
-
-align 16
-_ProcBlakeMsgSched:
-;rsi=src
-;r8=dst
-;r9=sigma table
-xor r10d, r10d
-_LoopBlakeMsgSched:
-movzx eax, byte [r9+r10]
-mov rax, [rsi+rax*8]
-mov [r8+r10*8], rax
-add r10d, 1
-cmp r10d, 16
-jb _LoopBlakeMsgSched
-ret
-
-align 16
-_ProcBlakeRound:
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8]
-vpxor ymm3, ymm3, ymm0
-vpshufd ymm3, ymm3, 0xb1
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpshufb ymm1, ymm1, ymm6	;ror24
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8+0x20]
-vpxor ymm3, ymm3, ymm0
-vpshufb ymm3, ymm3, ymm7	;ror16
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpaddq ymm4, ymm1, ymm1
-vpsrlq ymm1, ymm1, 63
-vpor ymm1, ymm1, ymm4
-
-vpermq ymm1, ymm1, 0x39
-vpermq ymm2, ymm2, 0x4e
-vpermq ymm3, ymm3, 0x93
-
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8+0x40]
-vpxor ymm3, ymm3, ymm0
-vpshufd ymm3, ymm3, 0xb1
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpshufb ymm1, ymm1, ymm6	;ror24
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8+0x60]
-vpxor ymm3, ymm3, ymm0
-vpshufb ymm3, ymm3, ymm7	;ror16
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpaddq ymm4, ymm1, ymm1
-vpsrlq ymm1, ymm1, 63
-vpor ymm1, ymm1, ymm4
-
-vpermq ymm1, ymm1, 0x93
-vpermq ymm2, ymm2, 0x4e
-vpermq ymm3, ymm3, 0x39
-ret
--- a/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx1.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx1.asm
@ -1,11 +0,0 @@
-format elf64
-public Blake2PrepareMidstate2
-public Blake2Run2
-
-section '.text' executable align 64
-include "proc_prepmidstate_avx1.asm"
-align 16
-include "proc_blake2_avx1.asm"
-
-section '.data' writeable align 64
-include "data_blake2b.asm"
--- a/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx2.asm
+++ b/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx2.asm
@ -1,11 +0,0 @@
-format elf64
-public Blake2PrepareMidstate4
-public Blake2Run4
-
-section '.text' executable align 64
-include "proc_prepmidstate_avx2.asm"
-align 16
-include "proc_blake2_avx2.asm"
-
-section '.data' writeable align 64
-include "data_blake2b.asm"
--- a/cpu_xenoncat/Linux/blake2b/example_avx1.c
+++ b/cpu_xenoncat/Linux/blake2b/example_avx1.c
@ -1,51 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
-//midstate: 256 bytes of buffer for output midstate, aligned by 32
-//input: 140 bytes header, preferably aligned by 8
-
-void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
-//hashout: hash output buffer: 2*64 bytes
-//midstate: 256 bytes from Blake2PrepareMidstate2
-//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
-
-unsigned char __attribute__((aligned(8))) testdata[140] =
-{
-    0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06, 
-    0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C, 
-    0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09, 
-    0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7, 
-    0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1, 
-    0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF, 
-    0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
-};
-//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d  out.bin
-
-int main(void)
-{
-	unsigned char midstate_a[256+32];
-	void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
-	unsigned char hashout_a[128+32];
-	unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
-	unsigned char buf[128];
-	FILE *outfile;
-	int i;
-
-	Blake2PrepareMidstate2(pmidstate, testdata);
-	outfile = fopen("out.bin", "wb");
-
-	for (i=0; i<1048576; i+=2) {
-		Blake2Run2(phashout, pmidstate, i);
-		memcpy(buf, phashout, 50);
-		memcpy(buf+50, phashout+64, 50);
-		fwrite(buf, 100, 1, outfile);	
-	}
-
-	fclose(outfile);
-
-	return 0;
-}
--- a/cpu_xenoncat/Linux/blake2b/example_avx2.c
+++ b/cpu_xenoncat/Linux/blake2b/example_avx2.c
@ -1,53 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
-//midstate: 256 bytes of buffer for output midstate, aligned by 32
-//input: 140 bytes header, preferably aligned by 8
-
-void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
-//hashout: hash output buffer: 4*64 bytes
-//midstate: 256 bytes from Blake2PrepareMidstate4
-//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
-
-unsigned char __attribute__((aligned(8))) testdata[140] =
-{
-    0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06, 
-    0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C, 
-    0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09, 
-    0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7, 
-    0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1, 
-    0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF, 
-    0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
-};
-//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d  out.bin
-
-int main(void)
-{
-	unsigned char midstate_a[256+32];
-	void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
-	unsigned char hashout_a[256+32];
-	unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
-	unsigned char buf[256];
-	FILE *outfile;
-	int i;
-
-	Blake2PrepareMidstate4(pmidstate, testdata);
-	outfile = fopen("out.bin", "wb");
-
-	for (i=0; i<1048576; i+=4) {
-		Blake2Run4(phashout, pmidstate, i);
-		memcpy(buf, phashout, 50);
-		memcpy(buf+50, phashout+64, 50);
-		memcpy(buf+100, phashout+128, 50);
-		memcpy(buf+150, phashout+192, 50);
-		fwrite(buf, 200, 1, outfile);	
-	}
-
-	fclose(outfile);
-
-	return 0;
-}
--- a/cpu_xenoncat/Linux/demo/input.bin
+++ b/cpu_xenoncat/Linux/demo/input.bin
--- a/cpu_xenoncat/Linux/demo/quickbench.c
+++ b/cpu_xenoncat/Linux/demo/quickbench.c
@ -1,78 +0,0 @@
-//compile with
-//gcc -o quickbench quickbench.c equihash_avx2.o
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <time.h>
-
-#define CONTEXT_SIZE 178033152
-#define ITERATIONS 10
-
-//Linkage with assembly
-//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
-//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
-void EhPrepare(void *context, void *input);
-int32_t EhSolver(void *context, uint32_t nonce);
-extern char testinput[];
-
-int main(void)
-{
-	void *context_alloc, *context, *context_end;
-	uint32_t *pu32;
-	uint64_t *pu64, previous_rdtsc;
-	uint8_t inputheader[144];	//140 byte header
-	FILE *infile, *outfile;
-	struct timespec time0, time1;
-	long t0, t1;
-	int32_t numsolutions, total_solutions;
-	uint32_t nonce, delta_time, total_time;
-	int i, j;
-
-	context_alloc = malloc(CONTEXT_SIZE+4096);
-	context = (void*) (((long) context_alloc+4095) & -4096);
-	context_end = context + CONTEXT_SIZE;
-
-	infile = 0;
-	infile = fopen("input.bin", "rb");
-	if (infile) {
-		puts("Reading input.bin");
-		fread(inputheader, 140, 1, infile);
-		fclose(infile);
-	} else {
-		puts("input.bin not found, use sample data (beta1 testnet block 2)");
-		memcpy(inputheader, testinput, 140);
-	}
-
-
-	EhPrepare(context, (void *) inputheader);
-
-	//Warm up, timing not taken into average
-	nonce = 0;
-	clock_gettime(CLOCK_MONOTONIC, &time0);
-	numsolutions = EhSolver(context, nonce);
-	clock_gettime(CLOCK_MONOTONIC, &time1);
-	delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
-			- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
-	printf("(Warm up) Time: %u ms, solutions: %u\n", delta_time, numsolutions);
-
-	printf("Running %d iterations...\n", ITERATIONS);
-	nonce = 58;	//arbritary number to get 19 solutions in 10 iterations (to match 1.88 solutions per run)
-	total_time = total_solutions = 0;
-	for (i=0; i<ITERATIONS; i++) {
-		clock_gettime(CLOCK_MONOTONIC, &time0);
-		numsolutions = EhSolver(context, nonce);
-		clock_gettime(CLOCK_MONOTONIC, &time1);
-		nonce++;
-		delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
-				- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
-		total_time += delta_time;
-		total_solutions += numsolutions;
-		printf("Time: %u ms, solutions: %u\n", delta_time, numsolutions);
-	}
-
-	printf("Average time: %d ms; %.3f Sol/s\n", total_time/ITERATIONS, (double) 1000.0*total_solutions/total_time);
-
-	free(context_alloc);
-	return 0;
-}
--- a/cpu_xenoncat/Linux/demo/solver.c
+++ b/cpu_xenoncat/Linux/demo/solver.c
@ -1,128 +0,0 @@
-//compile with
-//gcc -o solver solver.c equihash_avx2.o
-//
-//./solver
-//sha256sum out2.bin
-//Expected result with default input.bin (beta1 testnet block 2),
-//257d3c3250c14978614ac169edcf72bd131a2e4c227c8d7e21a2cd6131a13dda  out2.bin
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-#include <time.h>
-#include <x86intrin.h>	//for rdtsc
-
-#define CONTEXT_SIZE 178033152
-
-//Linkage with assembly
-//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
-//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
-void EhPrepare(void *context, void *input);
-int32_t EhSolver(void *context, uint32_t nonce);
-extern char testinput[];
-
-//context is the memory used for Equihash computation. It should be allocated outside of SolverFunction, the size is defined by CONTEXT_SIZE, about 180MB.
-//SolverFunction API has slight overhead in mining due to missing opportunity to run EhSolver multiple times after a single EhPrepare.
-int SolverFunction(void* context, const unsigned char* input,
-	bool (*validBlock)(void*, const unsigned char*),
-	void* validBlockData,
-	bool (*cancelled)(void*),
-	void* cancelledData,
-	int numThreads,
-	int n, int k)
-{
-	int numsolutions, i;
-
-	EhPrepare(context, (void *) input);
-	numsolutions = EhSolver(context, *(uint32_t *)(input+136));
-
-	for (i=0; i<numsolutions; i++) {
-		validBlock(validBlockData, (unsigned char*)(context+1344*i));
-	}
-	return numsolutions;
-}
-
-bool validBlock(void *validBlockData, const unsigned char *solution)
-{
-	return 0;
-}
-
-bool cancelled(void *cancelledData)
-{
-	return 0;
-}
-
-int main(void)
-{
-	void *context_alloc, *context, *context_end;
-	uint32_t *pu32;
-	uint64_t *pu64, previous_rdtsc;
-	uint8_t inputheader[144];	//140 byte header
-	FILE *infile, *outfile;
-	struct timespec time0, time1;
-	uint64_t rdtsc0, rdtsc1;
-	long t0, t1;
-	int32_t numsolutions;
-	int i, j;
-	char outfilename[32];
-
-	context_alloc = malloc(CONTEXT_SIZE+4096);
-	context = (void*) (((long) context_alloc+4095) & -4096);
-	context_end = context + CONTEXT_SIZE;
-
-	//Init page tables. This is not necessary, but useful to get a more consistent single-run timing.
-	for (pu32=context; (void*) pu32<context_end; pu32+=1024)
-		*pu32 = 0;
-
-	infile = 0;
-	infile = fopen("input.bin", "rb");
-	if (infile) {
-		puts("Reading input.bin");
-		fread(inputheader, 140, 1, infile);
-		fclose(infile);
-	} else {
-		puts("input.bin not found, use sample data (beta1 testnet block 2)");
-		memcpy(inputheader, testinput, 140);
-	}
-
-	puts("Running solver...");
-	clock_gettime(CLOCK_MONOTONIC, &time0);
-	rdtsc0 = __rdtsc();
-	numsolutions = SolverFunction(context, inputheader, validBlock, 0, cancelled, 0, 1, 200, 9);
-	//EhPrepare(context, (void *) inputheader);
-	//numsolutions = EhSolver(context, *(uint32_t *)(inputheader+136));
-	clock_gettime(CLOCK_MONOTONIC, &time1);
-	rdtsc1 = __rdtsc();
-
-	//Print some debug information
-	pu64 = (uint64_t *) (context + 102408);	//Read the debug area for statistics
-	printf("BLAKE2b rdtsc: %lu\n", pu64[1]-pu64[0]);
-	previous_rdtsc = pu64[1];
-	for (i=1, j=2; i<=9; i++, j+=2) {
-		printf("Stage %u, Output pairs %u, rdtsc: %lu\n", i, (uint32_t) pu64[j+1], pu64[j]-previous_rdtsc);
-		previous_rdtsc = pu64[j];
-	}
-	printf("Number of solutions before duplicate removal: %u\n", *(uint32_t *) (context+16384));
-	printf("Duplicate removal and tree expand rdtsc: %lu\n", pu64[j]-previous_rdtsc);
-
-	printf("Number of solutions: %d\n", numsolutions);
-
-	j = numsolutions < 4 ? numsolutions : 4;
-	for (i=0; i<j; i++) {
-		sprintf(outfilename, "out%d.bin", i);
-		outfile = fopen(outfilename, "wb");
-		fwrite(context+1344*i, 1344, 1, outfile);
-		fclose(outfile);
-	}
-
-	t0 = time0.tv_sec * 1000000000 + time0.tv_nsec;
-	t1 = time1.tv_sec * 1000000000 + time1.tv_nsec;
-	printf("Time: %ld ms\n", (t1-t0)/1000000);
-	t0 = (t1-t0)/1000;
-	printf("Measure rdtsc frequency = %.3f MHz\n", (double) (rdtsc1-rdtsc0)/t0);
-
-	free(context_alloc);
-	return 0;
-}
--- a/cpu_xenoncat/asm_linux/assemble.sh
+++ b/cpu_xenoncat/asm_linux/assemble.sh
--- a/cpu_xenoncat/asm_linux/data_blake2b.asm
+++ b/cpu_xenoncat/asm_linux/data_blake2b.asm
--- a/cpu_xenoncat/asm_linux/equihash_avx1.asm
+++ b/cpu_xenoncat/asm_linux/equihash_avx1.asm
@ -1,7 +1,6 @@
 format elf64
 public EhPrepare as 'EhPrepareAVX1'
 public EhSolver as 'EhSolverAVX1'
-public testinput as 'testinputAVX1'

 include "struct.inc"
 include "params.inc"
@ -14,4 +13,3 @@ include "proc_ehsolver_avx1.asm"

 section '.data' writeable align 64
 include "data_blake2b.asm"
-testinput file "t2.bin"
--- a/cpu_xenoncat/asm_linux/equihash_avx2.asm
+++ b/cpu_xenoncat/asm_linux/equihash_avx2.asm
@ -1,7 +1,6 @@
 format elf64
 public EhPrepare as 'EhPrepareAVX2'
 public EhSolver as 'EhSolverAVX2'
-public testinput as 'testinputAVX2'

 include "struct.inc"
 include "params.inc"
@ -14,4 +13,3 @@ include "proc_ehsolver_avx2.asm"

 section '.data' writeable align 64
 include "data_blake2b.asm"
-testinput file "t2.bin"
--- a/cpu_xenoncat/asm_linux/fasm
+++ b/cpu_xenoncat/asm_linux/fasm
--- a/cpu_xenoncat/asm_linux/macro_blake2b_avx1.asm
+++ b/cpu_xenoncat/asm_linux/macro_blake2b_avx1.asm
--- a/cpu_xenoncat/asm_linux/macro_blake2b_avx2.asm
+++ b/cpu_xenoncat/asm_linux/macro_blake2b_avx2.asm
--- a/cpu_xenoncat/asm_linux/macro_eh.asm
+++ b/cpu_xenoncat/asm_linux/macro_eh.asm
--- a/cpu_xenoncat/asm_linux/params.inc
+++ b/cpu_xenoncat/asm_linux/params.inc
--- a/cpu_xenoncat/asm_linux/proc_ehprepare_avx1.asm
+++ b/cpu_xenoncat/asm_linux/proc_ehprepare_avx1.asm
--- a/cpu_xenoncat/asm_linux/proc_ehprepare_avx2.asm
+++ b/cpu_xenoncat/asm_linux/proc_ehprepare_avx2.asm
--- a/cpu_xenoncat/asm_linux/proc_ehsolver_avx1.asm
+++ b/cpu_xenoncat/asm_linux/proc_ehsolver_avx1.asm
--- a/cpu_xenoncat/asm_linux/proc_ehsolver_avx2.asm
+++ b/cpu_xenoncat/asm_linux/proc_ehsolver_avx2.asm
--- a/cpu_xenoncat/asm_linux/struct.inc
+++ b/cpu_xenoncat/asm_linux/struct.inc
--- a/cpu_xenoncat/asm_linux/struct_eh.inc
+++ b/cpu_xenoncat/asm_linux/struct_eh.inc
--- a/cuda_djezo/CMakeLists.txt
+++ b/cuda_djezo/CMakeLists.txt
@ -0,0 +1,43 @@
+set(EXECUTABLE cuda_djezo)
+
+option(ENABLE_CUDA "Enable the cuda build" ON)
+
+# depending on gcc version
+# ;-std=c++11 => Ubuntu 14.04 check gcc versions
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
+
+file(GLOB SRC_LIST
+    cuda_djezo.cpp
+    equi_miner.cu )
+file(GLOB HEADERS
+    cuda_djezo.hpp
+    eqcuda.hpp
+    )
+
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-D_FORCE_INLINES;--disable-warnings;--ptxas-options=-v;-Xptxas=-dlcm=ca;-Xptxas=-dscm=cs; -O3)
+
+FIND_PACKAGE(CUDA REQUIRED)
+if(COMPUTE AND (COMPUTE GREATER 0))
+        LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
+else(COMPUTE AND (COMPUTE GREATER 0))
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_60,code=sm_60 )
+endif(COMPUTE AND (COMPUTE GREATER 0))
+
+if(CUDA_FOUND)
+message("CUDA FOUND")
+else()
+message("CUDA NOT FOUND")
+endif()
+
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} cuda)
+
+message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
--- a/cuda_djezo/blake2b.cu
+++ b/cuda_djezo/blake2b.cu
@ -0,0 +1,336 @@
+// Blake2-B CUDA Implementation
+// tpruvot@github July 2016
+// permission granted to use under MIT license
+// modified for use in Zcash by John Tromp September 2016
+
+/**
+ * uint2 direct ops by c++ operator definitions
+ */
+static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
+  return make_uint2(a.x ^ b.x, a.y ^ b.y);
+}
+static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { 
+	return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); 
+}
+// uint2 ROR/ROL methods
+__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
+  uint2 result;
+#if __CUDA_ARCH__ > 300
+/*  if (offset < 32) {
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+  } else *//* if (offset < 64) */ {
+          /* offset SHOULD BE < 64 ! */
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+  }
+#else
+  if (!offset)
+          result = a;
+  else if (offset < 32) {
+          result.y = ((a.y >> offset) | (a.x << (32 - offset)));
+          result.x = ((a.x >> offset) | (a.y << (32 - offset)));
+  } else if (offset == 32) {
+          result.y = a.x;
+          result.x = a.y;
+  } else {
+          result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
+          result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
+  }
+#endif
+  return result;
+}
+__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
+  return make_uint2(value.y, value.x);
+}
+#ifdef __CUDA_ARCH__
+__device__ __inline__ uint2 ROR24(const uint2 a) {
+  uint2 result;
+  result.x = __byte_perm(a.y, a.x, 0x2107);
+  result.y = __byte_perm(a.y, a.x, 0x6543);
+  return result;
+}
+__device__ __inline__ uint2 ROR16(const uint2 a) {
+  uint2 result;
+  result.x = __byte_perm(a.y, a.x, 0x1076);
+  result.y = __byte_perm(a.y, a.x, 0x5432);
+  return result;
+}
+#else
+#define ROR24(u) ROR2(u,24)
+#define ROR16(u) ROR2(u,16)
+#endif
+
+typedef uint64_t u64;
+
+static __constant__ const int8_t blake2b_sigma[12][16] = {
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
+  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } ,
+  { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } ,
+  { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } ,
+  { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } ,
+  { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } ,
+  { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } ,
+  { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } ,
+  { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } ,
+  { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } ,
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
+  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  }
+};
+
+__device__ __constant__
+static const u64 blake_iv[] = 
+{
+	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+	0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+};
+
+__device__ __forceinline__
+static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
+  a = a + b + m[ blake2b_sigma[r][2*i] ];
+  ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+  c = c + d;
+  ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
+  a = a + b + m[ blake2b_sigma[r][2*i+1] ];
+  ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+  c = c + d;
+  ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
+}
+
+//__device__ __forceinline__
+//static void G2(u64 &a, u64 &b, u64 &c, u64 &d, u64 x, u64 y) {
+//	a = a + b + x;
+//	((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
+//	c = c + d;
+//	((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
+//	a = a + b + y;
+//	((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
+//	c = c + d;
+//	((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
+//}
+
+__device__ __forceinline__
+static void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) {
+	a = a + b + x;
+	((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
+	c = c + d;
+	((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
+	a = a + b + y;
+	((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
+	c = c + d;
+	((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
+}
+
+#define ROUND(r) \
+  G(r, 0, v[0], v[4], v[ 8], v[12], m); \
+  G(r, 1, v[1], v[5], v[ 9], v[13], m); \
+  G(r, 2, v[2], v[6], v[10], v[14], m); \
+  G(r, 3, v[3], v[7], v[11], v[15], m); \
+  G(r, 4, v[0], v[5], v[10], v[15], m); \
+  G(r, 5, v[1], v[6], v[11], v[12], m); \
+  G(r, 6, v[2], v[7], v[ 8], v[13], m); \
+  G(r, 7, v[3], v[4], v[ 9], v[14], m);
+
+
+__forceinline__ __device__ void blake2b_gpu_hash3(uint64_t* h, u32 idx, u32 nonce) {
+	u64 m = (u64)idx << 32 | (u64)nonce;
+
+	u64 v[16];
+
+	v[0] = h[0];
+	v[1] = h[1];
+	v[2] = h[2];
+	v[3] = h[3];
+	v[4] = h[4];
+	v[5] = h[5];
+	v[6] = h[6];
+	v[7] = h[7];
+	v[8] = blake_iv[0];
+	v[9] = blake_iv[1];
+	v[10] = blake_iv[2];
+	v[11] = blake_iv[3];
+	v[12] = blake_iv[4] ^ (128 + 16);
+	v[13] = blake_iv[5];
+	v[14] = blake_iv[6] ^ 0xffffffffffffffff;
+	v[15] = blake_iv[7];
+
+	// mix 1
+	G2(v[0], v[4], v[8], v[12], 0, m);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 2
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 3
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, m);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+	
+	// mix 4
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, m);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 5
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, m);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 6
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], m, 0);
+
+	// mix 7
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], m, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 8
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, m);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 9
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], m, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 10
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], m, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 11
+	G2(v[0], v[4], v[8], v[12], 0, m);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 12
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	h[0] ^= v[0] ^ v[8];
+	h[1] ^= v[1] ^ v[9];
+	h[2] ^= v[2] ^ v[10];
+	h[3] ^= v[3] ^ v[11];
+	h[4] ^= v[4] ^ v[12];
+	h[5] ^= v[5] ^ v[13];
+	h[6] ^= v[6] ^ v[14];
+}
+
+
+__forceinline__ __device__ void blake2b_gpu_hash2(uint64_t* h, u32 idx) {
+	u64 m[16] = { 0 };
+	u32* ptr = (u32*)&m[1];
+
+	ptr[1] = idx;
+
+	u64 v[16];
+
+	v[0] = h[0];
+	v[1] = h[1];
+	v[2] = h[2];
+	v[3] = h[3];
+	v[4] = h[4];
+	v[5] = h[5];
+	v[6] = h[6];
+	v[7] = h[7];
+	v[8] = 0x6a09e667f3bcc908;
+	v[9] = 0xbb67ae8584caa73b;
+	v[10] = 0x3c6ef372fe94f82b;
+	v[11] = 0xa54ff53a5f1d36f1;
+	v[12] = 0x510e527fade682d1 ^ (128 + 16);
+	v[13] = 0x9b05688c2b3e6c1f;
+	v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
+	v[15] = 0x5be0cd19137e2179;
+
+	ROUND(0);
+	ROUND(1);
+	ROUND(2);
+	ROUND(3);
+	ROUND(4);
+	ROUND(5);
+	ROUND(6);
+	ROUND(7);
+	ROUND(8);
+	ROUND(9);
+	ROUND(10);
+	ROUND(11);
+
+	h[0] ^= v[0] ^ v[8];
+	h[1] ^= v[1] ^ v[9];
+	h[2] ^= v[2] ^ v[10];
+	h[3] ^= v[3] ^ v[11];
+	h[4] ^= v[4] ^ v[12];
+	h[5] ^= v[5] ^ v[13];
+	h[6] ^= v[6] ^ v[14];
+	//h[7] ^= v[7] ^ v[15];
+	//memcpy(hash, (uchar *)h, outlen);
+}
--- a/cuda_djezo/equi_miner.cu
+++ b/cuda_djezo/equi_miner.cu
@ -2156,4 +2156,4 @@ template class eq_cuda_context<CONFIG_MODE_2>;

 #ifdef CONFIG_MODE_3
 template class eq_cuda_context<CONFIG_MODE_3>;
-#endif
+#endif
--- a/cuda_tromp/CMakeLists.txt
+++ b/cuda_tromp/CMakeLists.txt
@ -0,0 +1,57 @@
+set(EXECUTABLE cuda_tromp)
+
+option(ENABLE_CUDA "Enable the cuda build" ON)
+
+# depending on gcc version
+# ;-std=c++11 => Ubuntu 14.04 check gcc versions
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
+
+file(GLOB SRC_LIST
+    cuda_tromp.cpp
+    equi_miner.cu )
+file(GLOB HEADERS
+    cuda_tromp.hpp
+    eqcuda.hpp
+    )
+
+
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
+
+add_definitions(-DHIST)
+#add_definitions(-DXINTREE)
+#add_definitions(-DUNROLL)
+
+list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
+
+
+FIND_PACKAGE(CUDA REQUIRED)
+if(COMPUTE AND (COMPUTE GREATER 0))
+        LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
+else(COMPUTE AND (COMPUTE GREATER 0))
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 )
+endif(COMPUTE AND (COMPUTE GREATER 0))
+
+include_directories(${CUDA_INCLUDE_DIRS})
+
+find_package(Threads REQUIRED COMPONENTS)
+find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
+
+if(CUDA_FOUND)
+message("CUDA FOUND")
+else()
+message("CUDA NOT FOUND")
+endif()
+
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES})
+
+message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
--- a/nheqminer/MinerFactory.cpp
+++ b/nheqminer/MinerFactory.cpp
@ -56,6 +56,8 @@ void MinerFactory::ClearAllSolvers() {
 }

 ISolver * MinerFactory::GenCPUSolver(int use_opt) {
+    // TODO fix dynamic linking on Linux
+#ifdef    USE_CPU_XENONCAT
 	if (_use_xenoncat) {
 		_solvers.push_back(new CPUSolverXenoncat(use_opt));
 		return _solvers.back();
@ -63,6 +65,10 @@ ISolver * MinerFactory::GenCPUSolver(int use_opt) {
 		_solvers.push_back(new CPUSolverTromp(use_opt));
 		return _solvers.back();
 	}
+#else
+    _solvers.push_back(new CPUSolverTromp(use_opt));
+    return _solvers.back();
+#endif
 }

 ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperblock) {
@ -75,7 +81,7 @@ ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperbloc
 		return _solvers.back();
 	}
 }
-
+// no OpenCL solvers at the moment keep for future reference
 ISolver * MinerFactory::GenOPENCLSolver(int platf_id, int dev_id) {
 	if (_use_silentarmy) {
 		_solvers.push_back(new OPENCLSolverSilentarmy(platf_id, dev_id));
--- a/nheqminer/main.cpp
+++ b/nheqminer/main.cpp
@ -108,7 +108,12 @@ void print_help()

 void print_cuda_info()
 {
-	int num_devices = cuda_tromp::getcount();
+#if defined(USE_CUDA_DJEZO) || defined(USE_CUDA_TROMP)
+#ifdef USE_CUDA_DJEZO
+    int num_devices = cuda_djezo::getcount();
+#elif USE_CUDA_TROMP
+    int num_devices = cuda_tromp::getcount();
+#endif

 	std::cout << "Number of CUDA devices found: " << num_devices << std::endl;

@ -123,6 +128,7 @@ void print_cuda_info()
 #endif
 		std::cout << "\t#" << i << " " << gpuname << " | SM version: " << version << " | SM count: " << smcount << std::endl;
 	}
+#endif
 }

 void print_opencl_info() {