//chia-blockchain/lib/chiavdf/fast_vdf/threading.h
#include <boost/align/aligned_alloc.hpp>
//mp_limb_t is an unsigned integer
static_assert(sizeof(mp_limb_t)==8, "");
static_assert(sizeof(unsigned long int)==8, "");
static_assert(sizeof(long int)==8, "");
#ifdef ENABLE_TRACK_CYCLES
const int track_cycles_array_size=track_cycles_max_num*track_cycles_num_buckets;
thread_local int track_cycles_next_slot=0;
thread_local array<uint64, track_cycles_array_size> track_cycles_cycle_counters;
thread_local array<uint64, track_cycles_array_size> track_cycles_call_counters;
thread_local array<const char*, track_cycles_max_num> track_cycles_names;
void track_cycles_init() {
thread_local bool is_init=false;
if (!is_init) {
//print( &track_cycles_names );
//track_cycles_cycle_counters=new uint64[];
//track_cycles_call_counters=new uint64[track_cycles_max_num*track_cycles_num_buckets];
//track_cycles_names=new const char*[track_cycles_max_num];
for (int x=0;x<track_cycles_array_size;++x) {
track_cycles_cycle_counters.at(x)=0;
track_cycles_call_counters.at(x)=0;
}
for (int x=0;x<track_cycles_max_num;++x) {
track_cycles_names.at(x)=nullptr;
}
is_init=true;
}
}
void track_cycles_output_stats() {
track_cycles_init();
//print( &track_cycles_names );
for (int x=0;x<track_cycles_next_slot;++x) {
double total_calls=0;
for (int y=0;y<track_cycles_num_buckets;++y) {
total_calls+=track_cycles_call_counters.at(x*track_cycles_num_buckets + y);
}
if (total_calls==0) {
continue;
}
print( "" );
print( track_cycles_names.at(x), ":" );
for (int y=0;y<track_cycles_num_buckets;++y) {
double cycles=track_cycles_cycle_counters.at(x*track_cycles_num_buckets + y);
double calls=track_cycles_call_counters.at(x*track_cycles_num_buckets + y);
if (calls==0) {
continue;
}
print(str( "#%: #", int(calls/total_calls*100), int(cycles/calls) ));
}
}
}
struct track_cycles_impl {
int slot=-1;
uint64 start_time=0;
bool is_aborted=false;
static uint64 get_time() {
//rdtsc places the timestamp in EDX:EAX; combine the two halves into one 64-bit value
uint64 high;
uint64 low;
asm volatile(
"lfence\n\t"
"sfence\n\t"
"rdtsc\n\t"
"sfence\n\t"
"lfence\n\t"
: "=a"(low), "=d"(high) :: "memory");
return (high<<32) | low;
}
track_cycles_impl(int t_slot) {
slot=t_slot;
assert(slot>=0 && slot<track_cycles_max_num);
start_time=get_time();
}
void abort() {
is_aborted=true;
}
~track_cycles_impl() {
uint64 end_time=get_time();
if (is_aborted) {
return;
}
uint64 delta=end_time-start_time;
if (delta==0) {
return;
}
int num_bits=64-__builtin_clzll(delta);
if (num_bits>=track_cycles_num_buckets) {
return;
}
assert(num_bits>=0 && num_bits<track_cycles_num_buckets);
assert(slot>=0 && slot<track_cycles_max_num);
int index=slot*track_cycles_num_buckets + num_bits;
assert(index>=0 && index<track_cycles_max_num*track_cycles_num_buckets);
track_cycles_cycle_counters.at(index)+=delta;
++track_cycles_call_counters.at(index);
}
};
#define TO_STRING_IMPL(x) #x
#define TO_STRING(x) TO_STRING_IMPL(x)
#define TRACK_CYCLES \
track_cycles_init();\
thread_local int track_cycles_c_slot=-1;\
if (track_cycles_c_slot==-1) {\
track_cycles_c_slot=track_cycles_next_slot;\
++track_cycles_next_slot;\
\
track_cycles_names.at(track_cycles_c_slot)=__FILE__ ":" TO_STRING(__LINE__);\
}\
track_cycles_impl c_track_cycles_impl(track_cycles_c_slot);
//
#define TRACK_CYCLES_ABORT c_track_cycles_impl.abort();
#define TRACK_CYCLES_OUTPUT_STATS track_cycles_output_stats();
#else
#define TRACK_CYCLES
#define TRACK_CYCLES_ABORT
#define TRACK_CYCLES_OUTPUT_STATS
#endif
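//Illustrative sketch (not part of the original file; the function and condition names are
//hypothetical): how the TRACK_CYCLES macros are meant to be used. With ENABLE_TRACK_CYCLES
//defined, each call site gets its own slot named __FILE__ ":" __LINE__, and the destructor
//of track_cycles_impl adds the elapsed rdtsc delta to a bucket chosen by its bit length.
//
//  void hypothetical_hot_path() {
//      TRACK_CYCLES //starts timing this scope
//      if (hypothetical_precondition_failed()) {
//          TRACK_CYCLES_ABORT //discard this sample
//          return;
//      }
//      //... work being measured ...
//  }
//  //once per thread, e.g. before it exits:
//  //TRACK_CYCLES_OUTPUT_STATS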
//the result must be freed with boost::alignment::aligned_free (as mp_free_func and cache_line_ptr do)
void* alloc_cache_line(size_t bytes) {
//round up to the next multiple of 64
size_t aligned_bytes=((bytes+63)>>6)<<6;
void* res=boost::alignment::aligned_alloc(64, aligned_bytes); // aligned_alloc(64, aligned_bytes);
assert((uint64(res)&63)==0); //must be aligned for correctness
return res;
}
void* mp_alloc_func(size_t new_bytes) {
void* res=alloc_cache_line(new_bytes);
assert((uint64(res)&63)==0); //all memory used by gmp must be cache line aligned
return res;
}
void mp_free_func(void* old_ptr, size_t old_bytes) {
//either mp_alloc_func allocated old_ptr and it is 64-aligned, or it points to data in mpz and its address equals 16 modulo 64
assert((uint64(old_ptr)&63)==0 || (uint64(old_ptr)&63)==16);
if ((uint64(old_ptr)&63)==0) {
//mp_alloc_func allocated this, so it can be freed with aligned_free
boost::alignment::aligned_free(old_ptr); //free(old_ptr);
} else {
//this is part of the mpz struct defined below. it can't be freed, so do nothing
}
}
void* mp_realloc_func(void* old_ptr, size_t old_bytes, size_t new_bytes) {
void* res=mp_alloc_func(new_bytes);
memcpy(res, old_ptr, (old_bytes<new_bytes)? old_bytes : new_bytes);
mp_free_func(old_ptr, old_bytes);
return res;
}
//must call this before calling any gmp functions
//(the mpz class constructor does not call any gmp functions)
void init_gmp() {
mp_set_memory_functions(mp_alloc_func, mp_realloc_func, mp_free_func);
}
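//Illustrative sketch (not part of the original file): the call order this header assumes.
//init_gmp() has to run before the first gmp call so that every allocation gmp makes goes
//through mp_alloc_func and is cache line aligned; the fixed-buffer mpz class below may be
//constructed earlier because its constructor calls no gmp functions.
//
//  init_gmp(); //once, at startup, before any gmp arithmetic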
struct mpz_base {
//16 bytes
//int mpz._mp_alloc: number of limbs allocated
//int mpz._mp_size: abs(_mp_size) is number of limbs in use; 0 if the integer is zero. it is negated if the integer is negative
//mp_limb_t* mpz._mp_d: pointer to limbs
//do not call mpz_swap on this. mpz_swap can be called on other gmp integers
mpz_struct c_mpz;
operator mpz_struct*() { return &c_mpz; }
operator const mpz_struct*() const { return &c_mpz; }
mpz_struct* _() { return &c_mpz; }
const mpz_struct* _() const { return &c_mpz; }
};
//gmp can dynamically reallocate this
//the number of cache lines used is (padded_size+2)/8 rounded up
//1 cache line : 6 limbs
//2 cache lines: 14 limbs
//3 cache lines: 22 limbs
//4 cache lines: 30 limbs
//5 cache lines: 38 limbs
template<int d_expected_size, int d_padded_size> struct alignas(64) mpz : public mpz_base {
static const int expected_size=d_expected_size;
static const int padded_size=d_padded_size;
static_assert(expected_size>=1 && expected_size<=padded_size, "");
uint64 data[padded_size]; //must not be cache line aligned
bool was_reallocated() const {
return c_mpz._mp_d!=data;
}
//can't call any mpz functions here because it is global
mpz() {
c_mpz._mp_size=0;
c_mpz._mp_d=(mp_limb_t *)data;
c_mpz._mp_alloc=padded_size;
//this is supposed to be cache line aligned so that the next assert works
assert((uint64(this)&63)==0);
//mp_free_func uses this to decide whether to free or not
assert((uint64(c_mpz._mp_d)&63)==16);
}
~mpz() {
if (is_vdf_test) {
//don't want this to happen for performance reasons
assert(!was_reallocated());
}
//if c_mpz._mp_d wasn't reallocated, it has to point to this instance's data and not some other instance's data
//if mpz_swap was used, this might be violated
assert((uint64(c_mpz._mp_d)&63)==0 || c_mpz._mp_d==data);
mpz_clear(&c_mpz);
}
mpz(const mpz& t)=delete;
mpz(mpz&& t)=delete;
mpz& operator=(const mpz_struct* t) {
mpz_set(*this, t);
return *this;
}
mpz& operator=(const mpz& t) {
mpz_set(*this, t);
return *this;
}
mpz& operator=(mpz&& t) {
mpz_set(*this, t); //do not use mpz_swap
return *this;
}
/*mpz& operator=(const mpz_base& t) {
mpz_set(*this, t);
return *this;
}
mpz& operator=(mpz_base&& t) {
mpz_set(*this, t); //do not use mpz_swap
return *this;
}*/
mpz& operator=(uint64 i) {
mpz_set_ui(*this, i);
return *this;
}
mpz& operator=(int64 i) {
mpz_set_si(*this, i);
return *this;
}
mpz& operator=(const string& s) {
int res=mpz_set_str(*this, s.c_str(), 0);
assert(res==0);
return *this;
}
USED string to_string() const {
char* res_char=mpz_get_str(nullptr, 16, *this);
string res_string = "0x";
res_string+=res_char;
if (res_string.substr(0, 3) == "0x-") {
res_string.at(0)='-';
res_string.at(1)='0';
res_string.at(2)='x';
}
free(res_char);
return res_string;
}
USED string to_string_dec() const {
char* res_char=mpz_get_str(nullptr, 10, *this);
string res_string=res_char;
free(res_char);
return res_string;
}
//sets *this to a+b
void set_add(const mpz_struct* a, const mpz_struct* b) {
mpz_add(*this, a, b);
}
void set_add(const mpz_struct* a, uint64 b) {
mpz_add_ui(*this, a, b);
}
mpz& operator+=(const mpz_struct* t) {
set_add(*this, t);
return *this;
}
mpz& operator+=(uint64 t) {
set_add(*this, t);
return *this;
}
void set_sub(const mpz_struct* a, const mpz_struct* b) {
mpz_sub(*this, a, b);
}
void set_sub(const mpz_struct* a, uint64 b) {
mpz_sub_ui(*this, a, b);
}
template<class mpz_b> void set_sub(uint64 a, const mpz_b& b) {
mpz_ui_sub(*this, a, b);
}
mpz& operator-=(const mpz_struct* t) {
set_sub(*this, t);
return *this;
}
void set_mul(const mpz_struct* a, const mpz_struct* b) {
mpz_mul(*this, a, b);
}
void set_mul(const mpz_struct* a, int64 b) {
mpz_mul_si(*this, a, b);
}
void set_mul(const mpz_struct* a, uint64 b) {
mpz_mul_ui(*this, a, b);
}
mpz& operator*=(const mpz_struct* t) {
set_mul(*this, t);
return *this;
}
mpz& operator*=(int64 t) {
set_mul(*this, t);
return *this;
}
mpz& operator*=(uint64 t) {
set_mul(*this, t);
return *this;
}
void set_left_shift(const mpz_struct* a, int i) {
assert(i>=0);
mpz_mul_2exp(*this, a, i);
}
mpz& operator<<=(int i) {
set_left_shift(*this, i);
return *this;
}
//*this+=a*b
void set_add_mul(const mpz_struct* a, const mpz_struct* b) {
mpz_addmul(*this, a, b);
}
void set_add_mul(const mpz_struct* a, uint64 b) {
mpz_addmul_ui(*this, a, b);
}
//*this-=a*b
void set_sub_mul(const mpz_struct* a, const mpz_struct* b) {
mpz_submul(*this, a, b);
}
void set_sub_mul(const mpz_struct* a, uint64 b) {
mpz_submul_ui(*this, a, b);
}
void negate() {
mpz_neg(*this, *this);
}
void abs() {
mpz_abs(*this, *this);
}
void set_divide_floor(const mpz_struct* a, const mpz_struct* b) {
if (mpz_sgn(b)==0) {
assert(false);
return;
}
mpz_fdiv_q(*this, a, b);
}
void set_divide_floor(const mpz_struct* a, const mpz_struct* b, mpz_struct* remainder) {
if (mpz_sgn(b)==0) {
assert(false);
return;
}
mpz_fdiv_qr(*this, remainder, a, b);
}
void set_divide_exact(const mpz_struct* a, const mpz_struct* b) {
if (mpz_sgn(b)==0) {
assert(false);
return;
}
mpz_divexact(*this, a, b);
}
void set_mod(const mpz_struct* a, const mpz_struct* b) {
if (mpz_sgn(b)==0) {
assert(false);
return;
}
mpz_mod(*this, a, b);
}
mpz& operator%=(const mpz_struct* t) {
set_mod(*this, t);
return *this;
}
bool divisible_by(const mpz_struct* a) const {
if (mpz_sgn(a)==0) {
assert(false);
return false;
}
return mpz_divisible_p(*this, a);
}
void set_right_shift(const mpz_struct* a, int i) {
assert(i>=0);
mpz_tdiv_q_2exp(*this, a, i);
}
//note: this uses truncation rounding
mpz& operator>>=(int i) {
set_right_shift(*this, i);
return *this;
}
bool operator<(const mpz_struct* t) const { return mpz_cmp(*this, t)<0; }
bool operator<=(const mpz_struct* t) const { return mpz_cmp(*this, t)<=0; }
bool operator==(const mpz_struct* t) const { return mpz_cmp(*this, t)==0; }
bool operator>=(const mpz_struct* t) const { return mpz_cmp(*this, t)>=0; }
bool operator>(const mpz_struct* t) const { return mpz_cmp(*this, t)>0; }
bool operator!=(const mpz_struct* t) const { return mpz_cmp(*this, t)!=0; }
bool operator<(int64 i) const { return mpz_cmp_si(*this, i)<0; }
bool operator<=(int64 i) const { return mpz_cmp_si(*this, i)<=0; }
bool operator==(int64 i) const { return mpz_cmp_si(*this, i)==0; }
bool operator>=(int64 i) const { return mpz_cmp_si(*this, i)>=0; }
bool operator>(int64 i) const { return mpz_cmp_si(*this, i)>0; }
bool operator!=(int64 i) const { return mpz_cmp_si(*this, i)!=0; }
bool operator<(uint64 i) const { return mpz_cmp_ui(_(), i)<0; }
bool operator<=(uint64 i) const { return mpz_cmp_ui(_(), i)<=0; }
bool operator==(uint64 i) const { return mpz_cmp_ui(_(), i)==0; }
bool operator>=(uint64 i) const { return mpz_cmp_ui(_(), i)>=0; }
bool operator>(uint64 i) const { return mpz_cmp_ui(_(), i)>0; }
bool operator!=(uint64 i) const { return mpz_cmp_ui(_(), i)!=0; }
int compare_abs(const mpz_struct* t) const {
return mpz_cmpabs(*this, t);
}
int compare_abs(uint64 t) const {
return mpz_cmpabs_ui(*this, t);
}
//returns 0 if *this==0
int sgn() const {
return mpz_sgn(_());
}
int num_bits() const {
return mpz_sizeinbase(*this, 2);
}
//0 if this is 0
int num_limbs() const {
return mpz_size(*this);
}
const uint64* read_limbs() const {
return (uint64*)mpz_limbs_read(*this);
}
//limbs are uninitialized. call finish
uint64* write_limbs(int num) {
return (uint64*)mpz_limbs_write(*this, num);
}
//limbs are zero padded to the specified size. call finish
uint64* modify_limbs(int num) {
int old_size=num_limbs();
uint64* res=(uint64*)mpz_limbs_modify(*this, num);
//gmp doesn't do this
for (int x=old_size;x<num;++x) {
res[x]=0;
}
return res;
}
//num is whatever was passed to write_limbs or modify_limbs
//it can be less than that as long as it is at least the number of nonzero limbs
//it can be 0 if the result is 0
void finish(int num, bool negative=false) {
mpz_limbs_finish(*this, (negative)? -num : num);
}
template<int size> array<uint64, size> to_array() const {
assert(size>=num_limbs());
array<uint64, size> res;
for (int x=0;x<size;++x) {
res[x]=0;
}
for (int x=0;x<num_limbs();++x) {
res[x]=read_limbs()[x];
}
return res;
}
};
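//Illustrative sketch (not part of the original file; the template arguments and values are
//made up): typical use of the fixed-buffer mpz. padded_size limbs live inline in the object;
//if a result outgrows them, gmp reallocates through mp_realloc_func and was_reallocated()
//starts returning true (which the destructor treats as a performance bug under is_vdf_test).
//
//  void hypothetical_example() {
//      mpz<3, 6> a; //1 cache line: up to 6 limbs inline
//      mpz<3, 6> b;
//      mpz<6, 14> c; //2 cache lines, enough for the product below
//      a=uint64(12345);
//      b="0xabcdef";
//      c.set_mul(a, b); //c=a*b
//      c+=uint64(1);
//      print(c.to_string()); //prints a hex string such as "0x..."
//  }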
template<class type> struct cache_line_ptr {
type* ptr=nullptr;
cache_line_ptr() {}
cache_line_ptr(cache_line_ptr& t)=delete;
cache_line_ptr(cache_line_ptr&& t) { swap(ptr, t.ptr); }
cache_line_ptr& operator=(cache_line_ptr& t)=delete;
cache_line_ptr& operator=(cache_line_ptr&& t) { swap(ptr, t.ptr); return *this; }
~cache_line_ptr() {
if (ptr) {
ptr->~type();
boost::alignment::aligned_free(ptr); // wjb free(ptr);
ptr=nullptr;
}
}
type& operator*() const { return *ptr; }
type* operator->() const { return ptr; }
};
template<class type, class... arg_types> cache_line_ptr<type> make_cache_line(arg_types&&... args) {
cache_line_ptr<type> res;
res.ptr=(type*)alloc_cache_line(sizeof(type));
new(res.ptr) type(forward<arg_types>(args)...);
return res;
}
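//Illustrative sketch (not part of the original file; the struct name is hypothetical):
//make_cache_line constructs an object at the start of its own cache line(s) and
//cache_line_ptr's destructor destroys it and frees the memory with aligned_free.
//
//  struct hypothetical_state { uint64 a=0; uint64 b=0; };
//  void hypothetical_use() {
//      cache_line_ptr<hypothetical_state> p=make_cache_line<hypothetical_state>();
//      p->a=1; //operator-> returns the 64-aligned pointer
//  } //freed here when p goes out of scope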
template<bool is_write, class type> void prefetch(const type& p) {
//write prefetching lowers performance but read prefetching increases it
if (is_write) return;
for (int x=0;x<sizeof(p);x+=64) {
__builtin_prefetch(((char*)&p)+x, (is_write)? 1 : 0);
}
}
template<class type> void prefetch_write(const type& p) { prefetch<true>(p); }
template<class type> void prefetch_read(const type& p) { prefetch<false>(p); }
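//memory_barrier is a compiler-only fence: it stops the compiler from caching or reordering
//the counter loads/stores across it. The threading code below relies on x86's strong memory
//ordering plus single-writer counters instead of CPU fences (see the comments in
//advance_absolute).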
void memory_barrier() {
asm volatile( "" ::: "memory" );
}
struct alignas(64) thread_counter {
uint64 counter_value=0; //only one thread writes this; aligned 64-bit stores keep each update atomic
uint64 error_flag=0;
void reset() {
memory_barrier();
counter_value=0;
error_flag=0;
memory_barrier();
}
thread_counter() {
assert((uint64(this)&63)==0);
}
};
thread_counter master_counter[100];
thread_counter slave_counter[100];
struct thread_state {
int pairindex;
bool is_slave=false;
uint64 counter_start=0;
uint64 last_fence=0;
void reset() {
is_slave=false;
counter_start=0;
last_fence=0;
}
thread_counter& this_counter() {
return (is_slave)? slave_counter[pairindex] : master_counter[pairindex];
}
thread_counter& other_counter() {
return (is_slave)? master_counter[pairindex] : slave_counter[pairindex];
}
void raise_error() {
//if (is_vdf_test) {
//print( "raise_error", is_slave );
//}
memory_barrier();
this_counter().error_flag=1;
other_counter().error_flag=1;
memory_barrier();
}
uint64 v() {
return this_counter().counter_value;
}
//waits for the other thread to have at least this counter value
//returns false if an error has been raised
bool fence_absolute(uint64 t_v) {
if (last_fence>=t_v) {
return true;
}
memory_barrier();
uint64 spin_counter=0;
while (other_counter().counter_value < t_v) {
if (this_counter().error_flag || other_counter().error_flag) {
raise_error();
break;
}
if (spin_counter>max_spin_counter) {
if (is_vdf_test) {
print( "spin_counter too high", is_slave );
}
raise_error();
break;
}
++spin_counter;
memory_barrier();
}
memory_barrier();
if (!(this_counter().error_flag)) {
last_fence=t_v;
}
return !(this_counter().error_flag);
}
bool fence(int delta) {
return fence_absolute(counter_start+uint64(delta));
}
//increases this thread's counter value. it can only be increased
//returns false if an error has been raised
bool advance_absolute(uint64 t_v) {
if (t_v==v()) {
return true;
}
memory_barrier(); //wait for all writes to finish (on x86 this doesn't do anything but the compiler still needs it)
assert(t_v>=v());
if (this_counter().error_flag) {
raise_error();
}
this_counter().counter_value=t_v;
memory_barrier(); //want the counter writes to be low latency so prevent the compiler from caching it
return !(this_counter().error_flag);
}
bool advance(int delta) {
return advance_absolute(counter_start+uint64(delta));
}
bool has_error() {
return this_counter().error_flag;
}
/*void wait_for_error_to_be_cleared() {
assert(is_slave && enable_threads);
while (this_counter().error_flag) {
memory_barrier();
}
}
void clear_error() {
assert(!is_slave);
memory_barrier();
this_counter().error_flag=0;
other_counter().error_flag=0;
memory_barrier();
}*/
};
thread_local thread_state c_thread_state;
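//Illustrative sketch (not part of the original file; the pairindex, deltas, and work items
//are hypothetical): the master/slave handshake that thread_state implements. Each thread
//only advances its own counter and fences on the other thread's counter; once either side
//raises an error, every later fence/advance returns false.
//
//  //slave thread (producer):
//  //  c_thread_state.reset(); c_thread_state.is_slave=true; c_thread_state.pairindex=0;
//  //  ... produce result #1 ...
//  //  if (!c_thread_state.advance(1)) { /*handle error*/ } //publish: result #1 is ready
//
//  //master thread (consumer):
//  //  c_thread_state.reset(); c_thread_state.pairindex=0;
//  //  if (!c_thread_state.fence(1)) { /*handle error*/ } //spin until the slave reaches 1
//  //  ... consume result #1 ...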
struct alignas(64) gcd_uv_entry {
//these are uninitialized for the first entry
uint64 u_0;
uint64 u_1;
uint64 v_0;
uint64 v_1;
uint64 parity; //1 if odd, 0 if even
uint64 exit_flag; //1 if last, else 0
uint64 unused_0;
uint64 unused_1;
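//applies one partial-gcd transition to (in_a, in_b):
//  parity==0: out_a=in_a*u_0-in_b*v_0, out_b=in_b*v_1-in_a*u_1
//  parity==1: out_a=in_b*v_0-in_a*u_0, out_b=in_a*u_1-in_b*v_1
//(u and v are stored as magnitudes; the operand swap tracks the sign alternating with the
//iteration parity, which presumably keeps the outputs nonnegative)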
template<class mpz_type> void matrix_multiply(const mpz_type& in_a, const mpz_type& in_b, mpz_type& out_a, mpz_type& out_b) const {
out_a.set_mul((parity==0)? in_a : in_b, (parity==0)? u_0 : v_0);
out_a.set_sub_mul((parity==0)? in_b : in_a, (parity==0)? v_0 : u_0);
out_b.set_mul((parity==0)? in_b : in_a, (parity==0)? v_1 : u_1);
out_b.set_sub_mul((parity==0)? in_a : in_b, (parity==0)? u_1 : v_1);
}
};
static_assert(sizeof(gcd_uv_entry)==64, "");
template<class mpz_type> struct alignas(64) gcd_results_type {
mpz_type as[2];
mpz_type bs[2];
static const int num_counter=gcd_max_iterations+1; //one per outputted entry
array<gcd_uv_entry, gcd_max_iterations+1> uv_entries;
int end_index=0;
mpz_type& get_a_start() {
return as[0];
}
mpz_type& get_b_start() {
return bs[0];
}
mpz_type& get_a_end() {
assert(end_index>=0 && end_index<2);
return as[end_index];
}
mpz_type& get_b_end() {
assert(end_index>=0 && end_index<2);
return bs[end_index];
}
//this will increase the counter value and wait until the result at index is available
//index 0 only has exit_flag initialized
bool get_entry(int counter_start_delta, int index, const gcd_uv_entry** res) const {
*res=nullptr;
if (index>=gcd_max_iterations+1) {
c_thread_state.raise_error();
return false;
}
assert(index>=0);
if (!c_thread_state.fence(counter_start_delta + index+1)) {
return false;
}
*res=&uv_entries[index];
return true;
}
};
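//Illustrative sketch (not part of the original file; the names results, a_in, b_in, a_out,
//b_out and the exact starting index are assumptions): rough shape of a consumer on the other
//thread walking the uv entries while gcd_unsigned (below) is still producing them. get_entry
//fences on the producer's counter, so it blocks until entry "index" is published and returns
//false if an error was raised or the spin limit was exceeded.
//
//  const gcd_uv_entry* entry=nullptr;
//  int index=1; //index 0 only has exit_flag initialized
//  while (results.get_entry(counter_start_delta, index, &entry)) {
//      entry->matrix_multiply(a_in, b_in, a_out, b_out); //apply this iteration's u/v
//      if (entry->exit_flag) break; //last entry
//      ++index; //a_out/b_out feed the next iteration's inputs
//  }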
//a and b in c_results should be initialized
//returns false if the gcd failed
//this assumes that all inputs are unsigned, a>=b, and a>=threshold
//this will increase the counter value as results are generated
template<class mpz_type> bool gcd_unsigned(
int counter_start_delta, gcd_results_type<mpz_type>& c_results, const array<uint64, gcd_size>& threshold
) {
if (c_thread_state.has_error()) {
return false;
}
int a_limbs=c_results.get_a_start().num_limbs();
int b_limbs=c_results.get_b_start().num_limbs();
if (a_limbs>gcd_size || b_limbs>gcd_size) {
c_thread_state.raise_error();
return false;
}
asm_code::asm_func_gcd_unsigned_data data;
data.a=c_results.as[0].modify_limbs(gcd_size);
data.b=c_results.bs[0].modify_limbs(gcd_size);
data.a_2=c_results.as[1].write_limbs(gcd_size);
data.b_2=c_results.bs[1].write_limbs(gcd_size);
data.threshold=(uint64*)&threshold[0];
data.uv_counter_start=c_thread_state.counter_start+counter_start_delta+1;
data.out_uv_counter_addr=&(c_thread_state.this_counter().counter_value);
data.out_uv_addr=(uint64*)&(c_results.uv_entries[1]);
data.iter=-1;
data.a_end_index=(a_limbs==0)? 0 : a_limbs-1;
if (is_vdf_test) {
assert((uint64(data.out_uv_addr)&63)==0); //should be cache line aligned
}
memory_barrier();
int error_code=asm_code::asm_func_gcd_unsigned(&data);
memory_barrier();
if (error_code!=0) {
c_thread_state.raise_error();
return false;
}
assert(data.iter>=0 && data.iter<=gcd_max_iterations); //total number of iterations performed
bool is_even=((data.iter-1)&1)==0; //parity of the last iteration index (iter-1 can be -1 if no iterations ran)
c_results.end_index=(is_even)? 1 : 0;
c_results.as[0].finish(gcd_size);
c_results.as[1].finish(gcd_size);
c_results.bs[0].finish(gcd_size);
c_results.bs[1].finish(gcd_size);
inject_error(c_results.as[0]);
inject_error(c_results.as[1]);
inject_error(c_results.bs[0]);
inject_error(c_results.bs[1]);
if (!c_thread_state.advance(counter_start_delta+gcd_results_type<mpz_type>::num_counter)) {
return false;
}
return true;
}