// chia-blockchain/lib/chiavdf/fast_vdf/asm_gcd_128.h

namespace asm_code {
typedef array<reg_scalar, 2> reg_scalar_128;
//v[0] is low, v[1] is high. amount is >=0 and <128. res can't alias with v
//preserves inputs. returns low part of result
//regs: RCX, 1x scalar
void shift_right(
reg_alloc regs, array<reg_scalar, 2> v, reg_scalar amount, reg_scalar res,
reg_scalar tmp_rcx, reg_scalar tmp_res_2
) {
EXPAND_MACROS_SCOPE;
m.bind(v, "v");
m.bind(amount, "amount");
m.bind(res, "res");
assert(tmp_rcx.value==reg_rcx.value);
m.bind(tmp_res_2, "res_2");
//res=uint64([v[1]:v[0]] >> amount) ; undefined if amount>=64
APPEND_M(str( "MOV RCX, `amount" ));
APPEND_M(str( "MOV `res, `v_0" ));
APPEND_M(str( "SHRD `res, `v_1, CL" ));
//res_2=0
APPEND_M(str( "XOR `res_2, `res_2" ));
//RCX=amount-64
APPEND_M(str( "SUB RCX, 64" ));
//res=(amount>=64)? 0 : res
//res_2=(amount>=64)? v[1] : 0
APPEND_M(str( "CMOVAE `res, `res_2" ));
APPEND_M(str( "CMOVAE `res_2, `v_1" ));
//res_2=(amount>=64)? v[1]>>(amount-64) : 0
APPEND_M(str( "SHR `res_2, CL" ));
//res=(amount>=64)? res_2 : res
APPEND_M(str( "OR `res, `res_2" ));
}
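//Reference sketch (plain C++, for illustration only; assumes uint64 is an unsigned 64-bit typedef):
//the generated code above computes the low 64 bits of the 128-bit shift [v1:v0]>>amount for 0<=amount<128.
// uint64 shift_right_ref(uint64 v0, uint64 v1, uint64 amount) {
//     if (amount>=64) return v1>>(amount-64);
//     if (amount==0) return v0;
//     return (v0>>amount) | (v1<<(64-amount));
// }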
//all inputs are unsigned
void dot_product_exact(reg_alloc regs, array<reg_scalar, 2> a, array<reg_scalar, 2> b, reg_scalar out, string overflow_label) {
EXPAND_MACROS_SCOPE;
m.bind(a, "a");
m.bind(b, "b");
m.bind(out, "out");
reg_scalar rax=regs.bind_scalar(m, "rax", reg_rax);
reg_scalar rdx=regs.bind_scalar(m, "rdx", reg_rdx);
//out=a0*b0
APPEND_M(str( "MOV RAX, `a_0" ));
APPEND_M(str( "MUL `b_0" ));
APPEND_M(str( "JC #", overflow_label ));
APPEND_M(str( "MOV `out, RAX" ));
//RAX=a1*b1
APPEND_M(str( "MOV RAX, `a_1" ));
APPEND_M(str( "MUL `b_1" ));
APPEND_M(str( "JC #", overflow_label ));
//out=a0*b0+a1*b1
APPEND_M(str( "ADD `out, RAX" ));
APPEND_M(str( "JC #", overflow_label ));
}
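//Reference sketch (plain C++ with the GCC/Clang overflow builtins, for illustration only):
//the generated code above computes out=a[0]*b[0]+a[1]*b[1] and jumps to overflow_label if any
//intermediate result does not fit in a uint64.
// bool dot_product_exact_ref(uint64 a0, uint64 a1, uint64 b0, uint64 b1, uint64& out) {
//     uint64 p0, p1;
//     if (__builtin_mul_overflow(a0, b0, &p0)) return false; //a0*b0 overflows
//     if (__builtin_mul_overflow(a1, b1, &p1)) return false; //a1*b1 overflows
//     return !__builtin_add_overflow(p0, p1, &out);          //false if the sum overflows
// }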
//ab and ab_threshold reg_spill are 16 bytes (lsb first), 8 byte aligned. all others are 8 bytes
//parity is 1 if odd, else 0
//is_lehmer is 1 if true, else 0
//u, v, and parity are outputs
//regs: 15x scalar, 16x vector (i.e. all of the registers except RSP)
void gcd_128(
reg_alloc regs_parent,
array<reg_spill, 2> spill_ab_start, array<reg_spill, 2> spill_u, array<reg_spill, 2> spill_v,
reg_spill spill_parity, reg_spill spill_is_lehmer, reg_spill spill_ab_threshold,
string no_progress_label
) {
EXPAND_MACROS_SCOPE_PUBLIC;
track_asm( "gcd_128" );
m.bind(spill_ab_start[0], "spill_ab_start_0_0");
m.bind(spill_ab_start[0]+8, "spill_ab_start_0_1");
m.bind(spill_ab_start[1], "spill_ab_start_1_0");
m.bind(spill_ab_start[1]+8, "spill_ab_start_1_1");
m.bind(spill_u, "spill_u");
m.bind(spill_v, "spill_v");
m.bind(spill_parity, "spill_parity");
m.bind(spill_is_lehmer, "spill_is_lehmer");
m.bind(spill_ab_threshold, "spill_ab_threshold_0");
m.bind(spill_ab_threshold+8, "spill_ab_threshold_1");
reg_vector vector_ab=regs_parent.bind_vector(m, "vector_ab");
reg_vector vector_u=regs_parent.bind_vector(m, "vector_u");
reg_vector vector_v=regs_parent.bind_vector(m, "vector_v");
reg_vector vector_is_lehmer=regs_parent.bind_vector(m, "vector_is_lehmer");
reg_vector vector_ab_threshold=regs_parent.bind_vector(m, "vector_ab_threshold");
reg_spill spill_iter=regs_parent.bind_spill(m, "spill_iter");
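//initial state: u=<1,0>, v=<0,1> (identity transform), parity=0 (even), iter=gcd_128_max_iter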
APPEND_M(str( "MOV QWORD PTR `spill_u_0, 1" ));
APPEND_M(str( "MOV QWORD PTR `spill_u_1, 0" ));
APPEND_M(str( "MOV QWORD PTR `spill_v_0, 0" ));
APPEND_M(str( "MOV QWORD PTR `spill_v_1, 1" ));
APPEND_M(str( "MOV QWORD PTR `spill_parity, 0" ));
APPEND_M(str( "MOV QWORD PTR `spill_iter, #", to_hex(gcd_128_max_iter) ));
string start_label=m.alloc_label();
string loop_label=m.alloc_label();
string exit_label=m.alloc_label();
string exit_iter_0_label=m.alloc_label();
string start_assign_label=m.alloc_label();
APPEND_M(str( "JMP #", start_assign_label ));
APPEND_M(str( "#:", loop_label ));
track_asm( "gcd_128 iter" );
//4x scalar
reg_scalar new_u_0=regs_parent.bind_scalar(m, "new_u_0"); //a
reg_scalar new_u_1=regs_parent.bind_scalar(m, "new_u_1"); //b
reg_scalar new_v_0=regs_parent.bind_scalar(m, "new_v_0"); //ab_threshold
reg_scalar new_v_1=regs_parent.bind_scalar(m, "new_v_1"); //base iter
if (use_divide_table) {
string base_exit_label=m.alloc_label();
string base_loop_label=m.alloc_label();
APPEND_M(str( "MOV `new_v_1, #", to_hex(gcd_base_max_iter_divide_table) ));
APPEND_M(str( "MOVDQA `vector_u, #", constant_address_uint64(1ull, 0ull) ));
APPEND_M(str( "MOVDQA `vector_v, #", constant_address_uint64(0ull, 1ull) ));
APPEND_M(str( "#:", base_loop_label ));
gcd_64_iteration(regs_parent, vector_is_lehmer, {new_u_0, new_u_1}, {vector_u, vector_v}, new_v_0, base_exit_label);
APPEND_M(str( "DEC `new_v_1" ));
APPEND_M(str( "JNZ #", base_loop_label ));
APPEND_M(str( "#:", base_exit_label ));
APPEND_M(str( "CMP `new_v_1, #", to_hex(gcd_base_max_iter_divide_table) ));
APPEND_M(str( "JE #", track_asm( "gcd_128 base no progress", exit_label ) ));
} else {
gcd_base_continued_fraction(
regs_parent, vector_ab, vector_u, vector_v, vector_is_lehmer, vector_ab_threshold,
track_asm( "gcd_128 base no progress", exit_label )
);
}
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//12x scalar (including dot product exact which is 2x scalar)
reg_scalar m_0_0=regs.bind_scalar(m, "m_0_0");
reg_scalar m_0_1=regs.bind_scalar(m, "m_0_1");
reg_scalar m_1_0=regs.bind_scalar(m, "m_1_0");
reg_scalar m_1_1=regs.bind_scalar(m, "m_1_1");
reg_scalar tmp_0=regs.bind_scalar(m, "tmp_0");
reg_scalar tmp_1=regs.bind_scalar(m, "tmp_1");
reg_vector tmp_a=regs.bind_vector(m, "tmp_a");
reg_vector tmp_b=regs.bind_vector(m, "tmp_b");
reg_vector tmp_c=regs.bind_vector(m, "tmp_c");
reg_vector c_double_abs_mask=regs.bind_vector(m, "double_abs_mask");
if (!use_divide_table) {
APPEND_M(str( "MOVAPD `double_abs_mask, #", constant_address_uint64(double_abs_mask, double_abs_mask) ));
}
auto abs_tmp_a=[&]() {
if (use_divide_table) {
//tmp_b = int64 mask = int64(v)>>63;
APPEND_M(str( "MOVDQA `tmp_b, `tmp_a" ));
APPEND_M(str( "PSRAD `tmp_b, 32" )); //high 32 bits = sign bit ; low 32 bits = undefined
APPEND_M(str( "PSHUFD `tmp_b, `tmp_b, #", to_hex( 0b11110101 ) )); //move high 32 bits to low 32 bits
//abs_v=(v + mask) ^ mask;
APPEND_M(str( "PADDQ `tmp_a, `tmp_b" ));
APPEND_M(str( "PXOR `tmp_a, `tmp_b" ));
} else {
APPEND_M(str( "PAND `tmp_a, `double_abs_mask" ));
}
};
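//Reference sketch (plain C++, for illustration only; assumes int64/uint64 are 64-bit typedefs):
//the divide-table branch above is the usual branchless absolute value of a signed 64-bit lane.
// uint64 abs_ref(int64 v) {
//     int64 mask=v>>63;             //0 if v>=0, -1 if v<0 (arithmetic shift)
//     return uint64((v+mask)^mask); //(v+mask)^mask == |v|
// }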
auto mov_low_tmp_a=[&](string target) {
if (use_divide_table) {
APPEND_M(str( "MOVQ `#, `tmp_a", target ));
} else {
APPEND_M(str( "CVTTSD2SI `#, `tmp_a", target ));
}
};
//<m_0_0, m_1_0>=<abs(vector_u[0]), abs(vector_u[1])>
//for the divide table, this is u[0] and v[0]
APPEND_M(str( "MOVAPD `tmp_a, `vector_u" ));
abs_tmp_a();
mov_low_tmp_a( (use_divide_table)? "m_0_0" : "m_0_0" );
APPEND_M(str( "SHUFPD `tmp_a, `tmp_a, 3" ));
mov_low_tmp_a( (use_divide_table)? "m_0_1" : "m_1_0" );
//<m_1_0, m_1_1>=<abs(vector_v[0]), abs(vector_v[1])>
//for the divide table, this is u[1] and v[1]
APPEND_M(str( "MOVAPD `tmp_a, `vector_v" ));
abs_tmp_a();
mov_low_tmp_a( (use_divide_table)? "m_1_0" : "m_0_1" );
APPEND_M(str( "SHUFPD `tmp_a, `tmp_a, 3" ));
mov_low_tmp_a( (use_divide_table)? "m_1_1" : "m_1_1" );
APPEND_M(str( "MOV `tmp_0, `spill_u_0" ));
APPEND_M(str( "MOV `tmp_1, `spill_u_1" ));
dot_product_exact(regs, {m_0_0, m_0_1}, {tmp_0, tmp_1}, new_u_0, track_asm( "gcd_128 uv overflow", exit_label ));
dot_product_exact(regs, {m_1_0, m_1_1}, {tmp_0, tmp_1}, new_u_1, track_asm( "gcd_128 uv overflow", exit_label ));
APPEND_M(str( "MOV `tmp_0, `spill_v_0" ));
APPEND_M(str( "MOV `tmp_1, `spill_v_1" ));
dot_product_exact(regs, {m_0_0, m_0_1}, {tmp_0, tmp_1}, new_v_0, track_asm( "gcd_128 uv overflow", exit_label ));
dot_product_exact(regs, {m_1_0, m_1_1}, {tmp_0, tmp_1}, new_v_1, track_asm( "gcd_128 uv overflow", exit_label ));
}
//9x scalar
reg_scalar new_ab_0_0=regs_parent.bind_scalar(m, "new_ab_0_0");
reg_scalar new_ab_0_1=regs_parent.bind_scalar(m, "new_ab_0_1");
reg_scalar new_ab_1_0=regs_parent.bind_scalar(m, "new_ab_1_0");
reg_scalar new_ab_1_1=regs_parent.bind_scalar(m, "new_ab_1_1");
reg_scalar new_parity=regs_parent.bind_scalar(m, "new_parity");
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//15x scalar
reg_scalar rax=regs.bind_scalar(m, "rax", reg_rax);
reg_scalar rdx=regs.bind_scalar(m, "rdx", reg_rdx);
reg_vector tmp_a=regs.bind_vector(m, "tmp_a");
reg_scalar ab_start_0_0=regs.bind_scalar(m, "ab_start_0_0");
reg_scalar ab_start_0_1=regs.bind_scalar(m, "ab_start_0_1");
reg_scalar ab_start_1_0=regs.bind_scalar(m, "ab_start_1_0");
reg_scalar ab_start_1_1=regs.bind_scalar(m, "ab_start_1_1");
APPEND_M(str( "MOV `ab_start_0_0, `spill_ab_start_0_0" ));
APPEND_M(str( "MOV `ab_start_0_1, `spill_ab_start_0_1" ));
APPEND_M(str( "MOV `ab_start_1_0, `spill_ab_start_1_0" ));
APPEND_M(str( "MOV `ab_start_1_1, `spill_ab_start_1_1" ));
//RAX=(uv_double[1][1]<0)? 1 : 0=uv_double_parity
//(this also works for integers with the divide table)
APPEND_M(str( "MOVAPD `tmp_a, `vector_v" ));
APPEND_M(str( "SHUFPD `tmp_a, `tmp_a, 3" ));
APPEND_M(str( "MOVQ RAX, `tmp_a" ));
APPEND_M(str( "SHR RAX, 63" ));
//new_parity=spill_parity^uv_double_parity
APPEND_M(str( "MOV `new_parity, `spill_parity" ));
APPEND_M(str( "XOR `new_parity, RAX" ));
//[out1:out0]=[a1:a0]*u - [b1:b0]*v
auto dot_product_subtract=[&](string a0, string a1, string b0, string b1, string u, string v, string out0, string out1) {
//[RDX:RAX]=a0*u
APPEND_M(str( "MOV RAX, `#", a0 ));
APPEND_M(str( "MUL `#", u ));
//[out1:out0]=a0*u
APPEND_M(str( "MOV `#, RAX", out0 ));
APPEND_M(str( "MOV `#, RDX", out1 ));
//[RDX:RAX]=a1*u
APPEND_M(str( "MOV RAX, `#", a1 ));
APPEND_M(str( "MUL `#", u ));
//[out1:out0]=a0*u + (a1*u)<<64=a*u
APPEND_M(str( "ADD `#, RAX", out1 ));
//[RDX:RAX]=b0*v
APPEND_M(str( "MOV RAX, `#", b0 ));
APPEND_M(str( "MUL `#", v ));
//[out1:out0]=a*u - b0*v
APPEND_M(str( "SUB `#, RAX", out0 ));
APPEND_M(str( "SBB `#, RDX", out1 ));
//[RDX:RAX]=b1*v
APPEND_M(str( "MOV RAX, `#", b1 ));
APPEND_M(str( "MUL `#", v ));
//[out1:out0]=a*u - b0*v - (b1*v)<<64=a*u - b*v
APPEND_M(str( "SUB `#, RAX", out1 ));
};
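//Reference sketch (plain C++ with unsigned __int128, for illustration only): the lambda above
//computes a*u - b*v truncated to 128 bits, with the same wrap-around behavior as the asm.
// void dot_product_subtract_ref(uint64 a0, uint64 a1, uint64 b0, uint64 b1,
//                               uint64 u, uint64 v, uint64& out0, uint64& out1) {
//     unsigned __int128 a=((unsigned __int128)a1<<64)|a0;
//     unsigned __int128 b=((unsigned __int128)b1<<64)|b0;
//     unsigned __int128 r=a*u-b*v; //wraps mod 2^128
//     out0=uint64(r); out1=uint64(r>>64);
// }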
// uint64 uv_00=uv_uint64_new[0][0];
// uint64 uv_01=uv_uint64_new[0][1];
// int128 a_new_1=ab_start[0]; a_new_1*=uv_00;
// int128 a_new_2=ab_start[1]; a_new_2*=uv_01;
// if (uv_uint64_parity_new!=0) swap(a_new_1, a_new_2);
// int128 a_new_s=a_new_1-a_new_2;
// uint128 a_new(a_new_s);
dot_product_subtract(
"ab_start_0_0", "ab_start_0_1",
"ab_start_1_0", "ab_start_1_1",
"new_u_0", "new_v_0",
"new_ab_0_0", "new_ab_0_1"
);
// uint64 uv_10=uv_uint64_new[1][0];
// uint64 uv_11=uv_uint64_new[1][1];
// int128 b_new_1=ab_start[1]; b_new_1*=uv_11;
// int128 b_new_2=ab_start[0]; b_new_2*=uv_10;
// if (uv_uint64_parity_new!=0) swap(b_new_1, b_new_2);
// int128 b_new_s=b_new_1-b_new_2;
// uint128 b_new(b_new_s);
dot_product_subtract(
"ab_start_1_0", "ab_start_1_1",
"ab_start_0_0", "ab_start_0_1",
"new_v_1", "new_u_1",
"new_ab_1_0", "new_ab_1_1"
);
APPEND_M(str( "MOV RAX, -1" ));
APPEND_M(str( "ADD RAX, `new_parity" )); //rax=(new_parity==1)? 0 : ~0
APPEND_M(str( "NOT RAX" )); //rax=(new_parity==1)? ~0 : 0
//if (new_parity!=0) { [out1:out0]=-[out1:out0]; }
auto conditional_negate=[&](string out0, string out1) {
//flip all bits if new_parity==1
APPEND_M(str( "XOR `#, RAX", out0 ));
APPEND_M(str( "XOR `#, RAX", out1 ));
//add 1 if new_parity==1
APPEND_M(str( "ADD `#, `new_parity", out0 ));
APPEND_M(str( "ADC `#, 0", out1 ));
};
conditional_negate( "new_ab_0_0", "new_ab_0_1" );
conditional_negate( "new_ab_1_0", "new_ab_1_1" );
}
//11x scalar: new_ab, new_u, new_v, new_parity, ab_threshold
reg_scalar ab_threshold_0=regs_parent.bind_scalar(m, "ab_threshold_0");
reg_scalar ab_threshold_1=regs_parent.bind_scalar(m, "ab_threshold_1");
//flags for [a1:a0]-[b1:b0]:
//CMP a0,b0 ; sets CF if b0>a0. clears CF if b0==a0
//SBB a1,b1 ; sets CF if b>a. sets ZF if b==a. may set ZF if b<a (e.g. a1==0; b1==0; b0<a0)
//CF set: a<b
//CF cleared: a>=b
//need to swap the order for <= and >
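//Reference sketch (plain C++, for illustration only): the CMP/SBB borrow chain described above
//is the unsigned 128-bit comparison; CF after the SBB matches this predicate.
// bool less_than_128(uint64 a0, uint64 a1, uint64 b0, uint64 b1) {
//     //true iff [a1:a0] < [b1:b0]
//     return (a1<b1) || (a1==b1 && a0<b0);
// }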
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//15x scalar
reg_scalar ab_delta_0=regs.bind_scalar(m, "ab_delta_0");
reg_scalar ab_delta_1=regs.bind_scalar(m, "ab_delta_1");
reg_scalar b_new_min=regs.bind_scalar(m, "b_new_min");
reg_scalar is_lehmer=regs.bind_scalar(m, "is_lehmer");
APPEND_M(str( "MOV `is_lehmer, `spill_is_lehmer" ));
//uint128 ab_delta=new_ab[0]-new_ab[1]
APPEND_M(str( "MOV `ab_delta_0, `new_ab_0_0" ));
APPEND_M(str( "MOV `ab_delta_1, `new_ab_0_1" ));
APPEND_M(str( "SUB `ab_delta_0, `new_ab_1_0" ));
APPEND_M(str( "SBB `ab_delta_1, `new_ab_1_1" ));
// assert(a_new>=b_new);
// uint128 ab_delta=a_new-b_new;
//
// even:
// +uv_00 -uv_01
// -uv_10 +uv_11
//
// uint128 v_delta=uint128(v_1)+uint128(v_0); //even: positive. odd: negative
// uint128 u_delta=uint128(u_1)+uint128(u_0); //even: negative. odd: positive
//
// uv_10 is negative if even, positive if odd
// uv_11 is positive if even, negative if odd
// bool passed_even=(b_new>=uint128(u_1) && ab_delta>=v_delta);
// bool passed_odd=(b_new>=uint128(v_1) && ab_delta>=u_delta);
//uint64 uv_delta_0=(even)? new_v_1 : new_u_1;
//uv_delta_0 stored in ab_threshold_0
APPEND_M(str( "CMP `new_parity, 0" ));
APPEND_M(str( "MOV `ab_threshold_0, `new_u_1" ));
APPEND_M(str( "CMOVE `ab_threshold_0, `new_v_1" ));
//uint64 uv_delta_1=(even)? new_v_0 : new_u_0;
//uv_delta_1 stored in ab_threshold_1
APPEND_M(str( "MOV `ab_threshold_1, `new_u_0" ));
APPEND_M(str( "CMOVE `ab_threshold_1, `new_v_0" ));
//uint64 b_new_min=(even)? new_u_1 : new_v_1;
APPEND_M(str( "MOV `b_new_min, `new_v_1" ));
APPEND_M(str( "CMOVE `b_new_min, `new_u_1" ));
//if (!is_lehmer) uv_delta=0
APPEND_M(str( "CMP `is_lehmer, 0" ));
APPEND_M(str( "CMOVE `ab_threshold_0, `is_lehmer" )); //if moved, is_lehmer==0
APPEND_M(str( "CMOVE `ab_threshold_1, `is_lehmer" ));
//if (!is_lehmer) b_new_min=0
APPEND_M(str( "CMOVE `b_new_min, `is_lehmer" ));
//[uv_delta_1:uv_delta_0]=uv_delta_0 + uv_delta_1 //v_delta if even, else u_delta
APPEND_M(str( "ADD `ab_threshold_0, `ab_threshold_1" ));
APPEND_M(str( "MOV `ab_threshold_1, 0" ));
APPEND_M(str( "ADC `ab_threshold_1, 0" ));
//if (ab_delta<uv_delta) goto exit
//clobbers ab_delta
//uv_delta (ab_threshold) not needed anymore
APPEND_M(str( "SUB `ab_delta_0, `ab_threshold_0" ));
APPEND_M(str( "SBB `ab_delta_1, `ab_threshold_1" ));
APPEND_M(str( "JC #", track_asm( "gcd_128 lehmer fail ab_delta<uv_delta", exit_label ) ));
//if (new_ab[1]<b_new_min) goto exit
//clobbers b_new_min
APPEND_M(str( "CMP `new_ab_1_0, `b_new_min" ));
APPEND_M(str( "MOV `b_new_min, `new_ab_1_1" ));
APPEND_M(str( "SBB `b_new_min, 0" ));
APPEND_M(str( "JC #", track_asm( "gcd_128 lehmer fail new_ab[1]<b_new_min", exit_label ) ));
//
//
APPEND_M(str( "MOV `ab_threshold_0, `spill_ab_threshold_0" ));
APPEND_M(str( "MOV `ab_threshold_1, `spill_ab_threshold_1" ));
//if (ab_threshold>=new_ab[0]) goto exit;
APPEND_M(str( "MOV `ab_delta_0, `ab_threshold_0" ));
APPEND_M(str( "MOV `ab_delta_1, `ab_threshold_1" ));
APPEND_M(str( "SUB `ab_delta_0, `new_ab_0_0" ));
APPEND_M(str( "SBB `ab_delta_1, `new_ab_0_1" ));
APPEND_M(str( "JNC #", track_asm( "gcd_128 went too far ab_threshold>=new_ab[0]", exit_label ) ));
//u=new_u;
APPEND_M(str( "MOV `spill_u_0, `new_u_0" ));
APPEND_M(str( "MOV `spill_u_1, `new_u_1" ));
//v=new_v;
APPEND_M(str( "MOV `spill_v_0, `new_v_0" ));
APPEND_M(str( "MOV `spill_v_1, `new_v_1" ));
//parity=new_parity;
APPEND_M(str( "MOV `spill_parity, `new_parity" ));
track_asm( "gcd_128 good iter" );
//--iter;
//if (iter==0) goto exit;
APPEND_M(str( "MOV `ab_delta_0, `spill_iter" ));
APPEND_M(str( "DEC `ab_delta_0" ));
APPEND_M(str( "MOV `spill_iter, `ab_delta_0" ));
APPEND_M(str( "JZ #", track_asm( "gcd_128 good exit", exit_iter_0_label ) ));
}
APPEND_M(str( "#:", start_label ));
//11x scalar: new_ab, new_u, new_v, new_parity, ab_threshold
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//4x scalar
reg_scalar tmp_0=regs.bind_scalar(m, "tmp_0", reg_rax);
reg_scalar tmp_1=regs.bind_scalar(m, "tmp_1", reg_rdx);
reg_scalar tmp_2=regs.bind_scalar(m, "tmp_2");
reg_scalar tmp_3=regs.bind_scalar(m, "tmp_3", reg_rcx);
reg_scalar ab_0_0=new_ab_0_0;
reg_scalar ab_0_1=new_ab_0_1;
reg_scalar ab_1_0=new_ab_1_0;
reg_scalar ab_1_1=new_ab_1_1;
m.bind(new_ab_0_0, "ab_0_0");
m.bind(new_ab_0_1, "ab_0_1");
m.bind(new_ab_1_0, "ab_1_0");
m.bind(new_ab_1_1, "ab_1_1");
m.bind(ab_threshold_0, "ab_threshold_0");
m.bind(ab_threshold_1, "ab_threshold_1");
//tmp_3=0
APPEND_M(str( "XOR `tmp_3, `tmp_3" ));
//tmp=ab_1-ab_threshold
APPEND_M(str( "MOV `tmp_0, `ab_1_0" ));
APPEND_M(str( "MOV `tmp_1, `ab_1_1" ));
APPEND_M(str( "SUB `tmp_0, `ab_threshold_0" ));
APPEND_M(str( "SBB `tmp_1, `ab_threshold_1" ));
//if (ab[1]<ab_threshold) goto exit
APPEND_M(str( "JC #", track_asm( "gcd_128 ab[1]<ab_threshold", exit_label ) ));
//if (ab[1]==ab_threshold) goto exit
APPEND_M(str( "MOV `tmp_2, `tmp_0" ));
APPEND_M(str( "OR `tmp_2, `tmp_1" )); //ZF set if tmp_0==0 and tmp_1==0
APPEND_M(str( "JZ #", track_asm( "gcd_128 ab[1]==ab_threshold", exit_label ) ));
//tmp_0=(ab[0][1]==0)? ab[0][0] : ab[0][1]
//tmp_1=(ab[0][1]==0)? 0 : 64
//tmp_0 can't be 0
APPEND_M(str( "MOV `tmp_0, `ab_0_1" ));
APPEND_M(str( "MOV `tmp_1, 64" ));
APPEND_M(str( "CMP `ab_0_1, 0" ));
#ifdef CHIAOSX
string cmoveq_label1=m.alloc_label();
APPEND_M(str( "JNE #", cmoveq_label1));
APPEND_M(str( "MOV `tmp_0, `ab_0_0" ));
APPEND_M(str("#:", cmoveq_label1));
string cmoveq_label2=m.alloc_label();
APPEND_M(str( "JNE #", cmoveq_label2));
APPEND_M(str( "MOV `tmp_1, `tmp_3" ));
APPEND_M(str("#:", cmoveq_label2));
#else
APPEND_M(str( "CMOVEQ `tmp_0, `ab_0_0" ));
APPEND_M(str( "CMOVEQ `tmp_1, `tmp_3" ));
#endif
//tmp_0=[first set bit index in tmp_0]
APPEND_M(str( "BSR `tmp_0, `tmp_0" ));
//tmp_1=[number of bits in ab[0]]=a_num_bits
APPEND_M(str( "ADD `tmp_1, `tmp_0" ));
APPEND_M(str( "INC `tmp_1" ));
//if (is_lehmer) {
// const int min_bits=96;
// if (a_num_bits<min_bits) {
// a_num_bits=min_bits;
// }
//}
//tmp_2=spill_is_lehmer
//tmp_0=((spill_is_lehmer)? 96 : 0)=min_bits
APPEND_M(str( "XOR `tmp_0, `tmp_0" ));
APPEND_M(str( "MOV `tmp_2, `spill_is_lehmer" ));
APPEND_M(str( "CMP `tmp_2, 0" ));
APPEND_M(str( "MOV `tmp_3, 96" ));
APPEND_M(str( "CMOVNE `tmp_0, `tmp_3" ));
APPEND_M(str( "XOR `tmp_3, `tmp_3" ));
//if (a_num_bits<min_bits) a_num_bits=min_bits;
APPEND_M(str( "CMP `tmp_1, `tmp_0" ));
APPEND_M(str( "CMOVB `tmp_1, `tmp_0" ));
//int shift_amount=a_num_bits-gcd_base_bits; [shift amount can't exceed 128-gcd_base_bits]
//if (shift_amount<0) {
// shift_amount=0;
//}
//tmp_1=a_num_bits-gcd_base_bits
APPEND_M(str( "SUB `tmp_1, #", to_hex(gcd_base_bits) ));
//if (a_num_bits<gcd_base_bits) tmp_1=0
//tmp_1=shift_amount
APPEND_M(str( "CMOVB `tmp_1, `tmp_3" ));
//vector_is_lehmer=((spill_is_lehmer | shift_amount)!=0)? <~0, ~0> : <0, 0>
APPEND_M(str( "OR `tmp_2, `tmp_1" ));
if (!use_divide_table) {
#ifdef CHIAOSX
APPEND_M(str( "LEA `tmp_3, [RIP+#]", constant_address_uint64(0ull, 0ull, false) ));
APPEND_M(str( "LEA `tmp_0, [RIP+#]", constant_address_uint64(~(0ull), ~(0ull), false) ));
#else
APPEND_M(str( "MOV `tmp_3, OFFSET FLAT:#", constant_address_uint64(0ull, 0ull, false) ));
APPEND_M(str( "MOV `tmp_0, OFFSET FLAT:#", constant_address_uint64(~(0ull), ~(0ull), false) ));
#endif
} else {
#ifdef CHIAOSX
APPEND_M(str( "LEA `tmp_3, [RIP+#]", constant_address_uint64(gcd_mask_exact[0], gcd_mask_exact[1], false) ));
APPEND_M(str( "LEA `tmp_0, [RIP+#]", constant_address_uint64(gcd_mask_approximate[0], gcd_mask_approximate[1], false) ));
#else
APPEND_M(str( "MOV `tmp_3, OFFSET FLAT:#", constant_address_uint64(gcd_mask_exact[0], gcd_mask_exact[1], false) ));
APPEND_M(str( "MOV `tmp_0, OFFSET FLAT:#", constant_address_uint64(gcd_mask_approximate[0], gcd_mask_approximate[1], false) ));
#endif
}
APPEND_M(str( "CMOVZ `tmp_0, `tmp_3" ));
APPEND_M(str( "MOVAPD `vector_is_lehmer, [`tmp_0]" ));
//vector2 ab_double{
// double(uint64(ab[0]>>shift_amount)),
// double(uint64(ab[1]>>shift_amount))
//};
//double ab_threshold_double(uint64(ab_threshold>>shift_amount));
//if (shift_amount!=0) {
// ++ab_threshold_double; [can do this with integers because the shifted ab_threshold has to fit in a double exactly]
// a is larger than ab_threshold
//}
//vector_ab=<ab_1>>shift_amount, undefined>
//also store integer in new_u_1
shift_right(regs, {ab_1_0, ab_1_1}, tmp_1, new_u_1, tmp_3, tmp_2);
if (!use_divide_table) {
APPEND_M(str( "CVTSI2SD `vector_ab, `new_u_1" ));
}
//vector_ab=<ab_1>>shift_amount, ab_1>>shift_amount>
if (!use_divide_table) {
APPEND_M(str( "SHUFPD `vector_ab, `vector_ab, 0" ));
}
//vector_ab=<ab_0>>shift_amount, ab_1>>shift_amount>
//also store integer in new_u_0
shift_right(regs, {ab_0_0, ab_0_1}, tmp_1, new_u_0, tmp_3, tmp_2);
if (!use_divide_table) {
APPEND_M(str( "CVTSI2SD `vector_ab, `new_u_0" ));
}
//new_v_0=(ab_threshold>>shift_amount)
shift_right(regs, {ab_threshold_0, ab_threshold_1}, tmp_1, new_v_0, tmp_3, tmp_2);
//vector_ab_threshold=<ab_threshold_double, ab_threshold_double>
if (!use_divide_table) {
APPEND_M(str( "CVTSI2SD `vector_ab_threshold, `new_v_0" ));
APPEND_M(str( "SHUFPD `vector_ab_threshold, `vector_ab_threshold, 0" ));
}
}
APPEND_M(str( "JMP #", loop_label ));
//
//
APPEND_M(str( "#:", exit_label ));
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
reg_scalar tmp=regs.bind_scalar(m, "tmp");
//if (iter==gcd_128_max_iter) goto no_progress
APPEND_M(str( "MOV `tmp, `spill_iter" ));
APPEND_M(str( "CMP `tmp, #", to_hex(gcd_128_max_iter) ));
APPEND_M(str( "JE #", track_asm( "gcd_128 no progress", no_progress_label ) ));
}
APPEND_M(str( "JMP #", track_asm( "gcd_128 premature exit", exit_iter_0_label ) ));
//
//
APPEND_M(str( "#:", start_assign_label ));
APPEND_M(str( "MOV `new_ab_0_0, `spill_ab_start_0_0" ));
APPEND_M(str( "MOV `new_ab_0_1, `spill_ab_start_0_1" ));
APPEND_M(str( "MOV `new_ab_1_0, `spill_ab_start_1_0" ));
APPEND_M(str( "MOV `new_ab_1_1, `spill_ab_start_1_1" ));
APPEND_M(str( "MOV `ab_threshold_0, `spill_ab_threshold_0" ));
APPEND_M(str( "MOV `ab_threshold_1, `spill_ab_threshold_1" ));
APPEND_M(str( "JMP #", start_label ));
//
//
APPEND_M(str( "#:", exit_iter_0_label ));
}
}