// chia-blockchain/lib/chiavdf/fast_vdf/asm_gcd_128.h

namespace asm_code {
typedef array<reg_scalar, 2> reg_scalar_128;
//v[0] is low, v[1] is high. amount is >=0 and <128. res can't alias with v
//preserves inputs. returns low part of result
//regs: RCX, 1x scalar
void shift_right(
reg_alloc regs, array<reg_scalar, 2> v, reg_scalar amount, reg_scalar res,
reg_scalar tmp_rcx, reg_scalar tmp_res_2
) {
EXPAND_MACROS_SCOPE;
m.bind(v, "v");
m.bind(amount, "amount");
m.bind(res, "res");
assert(tmp_rcx.value==reg_rcx.value);
m.bind(tmp_res_2, "res_2");
//res=uint64([v[1]:v[0]] >> amount) ; undefined if amount>=64
APPEND_M(str( "MOV RCX, `amount" ));
APPEND_M(str( "MOV `res, `v_0" ));
APPEND_M(str( "SHRD `res, `v_1, CL" ));
//res_2=0
APPEND_M(str( "XOR `res_2, `res_2" ));
//RCX=amount-64
APPEND_M(str( "SUB RCX, 64" ));
//res=(amount>=64)? 0 : res
//res_2=(amount>=64)? v[1] : 0
APPEND_M(str( "CMOVAE `res, `res_2" ));
APPEND_M(str( "CMOVAE `res_2, `v_1" ));
//res_2=(amount>=64)? v[1]>>(amount-64) : 0
APPEND_M(str( "SHR `res_2, CL" ));
//res=(amount>=64)? res_2 : res
APPEND_M(str( "OR `res, `res_2" ));
}
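//Reference sketch (plain C++, for illustration only; assumes uint64 is an unsigned 64-bit typedef):
//the generated code above computes the low 64 bits of the 128-bit shift [v1:v0]>>amount for 0<=amount<128.
// uint64 shift_right_ref(uint64 v0, uint64 v1, uint64 amount) {
//     if (amount>=64) return v1>>(amount-64);
//     if (amount==0) return v0;
//     return (v0>>amount) | (v1<<(64-amount));
// }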
//all inputs are unsigned
void dot_product_exact(reg_alloc regs, array<reg_scalar, 2> a, array<reg_scalar, 2> b, reg_scalar out, string overflow_label) {
EXPAND_MACROS_SCOPE;
m.bind(a, "a");
m.bind(b, "b");
m.bind(out, "out");
reg_scalar rax=regs.bind_scalar(m, "rax", reg_rax);
reg_scalar rdx=regs.bind_scalar(m, "rdx", reg_rdx);
//out=a0*b0
APPEND_M(str( "MOV RAX, `a_0" ));
APPEND_M(str( "MUL `b_0" ));
APPEND_M(str( "JC #", overflow_label ));
APPEND_M(str( "MOV `out, RAX" ));
//RAX=a1*b1
APPEND_M(str( "MOV RAX, `a_1" ));
APPEND_M(str( "MUL `b_1" ));
APPEND_M(str( "JC #", overflow_label ));
//out=a0*b0+a1*b1
APPEND_M(str( "ADD `out, RAX" ));
APPEND_M(str( "JC #", overflow_label ));
}
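//Reference sketch (plain C++ with the GCC/Clang overflow builtins, for illustration only):
//the generated code above computes out=a[0]*b[0]+a[1]*b[1] and jumps to overflow_label if any
//intermediate result does not fit in a uint64.
// bool dot_product_exact_ref(uint64 a0, uint64 a1, uint64 b0, uint64 b1, uint64& out) {
//     uint64 p0, p1;
//     if (__builtin_mul_overflow(a0, b0, &p0)) return false; //a0*b0 overflows
//     if (__builtin_mul_overflow(a1, b1, &p1)) return false; //a1*b1 overflows
//     return !__builtin_add_overflow(p0, p1, &out);          //false if the sum overflows
// }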
//ab and ab_threshold reg_spill are 16 bytes (lsb first), 8 byte aligned. all others are 8 bytes
//parity is 1 if odd, else 0
//is_lehmer is 1 if true, else 0
//u, v, and parity are outputs
//regs: 15x scalar, 16x vector (i.e. all of the registers except RSP)
void gcd_128(
reg_alloc regs_parent,
array<reg_spill, 2> spill_ab_start, array<reg_spill, 2> spill_u, array<reg_spill, 2> spill_v,
reg_spill spill_parity, reg_spill spill_is_lehmer, reg_spill spill_ab_threshold,
string no_progress_label
) {
EXPAND_MACROS_SCOPE_PUBLIC;
track_asm( "gcd_128" );
m.bind(spill_ab_start[0], "spill_ab_start_0_0");
m.bind(spill_ab_start[0]+8, "spill_ab_start_0_1");
m.bind(spill_ab_start[1], "spill_ab_start_1_0");
m.bind(spill_ab_start[1]+8, "spill_ab_start_1_1");
m.bind(spill_u, "spill_u");
m.bind(spill_v, "spill_v");
m.bind(spill_parity, "spill_parity");
m.bind(spill_is_lehmer, "spill_is_lehmer");
m.bind(spill_ab_threshold, "spill_ab_threshold_0");
m.bind(spill_ab_threshold+8, "spill_ab_threshold_1");
reg_vector vector_ab=regs_parent.bind_vector(m, "vector_ab");
reg_vector vector_u=regs_parent.bind_vector(m, "vector_u");
reg_vector vector_v=regs_parent.bind_vector(m, "vector_v");
reg_vector vector_is_lehmer=regs_parent.bind_vector(m, "vector_is_lehmer");
reg_vector vector_ab_threshold=regs_parent.bind_vector(m, "vector_ab_threshold");
reg_spill spill_iter=regs_parent.bind_spill(m, "spill_iter");
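//initial state: u=<1,0>, v=<0,1> (identity transform), parity=0 (even), iter=gcd_128_max_iter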
APPEND_M(str( "MOV QWORD PTR `spill_u_0, 1" ));
APPEND_M(str( "MOV QWORD PTR `spill_u_1, 0" ));
APPEND_M(str( "MOV QWORD PTR `spill_v_0, 0" ));
APPEND_M(str( "MOV QWORD PTR `spill_v_1, 1" ));
APPEND_M(str( "MOV QWORD PTR `spill_parity, 0" ));
APPEND_M(str( "MOV QWORD PTR `spill_iter, #", to_hex(gcd_128_max_iter) ));
string start_label=m.alloc_label();
string loop_label=m.alloc_label();
string exit_label=m.alloc_label();
string exit_iter_0_label=m.alloc_label();
string start_assign_label=m.alloc_label();
APPEND_M(str( "JMP #", start_assign_label ));
APPEND_M(str( "#:", loop_label ));
track_asm( "gcd_128 iter" );
//4x scalar
reg_scalar new_u_0=regs_parent.bind_scalar(m, "new_u_0"); //a
reg_scalar new_u_1=regs_parent.bind_scalar(m, "new_u_1"); //b
reg_scalar new_v_0=regs_parent.bind_scalar(m, "new_v_0"); //ab_threshold
reg_scalar new_v_1=regs_parent.bind_scalar(m, "new_v_1"); //base iter
if (use_divide_table) {
string base_exit_label=m.alloc_label();
string base_loop_label=m.alloc_label();
APPEND_M(str( "MOV `new_v_1, #", to_hex(gcd_base_max_iter_divide_table) ));
APPEND_M(str( "MOVDQA `vector_u, #", constant_address_uint64(1ull, 0ull) ));
APPEND_M(str( "MOVDQA `vector_v, #", constant_address_uint64(0ull, 1ull) ));
APPEND_M(str( "#:", base_loop_label ));
gcd_64_iteration(regs_parent, vector_is_lehmer, {new_u_0, new_u_1}, {vector_u, vector_v}, new_v_0, base_exit_label);
APPEND_M(str( "DEC `new_v_1" ));
APPEND_M(str( "JNZ #", base_loop_label ));
APPEND_M(str( "#:", base_exit_label ));
APPEND_M(str( "CMP `new_v_1, #", to_hex(gcd_base_max_iter_divide_table) ));
APPEND_M(str( "JE #", track_asm( "gcd_128 base no progress", exit_label ) ));
} else {
gcd_base_continued_fraction(
regs_parent, vector_ab, vector_u, vector_v, vector_is_lehmer, vector_ab_threshold,
track_asm( "gcd_128 base no progress", exit_label )
);
}
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//12x scalar (including dot product exact which is 2x scalar)
reg_scalar m_0_0=regs.bind_scalar(m, "m_0_0");
reg_scalar m_0_1=regs.bind_scalar(m, "m_0_1");
reg_scalar m_1_0=regs.bind_scalar(m, "m_1_0");
reg_scalar m_1_1=regs.bind_scalar(m, "m_1_1");
reg_scalar tmp_0=regs.bind_scalar(m, "tmp_0");
reg_scalar tmp_1=regs.bind_scalar(m, "tmp_1");
reg_vector tmp_a=regs.bind_vector(m, "tmp_a");
reg_vector tmp_b=regs.bind_vector(m, "tmp_b");
reg_vector tmp_c=regs.bind_vector(m, "tmp_c");
reg_vector c_double_abs_mask=regs.bind_vector(m, "double_abs_mask");
if (!use_divide_table) {
APPEND_M(str( "MOVAPD `double_abs_mask, #", constant_address_uint64(double_abs_mask, double_abs_mask) ));
}
auto abs_tmp_a=[&]() {
if (use_divide_table) {
//tmp_b = int64 mask = int64(v)>>63;
APPEND_M(str( "MOVDQA `tmp_b, `tmp_a" ));
APPEND_M(str( "PSRAD `tmp_b, 32" )); //high 32 bits = sign bit ; low 32 bits = undefined
APPEND_M(str( "PSHUFD `tmp_b, `tmp_b, #", to_hex( 0b11110101 ) )); //move high 32 bits to low 32 bits
//abs_v=(v + mask) ^ mask;
APPEND_M(str( "PADDQ `tmp_a, `tmp_b" ));
APPEND_M(str( "PXOR `tmp_a, `tmp_b" ));
} else {
APPEND_M(str( "PAND `tmp_a, `double_abs_mask" ));
}
};
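//Reference sketch (plain C++, for illustration only; assumes int64/uint64 are 64-bit typedefs):
//the divide-table branch above is the usual branchless absolute value of a signed 64-bit lane.
// uint64 abs_ref(int64 v) {
//     int64 mask=v>>63;             //0 if v>=0, -1 if v<0 (arithmetic shift)
//     return uint64((v+mask)^mask); //(v+mask)^mask == |v|
// }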
auto mov_low_tmp_a=[&](string target) {
if (use_divide_table) {
APPEND_M(str( "MOVQ `#, `tmp_a", target ));
} else {
APPEND_M(str( "CVTTSD2SI `#, `tmp_a", target ));
}
};
//<m_0_0, m_1_0>=<abs(vector_u[0]), abs(vector_u[1])>
//for the divide table, this is u[0] and v[0]
APPEND_M(str( "MOVAPD `tmp_a, `vector_u" ));
abs_tmp_a();
mov_low_tmp_a( (use_divide_table)? "m_0_0" : "m_0_0" );
APPEND_M(str( "SHUFPD `tmp_a, `tmp_a, 3" ));
mov_low_tmp_a( (use_divide_table)? "m_0_1" : "m_1_0" );
//<m_1_0, m_1_1>=<abs(vector_v[0]), abs(vector_v[1])>
//for the divide table, this is u[1] and v[1]
APPEND_M(str( "MOVAPD `tmp_a, `vector_v" ));
abs_tmp_a();
mov_low_tmp_a( (use_divide_table)? "m_1_0" : "m_0_1" );
APPEND_M(str( "SHUFPD `tmp_a, `tmp_a, 3" ));
mov_low_tmp_a( (use_divide_table)? "m_1_1" : "m_1_1" );
APPEND_M(str( "MOV `tmp_0, `spill_u_0" ));
APPEND_M(str( "MOV `tmp_1, `spill_u_1" ));
dot_product_exact(regs, {m_0_0, m_0_1}, {tmp_0, tmp_1}, new_u_0, track_asm( "gcd_128 uv overflow", exit_label ));
dot_product_exact(regs, {m_1_0, m_1_1}, {tmp_0, tmp_1}, new_u_1, track_asm( "gcd_128 uv overflow", exit_label ));
APPEND_M(str( "MOV `tmp_0, `spill_v_0" ));
APPEND_M(str( "MOV `tmp_1, `spill_v_1" ));
dot_product_exact(regs, {m_0_0, m_0_1}, {tmp_0, tmp_1}, new_v_0, track_asm( "gcd_128 uv overflow", exit_label ));
dot_product_exact(regs, {m_1_0, m_1_1}, {tmp_0, tmp_1}, new_v_1, track_asm( "gcd_128 uv overflow", exit_label ));
}
//9x scalar
reg_scalar new_ab_0_0=regs_parent.bind_scalar(m, "new_ab_0_0");
reg_scalar new_ab_0_1=regs_parent.bind_scalar(m, "new_ab_0_1");
reg_scalar new_ab_1_0=regs_parent.bind_scalar(m, "new_ab_1_0");
reg_scalar new_ab_1_1=regs_parent.bind_scalar(m, "new_ab_1_1");
reg_scalar new_parity=regs_parent.bind_scalar(m, "new_parity");
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//15x scalar
reg_scalar rax=regs.bind_scalar(m, "rax", reg_rax);
reg_scalar rdx=regs.bind_scalar(m, "rdx", reg_rdx);
reg_vector tmp_a=regs.bind_vector(m, "tmp_a");
reg_scalar ab_start_0_0=regs.bind_scalar(m, "ab_start_0_0");
reg_scalar ab_start_0_1=regs.bind_scalar(m, "ab_start_0_1");
reg_scalar ab_start_1_0=regs.bind_scalar(m, "ab_start_1_0");
reg_scalar ab_start_1_1=regs.bind_scalar(m, "ab_start_1_1");
APPEND_M(str( "MOV `ab_start_0_0, `spill_ab_start_0_0" ));
APPEND_M(str( "MOV `ab_start_0_1, `spill_ab_start_0_1" ));
APPEND_M(str( "MOV `ab_start_1_0, `spill_ab_start_1_0" ));
APPEND_M(str( "MOV `ab_start_1_1, `spill_ab_start_1_1" ));
//RAX=(uv_double[1][1]<0)? 1 : 0=uv_double_parity
//(this also works for integers with the divide table)
APPEND_M(str( "MOVAPD `tmp_a, `vector_v" ));
APPEND_M(str( "SHUFPD `tmp_a, `tmp_a, 3" ));
APPEND_M(str( "MOVQ RAX, `tmp_a" ));
APPEND_M(str( "SHR RAX, 63" ));
//new_parity=spill_parity^uv_double_parity
APPEND_M(str( "MOV `new_parity, `spill_parity" ));
APPEND_M(str( "XOR `new_parity, RAX" ));
//[out1:out0]=[a1:a0]*u - [b1:b0]*v
auto dot_product_subtract=[&](string a0, string a1, string b0, string b1, string u, string v, string out0, string out1) {
//[RDX:RAX]=a0*u
APPEND_M(str( "MOV RAX, `#", a0 ));
APPEND_M(str( "MUL `#", u ));
//[out1:out0]=a0*u
APPEND_M(str( "MOV `#, RAX", out0 ));
APPEND_M(str( "MOV `#, RDX", out1 ));
//[RDX:RAX]=a1*u
APPEND_M(str( "MOV RAX, `#", a1 ));
APPEND_M(str( "MUL `#", u ));
//[out1:out0]=a0*u + (a1*u)<<64=a*u
APPEND_M(str( "ADD `#, RAX", out1 ));
//[RDX:RAX]=b0*v
APPEND_M(str( "MOV RAX, `#", b0 ));
APPEND_M(str( "MUL `#", v ));
//[out1:out0]=a*u - b0*v
APPEND_M(str( "SUB `#, RAX", out0 ));
APPEND_M(str( "SBB `#, RDX", out1 ));
//[RDX:RAX]=b1*v
APPEND_M(str( "MOV RAX, `#", b1 ));
APPEND_M(str( "MUL `#", v ));
//[out1:out0]=a*u - b0*v - (b1*v)<<64=a*u - b*v
APPEND_M(str( "SUB `#, RAX", out1 ));
};
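//Reference sketch (plain C++ with unsigned __int128, for illustration only): the lambda above
//computes a*u - b*v truncated to 128 bits, with the same wrap-around behavior as the asm.
// void dot_product_subtract_ref(uint64 a0, uint64 a1, uint64 b0, uint64 b1,
//                               uint64 u, uint64 v, uint64& out0, uint64& out1) {
//     unsigned __int128 a=((unsigned __int128)a1<<64)|a0;
//     unsigned __int128 b=((unsigned __int128)b1<<64)|b0;
//     unsigned __int128 r=a*u-b*v; //wraps mod 2^128
//     out0=uint64(r); out1=uint64(r>>64);
// }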
// uint64 uv_00=uv_uint64_new[0][0];
// uint64 uv_01=uv_uint64_new[0][1];
// int128 a_new_1=ab_start[0]; a_new_1*=uv_00;
// int128 a_new_2=ab_start[1]; a_new_2*=uv_01;
// if (uv_uint64_parity_new!=0) swap(a_new_1, a_new_2);
// int128 a_new_s=a_new_1-a_new_2;
// uint128 a_new(a_new_s);
dot_product_subtract(
"ab_start_0_0", "ab_start_0_1",
"ab_start_1_0", "ab_start_1_1",
"new_u_0", "new_v_0",
"new_ab_0_0", "new_ab_0_1"
);
// uint64 uv_10=uv_uint64_new[1][0];
// uint64 uv_11=uv_uint64_new[1][1];
// int128 b_new_1=ab_start[1]; b_new_1*=uv_11;
// int128 b_new_2=ab_start[0]; b_new_2*=uv_10;
// if (uv_uint64_parity_new!=0) swap(b_new_1, b_new_2);
// int128 b_new_s=b_new_1-b_new_2;
// uint128 b_new(b_new_s);
dot_product_subtract(
"ab_start_1_0", "ab_start_1_1",
"ab_start_0_0", "ab_start_0_1",
"new_v_1", "new_u_1",
"new_ab_1_0", "new_ab_1_1"
);
APPEND_M(str( "MOV RAX, -1" ));
APPEND_M(str( "ADD RAX, `new_parity" )); //rax=(new_parity==1)? 0 : ~0
APPEND_M(str( "NOT RAX" )); //rax=(new_parity==1)? ~0 : 0
//if (new_parity!=0) { [out1:out0]=-[out1:out0]; }
auto conditional_negate=[&](string out0, string out1) {
//flip all bits if new_parity==1
APPEND_M(str( "XOR `#, RAX", out0 ));
APPEND_M(str( "XOR `#, RAX", out1 ));
//add 1 if new_parity==1
APPEND_M(str( "ADD `#, `new_parity", out0 ));
APPEND_M(str( "ADC `#, 0", out1 ));
};
conditional_negate( "new_ab_0_0", "new_ab_0_1" );
conditional_negate( "new_ab_1_0", "new_ab_1_1" );
}
//11x scalar: new_ab, new_u, new_v, new_parity, ab_threshold
reg_scalar ab_threshold_0=regs_parent.bind_scalar(m, "ab_threshold_0");
reg_scalar ab_threshold_1=regs_parent.bind_scalar(m, "ab_threshold_1");
//flags for [a1:a0]-[b1:b0]:
//CMP a0,b0 ; sets CF if b0>a0. clears CF if b0==a0
//SBB a1,b1 ; sets CF if b>a. sets ZF if b==a. may set ZF if b<a (e.g. a1==0; b1==0; b0<a0)
//CF set: a<b
//CF cleared: a>=b
//need to swap the order for <= and >
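//Reference sketch (plain C++, for illustration only): the CMP/SBB borrow chain described above
//is the unsigned 128-bit comparison; CF after the SBB matches this predicate.
// bool less_than_128(uint64 a0, uint64 a1, uint64 b0, uint64 b1) {
//     //true iff [a1:a0] < [b1:b0]
//     return (a1<b1) || (a1==b1 && a0<b0);
// }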
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//15x scalar
reg_scalar ab_delta_0=regs.bind_scalar(m, "ab_delta_0");
reg_scalar ab_delta_1=regs.bind_scalar(m, "ab_delta_1");
reg_scalar b_new_min=regs.bind_scalar(m, "b_new_min");
reg_scalar is_lehmer=regs.bind_scalar(m, "is_lehmer");
APPEND_M(str( "MOV `is_lehmer, `spill_is_lehmer" ));
//uint128 ab_delta=new_ab[0]-new_ab[1]
APPEND_M(str( "MOV `ab_delta_0, `new_ab_0_0" ));
APPEND_M(str( "MOV `ab_delta_1, `new_ab_0_1" ));
APPEND_M(str( "SUB `ab_delta_0, `new_ab_1_0" ));
APPEND_M(str( "SBB `ab_delta_1, `new_ab_1_1" ));
// assert(a_new>=b_new);
// uint128 ab_delta=a_new-b_new;
//
// even:
// +uv_00 -uv_01
// -uv_10 +uv_11
//
// uint128 v_delta=uint128(v_1)+uint128(v_0); //even: positive. odd: negative
// uint128 u_delta=uint128(u_1)+uint128(u_0); //even: negative. odd: positive
//
// uv_10 is negative if even, positive if odd
// uv_11 is positive if even, negative if odd
// bool passed_even=(b_new>=uint128(u_1) && ab_delta>=v_delta);
// bool passed_odd=(b_new>=uint128(v_1) && ab_delta>=u_delta);
//uint64 uv_delta_0=(even)? new_v_1 : new_u_1;
//uv_delta_0 stored in ab_threshold_0
APPEND_M(str( "CMP `new_parity, 0" ));
APPEND_M(str( "MOV `ab_threshold_0, `new_u_1" ));
APPEND_M(str( "CMOVE `ab_threshold_0, `new_v_1" ));
//uint64 uv_delta_1=(even)? new_v_0 : new_u_0;
//uv_delta_1 stored in ab_threshold_1
APPEND_M(str( "MOV `ab_threshold_1, `new_u_0" ));
APPEND_M(str( "CMOVE `ab_threshold_1, `new_v_0" ));
//uint64 b_new_min=(even)? new_u_1 : new_v_1;
APPEND_M(str( "MOV `b_new_min, `new_v_1" ));
APPEND_M(str( "CMOVE `b_new_min, `new_u_1" ));
//if (!is_lehmer) uv_delta=0
APPEND_M(str( "CMP `is_lehmer, 0" ));
APPEND_M(str( "CMOVE `ab_threshold_0, `is_lehmer" )); //if moved, is_lehmer==0
APPEND_M(str( "CMOVE `ab_threshold_1, `is_lehmer" ));
//if (!is_lehmer) b_new_min=0
APPEND_M(str( "CMOVE `b_new_min, `is_lehmer" ));
//[uv_delta_1:uv_delta_0]=uv_delta_0 + uv_delta_1 //v_delta if even, else u_delta
APPEND_M(str( "ADD `ab_threshold_0, `ab_threshold_1" ));
APPEND_M(str( "MOV `ab_threshold_1, 0" ));
APPEND_M(str( "ADC `ab_threshold_1, 0" ));
//if (ab_delta<uv_delta) goto exit
//clobbers ab_delta
//uv_delta (ab_threshold) not needed anymore
APPEND_M(str( "SUB `ab_delta_0, `ab_threshold_0" ));
APPEND_M(str( "SBB `ab_delta_1, `ab_threshold_1" ));
APPEND_M(str( "JC #", track_asm( "gcd_128 lehmer fail ab_delta<uv_delta", exit_label ) ));
//if (new_ab[1]<b_new_min) goto exit
//clobbers b_new_min
APPEND_M(str( "CMP `new_ab_1_0, `b_new_min" ));
APPEND_M(str( "MOV `b_new_min, `new_ab_1_1" ));
APPEND_M(str( "SBB `b_new_min, 0" ));
APPEND_M(str( "JC #", track_asm( "gcd_128 lehmer fail new_ab[1]<b_new_min", exit_label ) ));
//
//
APPEND_M(str( "MOV `ab_threshold_0, `spill_ab_threshold_0" ));
APPEND_M(str( "MOV `ab_threshold_1, `spill_ab_threshold_1" ));
//if (ab_threshold>=new_ab[0]) goto exit;
APPEND_M(str( "MOV `ab_delta_0, `ab_threshold_0" ));
APPEND_M(str( "MOV `ab_delta_1, `ab_threshold_1" ));
APPEND_M(str( "SUB `ab_delta_0, `new_ab_0_0" ));
APPEND_M(str( "SBB `ab_delta_1, `new_ab_0_1" ));
APPEND_M(str( "JNC #", track_asm( "gcd_128 went too far ab_threshold>=new_ab[0]", exit_label ) ));
//u=new_u;
APPEND_M(str( "MOV `spill_u_0, `new_u_0" ));
APPEND_M(str( "MOV `spill_u_1, `new_u_1" ));
//v=new_v;
APPEND_M(str( "MOV `spill_v_0, `new_v_0" ));
APPEND_M(str( "MOV `spill_v_1, `new_v_1" ));
//parity=new_parity;
APPEND_M(str( "MOV `spill_parity, `new_parity" ));
track_asm( "gcd_128 good iter" );
//--iter;
//if (iter==0) goto exit;
APPEND_M(str( "MOV `ab_delta_0, `spill_iter" ));
APPEND_M(str( "DEC `ab_delta_0" ));
APPEND_M(str( "MOV `spill_iter, `ab_delta_0" ));
APPEND_M(str( "JZ #", track_asm( "gcd_128 good exit", exit_iter_0_label ) ));
}
APPEND_M(str( "#:", start_label ));
//11x scalar: new_ab, new_u, new_v, new_parity, ab_threshold
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
//4x scalar
reg_scalar tmp_0=regs.bind_scalar(m, "tmp_0", reg_rax);
reg_scalar tmp_1=regs.bind_scalar(m, "tmp_1", reg_rdx);
reg_scalar tmp_2=regs.bind_scalar(m, "tmp_2");
reg_scalar tmp_3=regs.bind_scalar(m, "tmp_3", reg_rcx);
reg_scalar ab_0_0=new_ab_0_0;
reg_scalar ab_0_1=new_ab_0_1;
reg_scalar ab_1_0=new_ab_1_0;
reg_scalar ab_1_1=new_ab_1_1;
m.bind(new_ab_0_0, "ab_0_0");
m.bind(new_ab_0_1, "ab_0_1");
m.bind(new_ab_1_0, "ab_1_0");
m.bind(new_ab_1_1, "ab_1_1");
m.bind(ab_threshold_0, "ab_threshold_0");
m.bind(ab_threshold_1, "ab_threshold_1");
//tmp_3=0
APPEND_M(str( "XOR `tmp_3, `tmp_3" ));
//tmp=ab_1-ab_threshold
APPEND_M(str( "MOV `tmp_0, `ab_1_0" ));
APPEND_M(str( "MOV `tmp_1, `ab_1_1" ));
APPEND_M(str( "SUB `tmp_0, `ab_threshold_0" ));
APPEND_M(str( "SBB `tmp_1, `ab_threshold_1" ));
//if (ab[1]<ab_threshold) goto exit
APPEND_M(str( "JC #", track_asm( "gcd_128 ab[1]<ab_threshold", exit_label ) ));
//if (ab[1]==ab_threshold) goto exit
APPEND_M(str( "MOV `tmp_2, `tmp_0" ));
APPEND_M(str( "OR `tmp_2, `tmp_1" )); //ZF set if tmp_0==0 and tmp_1==0
APPEND_M(str( "JZ #", track_asm( "gcd_128 ab[1]==ab_threshold", exit_label ) ));
//tmp_0=(ab[0][1]==0)? ab[0][0] : ab[0][1]
//tmp_1=(ab[0][1]==0)? 0 : 64
//tmp_0 can't be 0
APPEND_M(str( "MOV `tmp_0, `ab_0_1" ));
APPEND_M(str( "MOV `tmp_1, 64" ));
APPEND_M(str( "CMP `ab_0_1, 0" ));
#ifdef CHIAOSX
string cmoveq_label1=m.alloc_label();
APPEND_M(str( "JNE #", cmoveq_label1));
APPEND_M(str( "MOV `tmp_0, `ab_0_0" ));
APPEND_M(str("#:", cmoveq_label1));
string cmoveq_label2=m.alloc_label();
APPEND_M(str( "JNE #", cmoveq_label2));
APPEND_M(str( "MOV `tmp_1, `tmp_3" ));
APPEND_M(str("#:", cmoveq_label2));
#else
APPEND_M(str( "CMOVEQ `tmp_0, `ab_0_0" ));
APPEND_M(str( "CMOVEQ `tmp_1, `tmp_3" ));
#endif
//tmp_0=[first set bit index in tmp_0]
APPEND_M(str( "BSR `tmp_0, `tmp_0" ));
//tmp_1=[number of bits in ab[0]]=a_num_bits
APPEND_M(str( "ADD `tmp_1, `tmp_0" ));
APPEND_M(str( "INC `tmp_1" ));
//if (is_lehmer) {
// const int min_bits=96;
// if (a_num_bits<min_bits) {
// a_num_bits=min_bits;
// }
//}
//tmp_2=spill_is_lehmer
//tmp_0=((spill_is_lehmer)? 96 : 0)=min_bits
APPEND_M(str( "XOR `tmp_0, `tmp_0" ));
APPEND_M(str( "MOV `tmp_2, `spill_is_lehmer" ));
APPEND_M(str( "CMP `tmp_2, 0" ));
APPEND_M(str( "MOV `tmp_3, 96" ));
APPEND_M(str( "CMOVNE `tmp_0, `tmp_3" ));
APPEND_M(str( "XOR `tmp_3, `tmp_3" ));
//if (a_num_bits<min_bits) a_num_bits=min_bits;
APPEND_M(str( "CMP `tmp_1, `tmp_0" ));
APPEND_M(str( "CMOVB `tmp_1, `tmp_0" ));
//int shift_amount=a_num_bits-gcd_base_bits; [shift amount can't exceed 128-gcd_base_bits]
//if (shift_amount<0) {
// shift_amount=0;
//}
//tmp_1=a_num_bits-gcd_base_bits
APPEND_M(str( "SUB `tmp_1, #", to_hex(gcd_base_bits) ));
//if (a_num_bits<gcd_base_bits) tmp_1=0
//tmp_1=shift_amount
APPEND_M(str( "CMOVB `tmp_1, `tmp_3" ));
//vector_is_lehmer=((spill_is_lehmer | shift_amount)!=0)? <~0, ~0> : <0, 0>
APPEND_M(str( "OR `tmp_2, `tmp_1" ));
if (!use_divide_table) {
#ifdef CHIAOSX
APPEND_M(str( "LEA `tmp_3, [RIP+#]", constant_address_uint64(0ull, 0ull, false) ));
APPEND_M(str( "LEA `tmp_0, [RIP+#]", constant_address_uint64(~(0ull), ~(0ull), false) ));
#else
APPEND_M(str( "MOV `tmp_3, OFFSET FLAT:#", constant_address_uint64(0ull, 0ull, false) ));
APPEND_M(str( "MOV `tmp_0, OFFSET FLAT:#", constant_address_uint64(~(0ull), ~(0ull), false) ));
#endif
} else {
#ifdef CHIAOSX
APPEND_M(str( "LEA `tmp_3, [RIP+#]", constant_address_uint64(gcd_mask_exact[0], gcd_mask_exact[1], false) ));
APPEND_M(str( "LEA `tmp_0, [RIP+#]", constant_address_uint64(gcd_mask_approximate[0], gcd_mask_approximate[1], false) ));
#else
APPEND_M(str( "MOV `tmp_3, OFFSET FLAT:#", constant_address_uint64(gcd_mask_exact[0], gcd_mask_exact[1], false) ));
APPEND_M(str( "MOV `tmp_0, OFFSET FLAT:#", constant_address_uint64(gcd_mask_approximate[0], gcd_mask_approximate[1], false) ));
#endif
}
APPEND_M(str( "CMOVZ `tmp_0, `tmp_3" ));
APPEND_M(str( "MOVAPD `vector_is_lehmer, [`tmp_0]" ));
//vector2 ab_double{
// double(uint64(ab[0]>>shift_amount)),
// double(uint64(ab[1]>>shift_amount))
//};
//double ab_threshold_double(uint64(ab_threshold>>shift_amount));
//if (shift_amount!=0) {
// ++ab_threshold_double; [can do this with integers because the shifted ab_threshold has to fit in a double exactly]
// a is larger than ab_threshold
//}
//vector_ab=<ab_1>>shift_amount, undefined>
//also store integer in new_u_1
shift_right(regs, {ab_1_0, ab_1_1}, tmp_1, new_u_1, tmp_3, tmp_2);
if (!use_divide_table) {
APPEND_M(str( "CVTSI2SD `vector_ab, `new_u_1" ));
}
//vector_ab=<ab_1>>shift_amount, ab_1>>shift_amount>
if (!use_divide_table) {
APPEND_M(str( "SHUFPD `vector_ab, `vector_ab, 0" ));
}
//vector_ab=<ab_0>>shift_amount, ab_1>>shift_amount>
//also store integer in new_u_0
shift_right(regs, {ab_0_0, ab_0_1}, tmp_1, new_u_0, tmp_3, tmp_2);
if (!use_divide_table) {
APPEND_M(str( "CVTSI2SD `vector_ab, `new_u_0" ));
}
//new_v_0=(ab_threshold>>shift_amount)
shift_right(regs, {ab_threshold_0, ab_threshold_1}, tmp_1, new_v_0, tmp_3, tmp_2);
//vector_ab_threshold=<ab_threshold_double, ab_threshold_double>
if (!use_divide_table) {
APPEND_M(str( "CVTSI2SD `vector_ab_threshold, `new_v_0" ));
APPEND_M(str( "SHUFPD `vector_ab_threshold, `vector_ab_threshold, 0" ));
}
}
APPEND_M(str( "JMP #", loop_label ));
//
//
APPEND_M(str( "#:", exit_label ));
{
EXPAND_MACROS_SCOPE;
reg_alloc regs=regs_parent;
reg_scalar tmp=regs.bind_scalar(m, "tmp");
//if (iter==gcd_128_max_iter) goto no_progress
APPEND_M(str( "MOV `tmp, `spill_iter" ));
APPEND_M(str( "CMP `tmp, #", to_hex(gcd_128_max_iter) ));
APPEND_M(str( "JE #", track_asm( "gcd_128 no progress", no_progress_label ) ));
}
APPEND_M(str( "JMP #", track_asm( "gcd_128 premature exit", exit_iter_0_label ) ));
//
//
APPEND_M(str( "#:", start_assign_label ));
APPEND_M(str( "MOV `new_ab_0_0, `spill_ab_start_0_0" ));
APPEND_M(str( "MOV `new_ab_0_1, `spill_ab_start_0_1" ));
APPEND_M(str( "MOV `new_ab_1_0, `spill_ab_start_1_0" ));
APPEND_M(str( "MOV `new_ab_1_1, `spill_ab_start_1_1" ));
APPEND_M(str( "MOV `ab_threshold_0, `spill_ab_threshold_0" ));
APPEND_M(str( "MOV `ab_threshold_1, `spill_ab_threshold_1" ));
APPEND_M(str( "JMP #", start_label ));
//
//
APPEND_M(str( "#:", exit_iter_0_label ));
}
}