// NOTE(review): this text looks damaged and will not compile as-is. Line breaks appear to have been
// collapsed, so each `//` comment now swallows whatever follows it on the same physical line, and the
// text between `<` and `>` appears to be missing in many places (template parameter lists, and loop
// bounds/bodies such as `for (int x=0;x explicit fixed_integer(...)`). Restore the file from its
// original before changing any code; everything below this header is reproduced unchanged.
//
// What is visible here:
// - add_carry / sub_carry: widen a, b and carry_in to uint128, add (or subtract) them, and return the
//   result truncated to int_type. carry_out receives the result shifted right by int_type's bit width
//   (masked to one bit in sub_carry); both functions assert carry_in and carry_out are 0 or 1.
// - clz: leading-zero count for 4- or 8-byte types via __builtin_clz / __builtin_clzll; a==0 is handled
//   separately and returns 32 or 64 without calling the builtin.
// - mul_high: upper half of the double-width product (uint64 via uint128, uint32 via uint64).
// - max_constexpr: constexpr maximum of two ints.
// - fixed_integer: storage is data[size+1]; data[0] is the sign word (positive_sign is 0, negative_sign
//   is all ones) and the limbs follow in little-endian order. is_negative() is false for zero.
//   Members visible: compare, add (used by operator+ and operator-, the latter by flipping b's sign
//   word), mad (res=a*b+c with a single-limb b; a null c is read as zeros), mad_8x8, operator*=, subset,
//   limb and bit shifts, operator*, the comparison operators, is_zero, num_bits ("0" has 1 bit), window.
//   operator* still contains cross-checks against a type named `integer` and a todo about `res+=r`.
// - abs, to_uint64, to_uint32: free functions at the end; abs clears the sign via set_negative(false).
// - uint128/uint64/uint32, integer, array, vector, swap and print are not declared in this text;
//   presumably they come from elsewhere in the project (confirm against the original file).
template int_type add_carry(int_type a, int_type b, int carry_in, int& carry_out) { assert(carry_in==0 || carry_in==1); uint128 res=uint128(a) + uint128(b) + uint128(carry_in); carry_out=int(res >> (sizeof(int_type)*8)); assert(carry_out==0 || carry_out==1); return int_type(res); } template int_type sub_carry(int_type a, int_type b, int carry_in, int& carry_out) { assert(carry_in==0 || carry_in==1); uint128 res=uint128(a) - uint128(b) - uint128(carry_in); carry_out=int(res >> (sizeof(int_type)*8)) & 1; assert(carry_out==0 || carry_out==1); return int_type(res); } template int clz(int_type a) { assert(sizeof(int_type)==4 || sizeof(int_type)==8); if (a==0) { return (sizeof(int_type)==4)? 32 : 64; } else { return (sizeof(int_type)==4)? __builtin_clz(uint32(a)) : __builtin_clzll(uint64(a)); } } uint64 mul_high(uint64 a, uint64 b) { return uint64((uint128(a)*uint128(b))>>64); } uint32 mul_high(uint32 a, uint32 b) { return uint32((uint64(a)*uint64(b))>>32); } constexpr int max_constexpr(int a, int b) { if (a>b) { return a; } else { return b; } } //all "=" operators truncate ; all operators that return a separate result will pad the result as necessary template struct fixed_integer { static const type positive_sign=0; static const type negative_sign=~type(0); type data[size+1]; //little endian; sign is first fixed_integer() { for (int x=0;x explicit fixed_integer(fixed_integer t) { for (int x=0;x fixed_integer& operator=(fixed_integer t) { return *this=fixed_integer(t); } bool is_negative() const { return !is_zero() && data[0]==negative_sign; } void set_negative(bool t_negative) { data[0]=(t_negative)? negative_sign : positive_sign; } type& operator[](int pos) { assert(pos>=0 && pos=0 && posb //there is also a fast comparison in the add function, but it has a slow path static int compare( const type* a, int size_a, type sign_a, const type* b, int size_b, type sign_b ) { int carry=0; type zero=0; //this calculates |a|-|b|. 
all of the resulted are or'ed together in zero for (int x=0;x|b| (|a|-|b| is positive) //same sign, positive: use res //same sign, negative: negate res //opposite signs: use res if 0, otherwise 1 if sign_a is positive, -1 if sign_a is negative int res=0; if (zero!=0) res=1; if (carry==1) res=-1; //todo //get rid of branches //this is used to implement exactly one comparison with a binary result, so that should get rid of all of these branches if (sign_a==sign_b) { if (sign_a==negative_sign) { res=-res; } } else { if (res!=0) { res=(sign_a==negative_sign)? -1 : 1; } } return res; } template int compare(fixed_integer b) const { return compare( data+1, size, data[0], b.data+1, size, b.data[0] ); } //a, b, and res can alias with each other but only if the pointers are equal //the sign is not present in a/b/res static void add( const type* a, int size_a, type sign_a, const type* b, int size_b, type sign_b, type* res, int size_res, type& sign_res ) { if (size_b>size_a) { swap(a, b); swap(size_a, size_b); swap(sign_a, sign_b); } assert(size_res>=size_a && size_a>=size_b && size_b>=1); type mask=sign_a ^ sign_b; //all 1s if opposite signs, else all 0s. this isn't affected by swapping type swap_mask=positive_sign; if (size_a==size_b) { //carry flag int size_ab=size_a; bool a_less_than_b=a[size_ab-1]=2) { a_less_than_b=a[size_ab-2] fixed_integer operator+( fixed_integer b ) const { const int output_size=max_constexpr(size, b_size)+1; fixed_integer res; add( data+1, size, data[0], b.data+1, b_size, b.data[0], res.data+1, output_size, res.data[0] ); return res; } template fixed_integer operator-( fixed_integer b ) const { const int output_size=max_constexpr(size, b_size)+1; fixed_integer res; add( data+1, size, data[0], b.data+1, b_size, negative_sign^b.data[0], res.data+1, output_size, res.data[0] ); return res; } //res=a*b+c //res can alias with c if the pointers are equal. 
can't alias with a //if c is null then it is all 0s static void mad( const type* a, int size_a, type b, const type* c, int size_c, type* res, int size_res ) { assert(size_res>=size_c && size_c>=size_a && size_a>=1); type previous_high=0; int carry_mul=0; int add_mul=0; for (int x=0;x=size_a)? 0 : a[x]; type this_low=this_a*b; type this_high=mul_high(this_a, b); type mul_res=add_carry(this_low, previous_high, carry_mul, carry_mul); if (x==0) { assert(mul_res==this_low && carry_mul==0); } else if (x==size_a) { assert(carry_mul==0); } else if (x>size_a) { assert(mul_res==0 && carry_mul==0); } type this_c=(x>=size_c || c==nullptr)? 0 : c[x]; type add_res=add_carry(mul_res, this_c, add_mul, add_mul); res[x]=add_res; previous_high=this_high; } } //can't overflow //two of these can implement a 1024x512 mul. for 1024x1024, need to do 2x 1024x512 in separate buffers then add them static void mad_8x8(array a, array b, array c, array& res) { for (int x=0;x<8;++x) { res[x]=c[x]; } for (int x=8;x<16;++x) { res[x]=0; } for (int x=0;x<8;++x) { //this uses a sliding window for the 8 res registers (no spilling) //-the lowest register is finished after the first addition in mad. the this_low,previous_high addition is skipped //-the highest register does not need to be loaded until the last multiplication in mad. actually this would always load 0 // so it is not done //-the total number of registers is therefore 7 //there is one register for b //the 8 a values are in registers but some or all may be spilled //need 2 registers to store the MULX result //need 1 register to store the previous high result (this is initially 0) //the this_low,previous_high add result goes into one of those registers //the mul_res,this_c result goes into the c register //total registers is 18 then; 2 are spilled //address registers: //-will just use a static 32-bit address space for most of the code. 
can store the stack pointer there then //-address registers are only used for b and res if the addresses are not static //-the addresses are only used at the end of the loop, so there are spare registers to load the address registers from static // memory. probably the addresses will be static though mad(&a[0], 8, b[x], &res[x], 8, &res[x], 8); } } void operator*=(type v) { mad( data+1, size, v, nullptr, size, data+1, size ); } template static fixed_integer subset( fixed_integer this_v, int start ) { const int end=start+t_size; fixed_integer res; res.data[0]=this_v.data[0]; for (int x=start;x=0 && pos=0;--x) { int pos=x-amount; (*this)[x] = (pos>=0 && pos=0 && pos>64" statement. might wrap around return; } const int bits_per_limb=sizeof(type)*8; assert(amount>0 && amount=0;--x) { type previous=(x==0)? 0 : (*this)[x-1]; (*this)[x] = ((*this)[x]<>(bits_per_limb-amount)); } } void operator>>=(int amount) { if (amount==0) { return; } const int bits_per_limb=sizeof(type)*8; assert(amount>0 && amount>amount) | (next<<(bits_per_limb-amount)); } } template fixed_integer operator*( fixed_integer b ) const { const int output_size=size+b_size; fixed_integer res; for (int x=0;x(*this, 0); r.data[0]=positive_sign; integer b_x_int(vector{b[x]}); r*=b[x]; //auto r2=subset(r, 0); //r2*=b[x]; //r=r2; integer r_int(r); integer this_int(abs(*this)); integer expected_r_int=this_int*b_x_int; assert(r_int==expected_r_int); r.left_shift_limbs(x); r_int<<=x*sizeof(type)*8; assert(r_int==integer(r)); integer res_old_int(res); //todo //figure out why this doesn't work. might have something to do with the msb being set? 
res+=r; //unsigned /*auto res3=res; res3+=r; auto res2=res+r; fixed_integer res4(res2);*/ /*if (integer(res3)!=integer(res4)) { print( "========" ); res3=res; res3+=r; //print( "========" ); auto res2_copy=res+r; assert(false); }*/ //res=res4; integer res_new_int(res); assert(res_new_int==res_old_int+r_int); } res.data[0]=data[0] ^ b.data[0]; return res; } fixed_integer operator<<(int num) const { auto res=subset(*this, 0); res<<=num; return res; } //this rounds to 0 so it is different from division unless the input is divisible by 2^num fixed_integer operator>>(int num) const { auto res=subset(*this, 0); res>>=num; return res; } bool is_zero() const { for (int x=0;x bool operator>=(fixed_integer b) const { return compare(b)>=0; } template bool operator==(fixed_integer b) const { return compare(b)==0; } template bool operator<(fixed_integer b) const { return compare(b)<0; } template bool operator<=(fixed_integer b) const { return compare(b)<=0; } template bool operator>(fixed_integer b) const { return compare(b)>0; } template bool operator!=(fixed_integer b) const { return compare(b)!=0; } //"0" has 1 bit int num_bits() const { type v=0; int num_full=0; for (int x=size-1;x>=0;--x) { if (v==0) { v=(*this)[x]; num_full=x; } } int v_bits; if (v==0) { v_bits=1; assert(num_full==0); } else if (sizeof(v)==8) { v_bits=64-__builtin_clzll(v); } else{ assert(sizeof(v)==4); v_bits=32-__builtin_clz(v); } return num_full*sizeof(type)*8 + v_bits; } type window(int start_bit) const { int bits_per_limb_log2=(sizeof(type)==8)? 6 : 5; int bits_per_limb=1<>bits_per_limb_log2; int start_offset=start_bit&(bits_per_limb-1); auto get_limb=[&](int pos) -> type { assert(pos>=0); return (pos>=size)? type(0) : (*this)[pos]; }; type start=get_limb(start_limb)>>(start_offset); //the shift is undefined for start_offset==0 type end=get_limb(start_limb+1)<<(bits_per_limb-start_offset); return (start_offset==0)? 
start : (start | end); } }; template fixed_integer abs(fixed_integer v) { v.set_negative(false); return v; } template fixed_integer to_uint64(fixed_integer v) { fixed_integer res; res.set_negative(v.is_negative()); //sign extend data[0]. can just make data[0] 64 bits if i actually have to do this //this just copies the bytes over for (int x=0;x>1]=uint64(high)<<32 | uint64(low); } return res; } template fixed_integer to_uint32(fixed_integer v) { fixed_integer res; res.set_negative(v.is_negative()); //lower 32 bits of data[0] for (int x=0;x>32); } return res; }