//compile-time parameters: build mode selection, gcd table sizes, and test/production switches
//exactly one of these has to be passed in as a macro
//#define VDF_MODE 0 //used for the final submission and correctness testing
//#define VDF_MODE 1 //used for performance or other testing

//exactly one of these also has to be passed in as a macro
//#define ENABLE_ALL_INSTRUCTIONS 1
//#define ENABLE_ALL_INSTRUCTIONS 0

//
//

//divide table
//timings for divide_table_index_bits and gcd_base_max_iter_divide_table are recorded in the
//comment tables further down in this file; 11 and 16 were the fastest values measured there
const int divide_table_index_bits=11;
const int gcd_num_quotient_bits=31; //excludes sign bit
const int data_size=31; //NOTE(review): not described in this file; currently equal to gcd_num_quotient_bits -- confirm whether they must match
const int gcd_base_max_iter_divide_table=16;

//continued fraction table
//NOTE(review): these presumably correspond to the "exponent", "fraction" and "base_iter" columns
//of the benchmark table later in this file -- confirm
const int gcd_table_num_exponent_bits=3;
const int gcd_table_num_fraction_bits=7;
const int gcd_base_max_iter=5;

//gcd settings that depend on the ENABLE_ALL_INSTRUCTIONS build macro.
//the divide table is only used when ENABLE_ALL_INSTRUCTIONS is 1; any other value selects the
//second set (base bits 50, 3 iterations), which matches the fastest row of the benchmark table
//later in this file
#if ENABLE_ALL_INSTRUCTIONS==1
    const bool use_divide_table=true;
    const int gcd_base_bits=63;
    const int gcd_128_max_iter=2;
#else
    const bool use_divide_table=false;
    const int gcd_base_bits=50;
    const int gcd_128_max_iter=3;
#endif

/*
run time for each value of divide_table_index_bits:
10 - 0m1.269s
11 - 0m1.261s
12 - 0m1.262s
13 - 0m1.341s
*/

/*
run time for each value of gcd_base_max_iter_divide_table:
13 - 0m1.290s
14 - 0m1.275s
15 - 0m1.265s
16 - 0m1.261s
17 - 0m1.268s
18 - 0m1.278s
19 - 0m1.283s
*/

/*
100k iterations; median of 3 runs. consistency between runs was very high

effect of scheduler:
taskset 0,1     : 0m1.352s (63% speedup over single threaded, 37% over taskset 0,2)
taskset 0,2     : 0m1.850s
default         : 0m1.348s (fastest)
single threaded : 0m2.212s [this has gone down to 0m1.496s for some reason with the divide table]

exponent fraction base_bits base_iter 128_iter seconds
3        7        50        5         3        0m1.350s [fastest with range checks enabled]
3        7        52        5         3        0m1.318s [range checks disabled; 2.4% faster]

[this block with bmi and fma disabled]
3        7        46        5         3        0m1.426s
3        7        47        5         3        0m1.417s
3        7        48        5         3        0m1.421s
3        7        49        5         3        0m1.413s
3        7        50        5         3        0m1.401s [still fastest; bmi+fma is 3.8% faster]
3        7        51        5         3        0m1.406s
3        7        52        5         3        0m1.460s
3        7        50        6         3        0m1.416s

3        7        49        6         3        0m1.376s

2        8        45        6         3        0m1.590s
2        8        49        6         3        0m1.485s
2        8        51        6         3        0m1.479s
2        8        52        6         3        0m1.501s
2        8        53        6         3        0m1.531s
2        8        54        6         3        0m13.675s
2        8        55        6         3        0m13.648s

3        7        49        2         3        0m14.571s
3        7        49        3         3        0m1.597s
3        7        49        4         3        0m1.430s
3        7        49        5         3        0m1.348s
3        7        49        6         3        0m1.376s
3        7        49        10        3        0m1.485s

3        7        49        1         18       0m2.226s
3        7        49        2         10       0m1.756s
3        7        49        3         6        0m1.557s
3        7        49        4         4        0m1.388s
3        7        49        5         4        0m1.525s
3        7        49        6         3        0m1.377s
3        7        49        7         3        0m1.446s
3        7        49        8         2        0m1.503s

3        6        45        4         3        0m15.176s
3        7        45        4         3        0m1.443s
3        8        45        4         3        0m1.386s
3        9        45        4         3        0m1.355s
3        10       45        4         3        0m1.353s
3        11       45        4         3        0m1.419s
3        12       45        4         3        0m1.451s
3        13       45        4         3        0m1.584s

3        7        40        4         2        0m1.611s
3        8        40        4         2        0m1.570s
3        9        40        4         2        0m1.554s
3        10       40        4         2        0m1.594s
3        11       40        4         2        0m1.622s
3        12       40        4         2        0m1.674s
3        13       40        4         2        0m1.832s

3        7        48        5         3        0m1.358s
3        7        49        5         3        0m1.353s
3        7        50        5         3        0m1.350s

3        8        48        5         3        0m1.366s
3        8        49        5         3        0m1.349s
3        8        50        5         3        0m1.334s

3        9        48        5         3        0m1.370s
3        9        49        5         3        0m1.349s
3        9        50        5         3        0m1.346s

3        10       48        5         3        0m1.404s
3        10       49        5         3        0m1.382s
3        10       50        5         3        0m1.379s
*/

//NOTE(review): presumably the upper bound on busy-wait iterations before a spinning thread gives up -- confirm against the code that reads it
const uint64 max_spin_counter=10000000;

//this value makes square_original not be called in 100k iterations. with every iteration reduced, minimum value is 1
const int num_extra_bits_ab=3;

const bool calculate_k_repeated_mod=false;
//an interval is a count, not a flag: declared int so that a value above 1 is not collapsed to true
const int calculate_k_repeated_mod_interval=1;

//power of 2. will check the discriminant in the slave thread at this interval. -1 to disable. no effect on performance
const int validate_interval=1;

//at each checkpoint, the slave thread is restarted and the master thread calculates c
//cycles per fast iteration measured for different values (with track cycles enabled):
//  checkpoint_interval=100000: 39388
//  checkpoint_interval=10000:  39249
//  checkpoint_interval=1000:   38939
//  checkpoint_interval=100:    39988
//i.e. no effect on performance
const int checkpoint_interval=10000;

//build-mode flags selected by the VDF_MODE macro. both modes define the same set of constants;
//VDF_MODE==1 additionally defines VDF_TEST.
//note: the preprocessor evaluates an undefined VDF_MODE as 0, so the production settings are
//used when the macro is not passed in.

// ==== test ====
#if VDF_MODE==1
    #define VDF_TEST
    const bool is_vdf_test=true;

    const bool enable_random_error_injection=false;
    const double random_error_injection_rate=0; //0 to 1

    //#define GENERATE_ASM_TRACKING_DATA
    //#define ENABLE_TRACK_CYCLES
    const bool vdf_test_correctness=false;
    const bool enable_threads=true;
#endif

// ==== production ====
#if VDF_MODE==0
    const bool is_vdf_test=false;

    const bool enable_random_error_injection=false;
    const double random_error_injection_rate=0; //0 to 1

    const bool vdf_test_correctness=false;
    const bool enable_threads=true;

    //#define ENABLE_TRACK_CYCLES
#endif

//any other value would leave the flags above undefined and fail later with an unrelated-looking
//error where is_vdf_test is used; report the real cause here instead
#if VDF_MODE!=0 && VDF_MODE!=1
    #error "VDF_MODE must be 0 or 1"
#endif

//
|
|
//
|
|
|
|
//this doesn't do anything outside of test code
//this doesn't work with the divide table currently
#define TEST_ASM

//multiple of 4. must be at least half the discriminant size in bits divided by 64
//(with max_bits_base=1024 below that lower bound is 1024/64 = 16)
const int gcd_size=20;

const int gcd_max_iterations=gcd_size*2; //typically 1 iteration per limb

const int max_bits_base=1024; //half the discriminant number of bits, rounded up
const int reduce_max_iterations=10000; //NOTE(review): presumably an upper bound on iterations of the reduce loop -- confirm

//NOTE(review): presumably the number of slots used when GENERATE_ASM_TRACKING_DATA is defined -- confirm
const int num_asm_tracking_data=128;
//initialised from the ENABLE_ALL_INSTRUCTIONS build macro. unlike the other settings in this
//file it is not const
//NOTE(review): presumably so it can be changed at run time -- confirm
bool enable_all_instructions=ENABLE_ALL_INSTRUCTIONS;

//if the asm code doesn't use fma, the c code shouldn't either to be the same as the asm code
const bool enable_fma_in_c_code=ENABLE_ALL_INSTRUCTIONS;

//cycle tracking (see the commented-out ENABLE_TRACK_CYCLES switch)
const int track_cycles_num_buckets=24; //each bucket is from 2^i to 2^(i+1) cycles
const int track_cycles_max_num=128; //NOTE(review): presumably the maximum number of tracked entries -- confirm

void mark_vdf_test() {
|
|
static bool did_warning=false;
|
|
if (!is_vdf_test && !did_warning) {
|
|
print( "test code enabled in production build" );
|
|
did_warning=true;
|
|
}
|
|
} |