From f95ffeab0c8cb5b0407c96d877578534c63e4efc Mon Sep 17 00:00:00 2001 From: bsdevlin Date: Sat, 23 Mar 2019 18:25:42 -0400 Subject: [PATCH] Updates to secp256k1 core --- ip_cores/util/src/rtl/karatsuba_ofman_mult.sv | 58 +++-- ip_cores/util/src/rtl/packet_arb.sv | 2 +- .../util/src/tb/karatsuba_ofman_mult_tb.sv | 43 ++++ zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv | 47 +++++ .../src/rtl/secp256k1/secp256k1_point_add.sv | 198 ++++++++++++------ .../src/rtl/secp256k1/secp256k1_point_mult.sv | 10 +- zcash_fpga/src/tb/secp256k1_point_add_tb.sv | 187 +++++++++++++++++ zcash_fpga/src/tb/secp256k1_point_mult_tb.sv | 16 +- 8 files changed, 466 insertions(+), 95 deletions(-) create mode 100644 zcash_fpga/src/tb/secp256k1_point_add_tb.sv diff --git a/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv b/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv index b54599b..d4f3640 100644 --- a/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv +++ b/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv @@ -1,7 +1,7 @@ /* Multiplication using Karatsuba-Ofman algorithm. - Multiple of these can be instantiated, each one takes 2 clocks cycles + Multiple of these can be instantiated, each one takes 3 clock cycles per level. Fully pipelined so can accept a new input every clock. 
Copyright (C) 2019 Benjamin Devlin and Zcash Foundation @@ -42,17 +42,31 @@ localparam HBITS = BITS/2; logic [BITS-1:0] m0, m1, m2, dat_a, dat_b; logic [BITS*2-1:0] q; logic [HBITS-1:0] a0, a1; -logic sign, sign_; -logic val; -logic [CTL_BITS-1:0] ctl; +logic sign, sign_, sign_1; +logic val, val_, val_1; +logic [CTL_BITS-1:0] ctl, ctl_, ctl_1; +logic [HBITS-1:0] a0_, a1_; +logic [BITS-1:0] m0_, m1_, m2_; always_ff @ (posedge i_clk) begin dat_a <= i_dat_a; dat_b <= i_dat_b; o_dat <= q; - o_val <= val; - o_ctl <= ctl; + o_val <= val_1; + o_ctl <= ctl_1; + + val_ <= val; + val_1 <= val_; + ctl_ <= ctl; + ctl_1 <= ctl_; + + a0_ <= a0; + a1_ <= a1; + + m0_ <= m0; + m1_ <= m1; + m2_ <= m2; end generate @@ -61,27 +75,29 @@ generate a1 = i_dat_b[HBITS +: HBITS] > i_dat_b[0 +: HBITS] ? i_dat_b[HBITS +: HBITS] - i_dat_b[0 +: HBITS] : i_dat_b[0 +: HBITS] - i_dat_b[HBITS +: HBITS]; sign_ = ((i_dat_a[0 +: HBITS] < i_dat_a[HBITS +: HBITS]) ^ (i_dat_b[HBITS +: HBITS] < i_dat_b[0 +: HBITS])); - q = (m0 << BITS) + ((m0 + m2 + (sign == 1 ? -m1 : m1)) << HBITS) + m2; + q = (m0_ << BITS) + ((m0_ + m2_ + (sign == 1 ? 
-m1_ : m1_)) << HBITS) + m2_; end if (LEVEL == 1) begin: GEN_REC + always_comb begin - m0 = i_dat_a[HBITS +: HBITS] * i_dat_b[HBITS +: HBITS]; - m2 = i_dat_a[0 +: HBITS] * i_dat_b[0 +: HBITS]; - m1 = (a0 * a1); - sign = sign_; + m0 = dat_a[HBITS +: HBITS] * dat_b[HBITS +: HBITS]; + m2 = dat_a[0 +: HBITS] * dat_b[0 +: HBITS]; + m1 = (a0_ * a1_); o_rdy = i_rdy; val = i_val; ctl = i_ctl; end + always_ff @ (posedge i_clk) begin + sign <= sign_1; + sign_1 <= sign_; + end - end else begin // pipeline the other non-mult values x clock cycles and add them after multipliers - logic [LEVEL-2:0] sign_r; - + logic [LEVEL*3-1:0] sign_r; always_comb begin - sign = sign_r[LEVEL-2]; + sign = sign_r[LEVEL*3-2]; end always_ff @ (posedge i_clk) begin @@ -95,8 +111,8 @@ generate ) karatsuba_ofman_mult_m0 ( .i_clk ( i_clk ), - .i_dat_a ( i_dat_a[HBITS +: HBITS] ), - .i_dat_b ( i_dat_b[HBITS +: HBITS] ), + .i_dat_a ( dat_a[HBITS +: HBITS] ), + .i_dat_b ( dat_b[HBITS +: HBITS] ), .i_val ( i_val ), .o_val ( val ), .i_ctl ( i_ctl ), @@ -113,8 +129,8 @@ generate ) karatsuba_ofman_mult_m2 ( .i_clk ( i_clk ), - .i_dat_a ( i_dat_a[0 +: HBITS] ), - .i_dat_b ( i_dat_b[0 +: HBITS] ), + .i_dat_a ( dat_a[0 +: HBITS] ), + .i_dat_b ( dat_b[0 +: HBITS] ), .i_val ( i_val ), .o_val (), .i_ctl ( 1'd0 ), @@ -131,8 +147,8 @@ generate ) karatsuba_ofman_mult_m1 ( .i_clk ( i_clk ), - .i_dat_a ( a0 ), - .i_dat_b ( a1 ), + .i_dat_a ( a0_ ), + .i_dat_b ( a1_ ), .i_val ( i_val ), .o_val (), .i_ctl ( 1'd0 ), diff --git a/ip_cores/util/src/rtl/packet_arb.sv b/ip_cores/util/src/rtl/packet_arb.sv index 82fab0c..072f49c 100644 --- a/ip_cores/util/src/rtl/packet_arb.sv +++ b/ip_cores/util/src/rtl/packet_arb.sv @@ -49,8 +49,8 @@ generate // Optionally pipeline the input if (PIPELINE == 0) begin: PIPELINE_GEN + always_comb i_axi[g].rdy = rdy[g]; always_comb begin - i_axi[g].rdy = rdy[g]; val[g] = i_axi[g].val; eop[g] = i_axi[g].eop; sop[g] = i_axi[g].sop; diff --git a/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv 
b/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv index e80e065..0d0c04f 100644 --- a/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv +++ b/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv @@ -69,6 +69,48 @@ karatsuba_ofman_mult ( .o_dat ( out_if.dat ) ); +task test_pipeline(); +begin + + $display("Running test_pipeline..."); + fork + begin + logic [255:0] in_a, in_b; + integer i = 1; + integer max = 10; + while (i < max) begin + in_a = i; + in_b = i; + //in_if.put_stream({in_b, in_a}, 512/8, i); + in_if.sop = 1; + in_if.eop = 1; + in_if.ctl = i; + in_if.dat = {in_a, in_b}; + in_if.val = 1; + @(posedge in_if.i_clk); + i = i + 1; + end + in_if.val = 0; + end + begin + integer i = 1; + integer max = 10; + integer signed get_len; + logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected, get_dat; + while (i < max) begin + expected = i*i; + out_if.get_stream(get_dat, get_len); + common_pkg::compare_and_print(get_dat, expected); + $display("test_pipeline PASSED loop %d/%d", i, max); + i = i + 1; + end + end + join + + $display("test_pipeline PASSED"); +end +endtask; + task test_loop(); begin integer signed get_len; @@ -104,6 +146,7 @@ initial begin in_if.val = 0; #(40*CLK_PERIOD); + test_pipeline(); test_loop(); #1us $finish(); diff --git a/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv b/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv index dae199c..4f87e34 100644 --- a/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv +++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv @@ -65,6 +65,8 @@ package secp256k1_pkg; function jb_point_t dbl_jb_point(jb_point_t p); logic signed [512:0] I_X, I_Y, I_Z, A, B, C, D, X, Y, Z; + if (p.z == 0) return p; + I_X = p.x; I_Y = p.y; I_Z = p.z; @@ -83,6 +85,51 @@ package secp256k1_pkg; return dbl_jb_point; endfunction + function jb_point_t add_jb_point(jb_point_t p1, p2); + logic signed [512:0] A, U1, U2, S1, S2, H, H3, R; + + if (p1.z == 0) return p2; + if (p2.z == 0) return p1; + + if (p1.y == p2.y && p1.x == p2.x) + return (dbl_jb_point(p1)); + + U1 = 
p1.x*p2.z % p_eq; + U1 = U1*p2.z % p_eq; + + U2 = p2.x*p1.z % p_eq; + U2 = U2 *p1.z % p_eq; + S1 = p1.y *p2.z % p_eq; + S1 = (S1*p2.z % p_eq) *p2.z % p_eq; + S2 = p2.y * p1.z % p_eq; + S2 = (S2*p1.z % p_eq) *p1.z % p_eq; + + H = U2 + (U1 > U2 ? p_eq : 0) -U1; + R = S2 + (S1 > S2 ? p_eq : 0) -S1; + //$display("R = %x", R); + //$display("H = %x", H); + //$display("H^2 = %x", (H * H %p_eq )); + H3 = ((H * H %p_eq ) * H ) % p_eq; + A = (((2*U1 % p_eq) *H % p_eq) * H % p_eq); + + add_jb_point.z = ((H * p1.z % p_eq) * p2.z) % p_eq; + add_jb_point.x = R*R % p_eq; + + //$display("R^2 = %x", add_jb_point.x); + //$display("H^3 = %x", H3); + + add_jb_point.x = add_jb_point.x + (H3 > add_jb_point.x ? p_eq : 0) - H3; + add_jb_point.x = add_jb_point.x + (A > add_jb_point.x ? p_eq : 0) - A; + + A = (U1*H % p_eq) * H % p_eq; + A = A + (add_jb_point.x > A ? p_eq : 0) - add_jb_point.x; + A = A*R % p_eq; + add_jb_point.y = S1*H3 % p_eq; + + add_jb_point.y = A + (add_jb_point.y > A ? p_eq : 0) - add_jb_point.y; + + endfunction + function on_curve(jb_point_t p); return (p.y*p.y - p.x*p.x*p.x - secp256k1_pkg::a*p.x*p.z*p.z*p.z*p.z - secp256k1_pkg::b*p.z*p.z*p.z*p.z*p.z*p.z); endfunction diff --git a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_add.sv b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_add.sv index b08c2fd..eb4588d 100644 --- a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_add.sv +++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_add.sv @@ -41,32 +41,54 @@ module secp256k1_point_add ); /* - * These are the equations that need to be computed, they are issued as variables - * become valid. We have a bitmask to track what equation results are valid which - * will trigger other equations. [] show what equations must be valid before this starts. - * We reuse input points (as they are latched) when possible to reduce register usage. - * - * 0. A = i_p1.y - i_p2.y mod p - * 1. B = i_p1.x - i_p2.x mod p - * 2. o_p.z = B * i_p1.z mod p [eq1] - * 3. i_p1.z = B * B mod p [eq2] - * 4. 
i_p2.x = A * A mod p [eq0, eq5] - * 5. o_p.x = i_p1.x + i_p2.x mod p - * 6. o_p.x = o_p.x * i_p1.z mod p [eq5, eq3] - * 7. o_p.x = i_p2.x - o_p.x mod p[eq6, eq4] - * 8. o_p.y = i_p1.x*i_p1.z mod p [eq3] - * 9. o_p.y = o_p.y - o_p.x mod p [eq3, eq7, eq8] - * 10. o_p.y = o_p.y * A mod p [eq0, eq9] - * 11. i_p2.y = B * i_p1.z mod p [eq1, eq3, eq0] - * 12. i_p2.y = i_p2.y * i_p1.y [eq11] - * 13. o_p.y = o_p.y - i_p2.y mod p [eq12, eq10] - */ + These are the equations that need to be computed, they are issued as variables + become valid. We have a bitmask to track what equation results are valid which + will trigger other equations. [] show what equations must be valid before this starts. + We reuse input points (as they are latched) when possible to reduce register usage. + Taken from https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates + + U1 = X1*Z2^2 + U2 = X2*Z1^2 + S1 = Y1*Z2^3 + S2 = Y2*Z1^3 + H = U2 - U1 + R = S2 - S1 + X3 = R^2 - H^3 - 2*U1*H^2 + Y3 = R*(U1*H^2 - X3) - S1*H^3 + Z3 = H*Z1*Z2 + + + 0. A = i_p2.z*i_p2.z mod p + 1. i_p1.x = A * i_p1.x mod p [eq0] ..U1 + 2. C = i_p1.z*i_p1.z mod p + 3. i_p2.x = C * i_p2.x mod p [eq2] ... U2 + 4. A = A * i_p2.z mod p [eq1] + 5. A = A * i_p1.y [eq4] ... S1 + 6. C = C * i_p1.z mod p [eq3] + 7. C = C * i_p2.y mod p [eq6] .. S2 + 8. i_p1.y = i_p2.x - i_p1.x mod p [eq3, eq1, eq5] .. H + 9. i_p2.y = C - A mod p [eq5,eq7] ... R + 10. o_p.x = i_p2.y * i_p2.y mod p [eq9] ... R^2 + 11. C = i_p1.y * i_p1.y mod p [eq9] .. H^2 + 12. i_p2.x = C * i_p1.y mod p [eq8, eq11] ..H^3 + 13. o_p.x = o_p.x - i_p2.x mod p [eq12, eq10] + 14. i_p1.x = i_p1.x*C [eq11, eq8] ..U1*H^2 + 15. o_p.y = i_p1.x [eq14] + 16. i_p1.x = 2* i_p1.x mod p [eq15, eq14] + 17. o_p.x = o_p.x - i_p1.x [eq16, eq13] + 18. o_p.y = o_p.y - o_p.x mod p [eq17, eq15] + 19. o_p.y = o_p.y * i_p2.y mod p [eq18, eq9] + 20. i_p2.x = i_p2.x * A [eq5, eq12] + 21. o_p.y = o_p.y - i_p2.x [eq20, eq19] + 22. o_p.z = i_p1.z * i_p2.z mod p + 23. 
o_p.z = o_p.z * i_p1.y mod p [eq22, eq8] + */ // We also check in the inital state if one of the inputs is "None" (.z == 0), and set the output to the other point -logic [13:0] eq_val, eq_wait; +logic [23:0] eq_val, eq_wait; // Temporary variables -logic [255:0] A, B; +logic [255:0] A, C; jb_point_t i_p1_l, i_p2_l; always_comb begin @@ -99,7 +121,7 @@ always_ff @ (posedge i_clk) begin i_p2_l <= 0; o_err <= 0; A <= 0; - B <= 0; + C <= 0; end else begin if (o_mult_if.rdy) o_mult_if.val <= 0; @@ -115,7 +137,7 @@ always_ff @ (posedge i_clk) begin i_p1_l <= i_p1; i_p2_l <= i_p2; A <= 0; - B <= 0; + C <= 0; if (i_val && o_rdy) begin state <= START; o_rdy <= 0; @@ -145,11 +167,11 @@ always_ff @ (posedge i_clk) begin i_mod_if.rdy <= 1; i_mult_if.rdy <= 1; - // Check any results from multiplier + // Check any results from modulo if (i_mod_if.val && i_mod_if.rdy) begin eq_val[i_mod_if.ctl] <= 1; case(i_mod_if.ctl) - 5: o_p.x <= i_mod_if.dat; + 16: i_p1_l.x <= i_mod_if.dat; default: o_err <= 1; endcase end @@ -158,67 +180,107 @@ always_ff @ (posedge i_clk) begin if (i_mult_if.val && i_mult_if.rdy) begin eq_val[i_mult_if.ctl] <= 1; case(i_mult_if.ctl) inside - 2: o_p.z <= i_mult_if.dat; - 3: i_p1_l.z <= i_mult_if.dat; - 4: i_p2_l.x <= i_mult_if.dat; - 6: o_p.x <= i_mult_if.dat; - 8: o_p.y <= i_mult_if.dat; - 10: o_p.y <= i_mult_if.dat; - 11: i_p1_l.y <= i_mult_if.dat; - 12: i_p2_l.y <= i_mult_if.dat; + 0: A <= i_mult_if.dat; + 1: i_p1_l.x <= i_mult_if.dat; + 2: C <= i_mult_if.dat; + 3: i_p2_l.x <= i_mult_if.dat; + 4: A <= i_mult_if.dat; + 5: A <= i_mult_if.dat; + 6: C <= i_mult_if.dat; + 7: C <= i_mult_if.dat; + 10: o_p.x <= i_mult_if.dat; + 11: C <= i_mult_if.dat; + 12: i_p2_l.x <= i_mult_if.dat; + 14: i_p1_l.x <= i_mult_if.dat; + 19: o_p.y <= i_mult_if.dat; + 20: i_p2_l.x <= i_mult_if.dat; + 22: o_p.z <= i_mult_if.dat; + 23: o_p.z <= i_mult_if.dat; default: o_err <= 1; endcase end // Issue new multiplies - if (eq_val[1] && ~eq_wait[2]) begin // 2. 
o_p.z = B * i_p1.z mod p [eq1] - multiply(2, B, i_p1_l.z); + if (~eq_wait[0]) begin // 0. A = i_p2.z*i_p2.z mod p + multiply(0, i_p2_l.z, i_p2_l.z); end else - if (eq_val[2] && ~eq_wait[3]) begin // 3. i_p1.z = B * B mod p [eq2] - multiply(3, B, B); + if (eq_val[0] && ~eq_wait[1]) begin // 1. i_p1.x = A * i_p1.x mod p [eq0] ..U1 + multiply(1, A, i_p1_l.x); end else - if (eq_val[0] && eq_val[5] && ~eq_wait[4]) begin // 4. i_p2.x = A * A mod p [eq0, eq5] - multiply(4, A, A); + if (~eq_wait[2]) begin // 2. C = i_p1.z*i_p1.z mod p + multiply(2, i_p1_l.z, i_p1_l.z); end else - if (eq_val[3] && eq_val[5] && ~eq_wait[6]) begin // 6. o_p.x = o_p.x * i_p1.z mod p [eq5, eq3] - multiply(6, o_p.x, i_p1_l.z); + if (eq_val[2] && ~eq_wait[3]) begin // 3. i_p2.x = C * i_p2.x mod p [eq2] ... U2 + multiply(3, C, i_p2_l.x); end else - if (eq_val[3] && ~eq_wait[8]) begin // 8. o_p.y = i_p1.x*i_p1.z mod p [eq3] - multiply(8, i_p1_l.x, i_p1_l.z); + if (eq_val[1] && ~eq_wait[4]) begin // 4. A = A * i_p2.z mod p [eq1] + multiply(4, A, i_p2_l.z); end else - if (eq_val[0] && eq_val[9] && ~eq_wait[10]) begin // 10. o_p.y = o_p.y * A mod p [eq0, eq9] - multiply(10, o_p.y, A); + if (eq_val[4] && ~eq_wait[5]) begin // 5. A = A * i_p1.y [eq4] ... S1 + multiply(5, A, i_p1_l.y); end else - if (eq_val[0] && eq_val[1] && eq_val[3] && ~eq_wait[11]) begin // 11. i_p2.y = B * i_p1.z mod p [eq1, eq3, eq0] - multiply(11, B, i_p1_l.z); + if (eq_val[3] && ~eq_wait[6]) begin // 6. C = C * i_p1.z mod p [eq3] + multiply(6, C, i_p1_l.z); end else - if (eq_val[11] && ~eq_wait[12]) begin // 12. i_p2.y = i_p2.y * i_p1.y [eq11] - multiply(12, i_p1_l.y, i_p2_l.y); + if (eq_val[6] && ~eq_wait[7]) begin // 7. C = C * i_p2.y mod p [eq6] .. S2 + multiply(7, C, i_p2_l.y); + end else + if (eq_val[9] && ~eq_wait[10]) begin // 10. o_p.x = i_p2.y * i_p2.y mod p [eq9] + multiply(10, i_p2_l.y, i_p2_l.y); + end else + if (eq_val[9] && ~eq_wait[11]) begin // 11. C = i_p1.y * i_p1.y mod p [eq9] .. 
H^2 + multiply(11, i_p1_l.y, i_p1_l.y); + end else + if (eq_val[11] && eq_val[8] && ~eq_wait[12]) begin // 12. i_p2.x = C * i_p1.y mod p [eq8, eq11] ..H^3 + multiply(12, C, i_p1_l.y); + end else + if (eq_val[11] && eq_val[8] && ~eq_wait[14]) begin // 14. i_p1.x = i_p1.x*C [eq11, eq8] ..U1*H^2 + multiply(14, C, i_p1_l.x); + end else + if (eq_val[18] && eq_val[9] && ~eq_wait[19]) begin // 19. o_p.y = o_p.y * i_p2.y mod p [eq18, eq9] + multiply(19, o_p.y, i_p2_l.y); + end else + if (eq_val[5] && eq_val[12] && ~eq_wait[20]) begin // 20. i_p2.x = i_p2.x * A [eq5, eq12] + multiply(20, i_p2_l.x, A); + end else + if (~eq_wait[22]) begin // 22. o_p.z = i_p1.z * i_p2.z mod p + multiply(22, i_p1_l.z, i_p2_l.z); + end else + if (eq_val[8] && eq_val[22] && ~eq_wait[23]) begin // 23. o_p.z = o_p.z * i_p1.y mod p [eq22, eq8] + multiply(23, o_p.z, i_p1_l.y); end - + // Issue new modulo reductions - if (~eq_wait[5]) begin // 5. o_p.x = i_p1.x + i_p2.x mod p - modulo(5, i_p1.x + i_p2.x); + if (eq_val[15] && eq_val[14] && ~eq_wait[16]) begin // 16. i_p1.x = 2* i_p1.x mod p [eq15, eq14] + modulo(16, 2 * i_p1_l.x); end // Subtractions we do in-module - if (~eq_wait[0]) begin //0. A = i_p1.y - i_p2.y mod p - A <= subtract(0, i_p1_l.y, i_p2_l.y); + if (eq_val[1] && eq_val[3] && eq_val[5] && ~eq_wait[8]) begin //8. i_p1.y = i_p2.x - i_p1.x mod p [eq3, eq1, eq5] .. H + i_p1_l.y <= subtract(8, i_p2_l.x, i_p1_l.x); end - if (~eq_wait[1]) begin //1. B = i_p1.x - i_p2.x mod p - B <= subtract(1, i_p1_l.x, i_p2_l.x); - end - if (~eq_wait[7] && eq_val[6] && eq_val[4]) begin //7. o_p.x = i_p2.x - o_p.x mod p[eq6, eq4] - o_p.x <= subtract(7, i_p2_l.x, o_p.x); - end - if (~eq_wait[9] && eq_val[3] && eq_val[7] && eq_val[8]) begin //9. o_p.y = o_p.y - o_p.x mod p [eq3, eq7, eq8] - o_p.y <= subtract(9, o_p.y, o_p.x); - end - if (~eq_wait[13] && eq_val[12] && eq_val[10]) begin //13. 
o_p.y = o_p.y - i_p2.y mod p [eq12, eq10] - o_p.y <= subtract(13, o_p.y, i_p2_l.y); + if (eq_val[5] && eq_val[7] && ~eq_wait[9]) begin //9. i_p2.y = C - A mod p [eq5,eq7] ... R + i_p2_l.y <= subtract(9, C, A); + end + if (eq_val[12] && eq_val[10] && ~eq_wait[13]) begin //13. o_p.x = o_p.x - i_p2.x mod p [eq12, eq10] + o_p.x <= subtract(13, o_p.x, i_p2_l.x); + end + if (eq_val[16] && eq_val[13] && ~eq_wait[17]) begin //17. o_p.x = o_p.x - i_p1.x [eq16, eq13] + o_p.x <= subtract(17, o_p.x, i_p1_l.x); end - - + if (eq_val[17] && eq_val[15] && ~eq_wait[18]) begin //18. o_p.y = o_p.y - o_p.x mod p [eq17, eq15] + o_p.y <= subtract(18, o_p.y, o_p.x); + end + if (eq_val[20] && eq_val[19] && ~eq_wait[21]) begin //21. o_p.y = o_p.y - i_p2.x [eq20, eq19] + o_p.y <= subtract(21, o_p.y, i_p2_l.x); + end + + // Assignments + if (eq_val[14] && ~eq_wait[15]) begin //15. o_p.y = i_p1.x [eq14] + eq_wait[15] <= 1; + eq_val[15] <= 1; + o_p.y <= i_p1_l.x; + end if (&eq_val) begin state <= FINISHED; diff --git a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv index 0a74059..a6bd779 100644 --- a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv +++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv @@ -116,7 +116,11 @@ always_ff @ (posedge i_clk) begin p_add_done <= 1; end - p_dbl_in_val <= 1; + // Don't need to double on the final bit + if ((k_l >> 1) != 0) + p_dbl_in_val <= 1; + else + p_dbl_done <= 1; if (k_l == 0) begin state <= FINISHED; @@ -189,7 +193,7 @@ packet_arb # ( .DAT_BYTS ( 512/8 ), .CTL_BITS ( 8 ), .NUM_IN ( 2 ), - .PIPELINE ( 1 ) + .PIPELINE ( 0 ) ) packet_arb_mult ( .i_clk ( i_clk ), @@ -202,7 +206,7 @@ packet_arb # ( .DAT_BYTS ( 512/8 ), .CTL_BITS ( 8 ), .NUM_IN ( 2 ), - .PIPELINE ( 1 ) + .PIPELINE ( 0 ) ) packet_arb_mod ( .i_clk ( i_clk ), diff --git a/zcash_fpga/src/tb/secp256k1_point_add_tb.sv b/zcash_fpga/src/tb/secp256k1_point_add_tb.sv new file mode 100644 index 0000000..28f013a --- /dev/null +++ 
b/zcash_fpga/src/tb/secp256k1_point_add_tb.sv @@ -0,0 +1,187 @@ +/* + Copyright (C) 2019 Benjamin Devlin and Zcash Foundation + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see https://www.gnu.org/licenses/. +*/ +`timescale 1ps/1ps + +module secp256k1_point_add_tb (); +import common_pkg::*; +import secp256k1_pkg::*; + +localparam CLK_PERIOD = 1000; + +logic clk, rst; + +if_axi_stream #(.DAT_BYTS(256*6/8)) in_if(clk); // Two points +if_axi_stream #(.DAT_BYTS(256*3/8)) out_if(clk); + +if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mult_in_if(clk); +if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mult_out_if(clk); + +if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mod_in_if(clk); +if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mod_out_if(clk); + + +jb_point_t in_p1, in_p2, out_p; + +always_comb begin + in_p1 = in_if.dat[0 +: 256*3]; + in_p2 = in_if.dat[256*3 +: 256*3]; + out_if.dat = out_p; +end + +initial begin + rst = 0; + repeat(2) #(20*CLK_PERIOD) rst = ~rst; +end + +initial begin + clk = 0; + forever #CLK_PERIOD clk = ~clk; +end + +always_comb begin + out_if.sop = 1; + out_if.eop = 1; + out_if.ctl = 0; + out_if.mod = 0; +end + +// Check for errors +always_ff @ (posedge clk) + if (out_if.val && out_if.err) + $error(1, "%m %t ERROR: output .err asserted", $time); + +secp256k1_point_add secp256k1_point_add( + .i_clk ( clk ), + .i_rst ( rst ), + // Input points + .i_p1 ( in_p1 ), + .i_p2 ( in_p2 ), + .i_val ( in_if.val ), + .o_rdy ( in_if.rdy ), + 
.o_p ( out_p ), + .o_err ( out_if.err ), + .i_rdy ( out_if.rdy ), + .o_val ( out_if.val ) , + .o_mult_if ( mult_in_if ), + .i_mult_if ( mult_out_if ), + .o_mod_if ( mod_in_if ), + .i_mod_if ( mod_out_if ) +); + +// Attach a mod reduction unit and multiply - mod unit +// In full design these could use dedicated multipliers or be arbitrated +secp256k1_mult_mod #( + .CTL_BITS ( 8 ) +) +secp256k1_mult_mod ( + .i_clk ( clk ), + .i_rst ( rst ), + .i_dat_a ( mult_in_if.dat[0 +: 256] ), + .i_dat_b ( mult_in_if.dat[256 +: 256] ), + .i_val ( mult_in_if.val ), + .i_err ( mult_in_if.err ), + .i_ctl ( mult_in_if.ctl ), + .o_rdy ( mult_in_if.rdy ), + .o_dat ( mult_out_if.dat ), + .i_rdy ( mult_out_if.rdy ), + .o_val ( mult_out_if.val ), + .o_ctl ( mult_out_if.ctl ), + .o_err ( mult_out_if.err ) +); + +secp256k1_mod #( + .USE_MULT ( 0 ), + .CTL_BITS ( 8 ) +) +secp256k1_mod ( + .i_clk( clk ), + .i_rst( rst ), + .i_dat( mod_in_if.dat ), + .i_val( mod_in_if.val ), + .i_err( mod_in_if.err ), + .i_ctl( mod_in_if.ctl ), + .o_rdy( mod_in_if.rdy ), + .o_dat( mod_out_if.dat ), + .o_ctl( mod_out_if.ctl ), + .o_err( mod_out_if.err ), + .i_rdy( mod_out_if.rdy ), + .o_val( mod_out_if.val ) +); + +task test_0(); +begin + integer signed get_len; + logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected, get_dat; + logic [255:0] in_a, in_b; + jb_point_t p1, p2, p_exp, p_temp, p_out; + $display("Running test_0..."); + + //p1 = {x:3, y:4, z:1}; + // p2 = {x:1, y:2, z:1}; + + /*p1 = {x:256'h79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798, + y:256'h483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b8, + z:256'h0000000000000000000000000000000000000000000000000000000000000001}; + p2 = {x:256'h7d152c041ea8e1dc2191843d1fa9db55b68f88fef695e2c791d40444b365afc2, + y:256'h56915849f52cc8f76f5fd7e4bf60db4a43bf633e1b1383f85fe89164bfadcbdb, + z:256'h9075b4ee4d4788cabb49f7f81c221151fa2f68914d0aa833388fa11ff621a970}; + */ + p1 = 
{x:256'h79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798, + y:256'h483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b8, + z:256'h1}; + p2 = {x:256'h7d152c041ea8e1dc2191843d1fa9db55b68f88fef695e2c791d40444b365afc2, + y:256'h56915849f52cc8f76f5fd7e4bf60db4a43bf633e1b1383f85fe89164bfadcbdb, + z:256'h9075b4ee4d4788cabb49f7f81c221151fa2f68914d0aa833388fa11ff621a970}; + + + p_exp = add_jb_point(p1, p2); + + fork + in_if.put_stream({p2, p1}, 256*6/8); + out_if.get_stream(get_dat, get_len); + join + + p_out = get_dat; + + $display("%d %d %d", on_curve(p1), on_curve(p2), on_curve(p_out));//, on_curve(p_temp)); + + if (p_exp != p_out) begin + $display("Expected:"); + print_jb_point(p_exp); + $display("Was:"); + print_jb_point(p_out); + $fatal(1, "%m %t ERROR: test_0 point was wrong", $time); + end + + $display("test_0 PASSED"); +end +endtask; + +function compare_point(); + +endfunction + +initial begin + out_if.rdy = 0; + in_if.val = 0; + #(40*CLK_PERIOD); + + test_0(); + + #1us $finish(); +end +endmodule \ No newline at end of file diff --git a/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv b/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv index 0373721..6fad739 100644 --- a/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv +++ b/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv @@ -79,14 +79,17 @@ begin integer signed get_len; logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected, get_dat; logic [255:0] in_a, in_b; + integer start_time, finish_time; jb_point_t p_in, p_out; $display("Running test_0..."); p_in = secp256k1_pkg::G_p; k_in = k; + start_time = $time; fork in_if.put_stream(p_in, 256*3/8); out_if.get_stream(get_dat, get_len); join + finish_time = $time; p_out = get_dat; @@ -98,10 +101,11 @@ begin $fatal(1, "%m %t ERROR: test with k=%d was wrong", $time, integer'(k)); end - $display("test with k=%d PASSED", integer'(k)); + $display("test with k=%d PASSED in %d clocks", integer'(k), (finish_time-start_time)/CLK_PERIOD); end endtask; + initial begin 
out_if.rdy = 0; in_if.val = 0; @@ -117,7 +121,15 @@ initial begin test(3, {x:256'hca90ef9b06d7eb51d650e9145e3083cbd8df8759168862036f97a358f089848, y:256'h435afe76017b8d55d04ff8a98dd60b2ba7eb6f87f6b28182ca4493d7165dd127, - z:256'h9242fa9c0b9f23a3bfea6a0eb6dbcfcbc4853fe9a25ee948105dc66a2a9b5baa}); + z:256'h9242fa9c0b9f23a3bfea6a0eb6dbcfcbc4853fe9a25ee948105dc66a2a9b5baa}); + + test(4, {x:256'h9bae2d5bac61e6ea5de635bca754b2564b7d78c45277cad67e45c4cbbea6e706, + y:256'h34fb8147eed1c0fbe29ead4d6c472eb4ef7b2191fde09e494b2a9845fe3f605e, + z:256'hc327b5d2636b32f27b051e4742b1bbd5324432c1000bfedca4368a29f6654152}); + + test(1514155, {x:256'h759267d17957f567381462db6e240b75c9f6016091a7427cfbef33c398964a9d, + y:256'hd81ce7034647587a9b0ea5b52ac08c91f5cfae30f4eba2ade7fa68856fc0d691, + z:256'h7c9d27fb2de7927c982792630a0c86f411f2de60e8df44c5e9caff976658009c}); #1us $finish(); end