New files for point multiplication

2019-03-20 23:16:13 -04:00 · 2019-03-20 23:16:13 -04:00 · ee603cbf0e
parent 54d09f1744
commit ee603cbf0e
15 changed files with 1165 additions and 104 deletions
--- a/ip_cores/common/src/rtl/common_if.sv
+++ b/ip_cores/common/src/rtl/common_if.sv
@ -97,7 +97,9 @@ interface if_axi_stream # (
  endtask
  // Task used in simulation to drive data on a source interface
-  task automatic put_stream(input logic [common_pkg::MAX_SIM_BYTS*8-1:0] data, input integer signed len);
+  task automatic put_stream(input logic [common_pkg::MAX_SIM_BYTS*8-1:0] data,
                            input integer signed len,
                            input logic [CTL_BITS-1:0] ctl_in = 0);
    logic sop_l=0;
    reset_source();
@ -105,6 +107,7 @@ interface if_axi_stream # (
    while (len > 0) begin
      sop = ~sop_l;
      ctl = ctl_in;
      eop = len - DAT_BYTS <= 0;
      val = 1;
      dat = data;
@ -154,18 +157,29 @@ interface if_axi_mm # (
  input i_clk
 );
-  logic [A_BITS-1:0] raddr;
+  logic [A_BITS-1:0] addr;
-  logic [A_BITS-1:0] waddr;
+  logic [D_BITS-1:0] rd_dat;
-  logic [D_BITS-1:0] rdat;
+  logic [D_BITS-1:0] wr_dat;
-  logic [D_BITS-1:0] wdat;
+  logic              wr;
-  logic              rval;
+  logic              rd;
-  logic              wval;
+  logic              rd_dat_val;
-  logic              rrdy;
+  logic              wait_rq;
  logic              wrdy;
-  modport sink (input raddr, waddr, wdat, wval, rrdy, i_clk, output rdat, rval, wrdy);
+  modport sink (input addr, wr_dat, wr, rd, i_clk, output rd_dat, rd_dat_val, wait_rq, import task reset_sink());
-  modport source (input rdat, rval, wrdy , i_clk, output raddr, waddr, wdat, wval, rrdy);
+  modport source (input rd_dat, rd_dat_val, wait_rq , i_clk, output addr, wr_dat, wr, rd, import task reset_source());
  task reset_source();
    addr <= 0;
    wr_dat <= 0;
    wr <= 0;
    rd <= 0;
  endtask
  task reset_sink();
    rd_dat <= 0;
    rd_dat_val <= 0;
    wait_rq <= 0;
  endtask
 endinterface
--- a/ip_cores/util/src/rtl/accum_mult.sv
+++ b/ip_cores/util/src/rtl/accum_mult.sv
@ -2,6 +2,9 @@
  Accumulating multiplier. Inputs can be of different bit size and the
  level each is accumulated over can be different.
  If using Xilinx FPGA it is best to have a 1.5:1 ratio on BITS,
  since the multiplier is 27x18
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
  This program is free software: you can redistribute it and/or modify
--- a/ip_cores/util/src/rtl/barret_mod.sv
+++ b/ip_cores/util/src/rtl/barret_mod.sv
@ -120,12 +120,13 @@ always_ff @ (posedge i_clk) begin
  end
 end
-//To do the multiplications
+// Do the multiplications
 generate 
  if (MULTIPLIER == "ACCUM_MULT") begin: MULTIPLIER_GEN
    accum_mult # (
      .BITS_A  ( OUT_BITS +8 ),
-      .LEVEL_A ( 4           ) // 32 bit multiply
+      .LEVEL_A ( 6           ), // 32 bit multiply
      .LEVEL_B ( 4           )
    ) 
    accum_mult (
      .i_clk ( i_clk ),
--- a/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv
+++ b/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv
@ -2,7 +2,7 @@
  Multiplication using Karatsuba-Ofman algorithm.
  Multiple of these can be instantiated, each one takes 2 clock cycles
-  per level.
+  per level. Fully pipelined so can accept a new input every clock.
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
@ -22,11 +22,18 @@
 module karatsuba_ofman_mult # (
  parameter BITS = 256,
  parameter CTL_BITS = 8,
  parameter LEVEL = 1
 ) (
  input                       i_clk,
  input [BITS-1:0]            i_dat_a,
  input [BITS-1:0]            i_dat_b,
  input                       i_val,
  input [CTL_BITS-1:0]        i_ctl,
  input                       i_rdy,
  output logic                o_rdy,
  output logic                o_val,
  output logic [CTL_BITS-1:0] o_ctl,
  output logic [BITS*2-1:0]   o_dat
 );
@ -36,6 +43,8 @@ logic [BITS-1:0] m0, m1, m2;
 logic [BITS*2-1:0] q;
 logic [HBITS-1:0] a0, a1;
 logic sign, sign_;
 logic val;
 logic [CTL_BITS-1:0] ctl;
 generate
  always_comb begin
@ -52,8 +61,12 @@ generate
      m2 = i_dat_a[0 +: HBITS] * i_dat_b[0 +: HBITS];    
      m1 = (a0 * a1);
      sign = sign_;
      o_rdy = i_rdy;
      val = i_val;
      ctl = i_ctl;
    end
  end else begin 
    // pipeline the other non-mult values x clock cycles and add them after multipliers
    logic [LEVEL-2:0] sign_r;
@ -68,34 +81,55 @@ generate
    karatsuba_ofman_mult # (
      .BITS     ( HBITS    ),
      .CTL_BITS ( CTL_BITS ),
      .LEVEL    ( LEVEL-1  )
    )
    karatsuba_ofman_mult_m0 (
      .i_clk   ( i_clk                   ),
      .i_dat_a ( i_dat_a[HBITS +: HBITS] ),
      .i_dat_b ( i_dat_b[HBITS +: HBITS] ),
      .i_val   ( i_val                   ),
      .o_val   ( val                     ),
      .i_ctl   ( i_ctl                   ),
      .o_ctl   ( ctl                     ),
      .i_rdy   ( i_rdy                   ),
      .o_rdy   ( o_rdy                   ),
      .o_dat   ( m0                      )
    );
    karatsuba_ofman_mult # (
      .BITS     ( HBITS   ),
      .CTL_BITS ( 1       ),
      .LEVEL    ( LEVEL-1 )
    )
    karatsuba_ofman_mult_m2 (
      .i_clk   ( i_clk               ),
      .i_dat_a ( i_dat_a[0 +: HBITS] ),
      .i_dat_b ( i_dat_b[0 +: HBITS] ),
      .i_val   ( i_val               ),
      .o_val   (),
      .i_ctl   ( 1'd0                ),
      .o_ctl   (),
      .i_rdy   ( i_rdy               ),
      .o_rdy   (),      
      .o_dat   ( m2                  )
    );
    karatsuba_ofman_mult # (
      .BITS     ( HBITS   ),
      .CTL_BITS ( 1       ),
      .LEVEL    ( LEVEL-1 )
    )
    karatsuba_ofman_mult_m1 (
      .i_clk   ( i_clk ),
      .i_dat_a ( a0    ),
      .i_dat_b ( a1    ),
      .i_val   ( i_val ),
      .o_val   (),
      .i_ctl   ( 1'd0  ),
      .o_ctl   (),
      .i_rdy   ( i_rdy ),
      .o_rdy   (),            
      .o_dat   ( m1    )
    );
@ -105,6 +139,8 @@ endgenerate
 always_ff @ (posedge i_clk) begin
  o_dat <= q;
  o_val <= val;
  o_ctl <= ctl;
 end
 endmodule
--- a/ip_cores/util/src/tb/accum_mult_tb.sv
+++ b/ip_cores/util/src/tb/accum_mult_tb.sv
@ -23,8 +23,8 @@ localparam CLK_PERIOD = 100;
 logic clk, rst;
-if_axi_stream #(.DAT_BYTS(512/8)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(66)) in_if(clk);
-if_axi_stream #(.DAT_BYTS(512/8)) out_if(clk);
+if_axi_stream #(.DAT_BYTS(66)) out_if(clk);
 initial begin
  rst = 0;
@ -50,15 +50,15 @@ always_ff @ (posedge clk)
    $error(1, "%m %t ERROR: output .err asserted", $time);
 accum_mult # (
-  .BITS_A  ( 256 ),
+  .BITS_A  ( 264 ),
  .LEVEL_A ( 4   ),
-  .LEVEL_B ( 4   )
+  .LEVEL_B ( 6   )
 ) 
 accum_mult (
  .i_clk ( clk ),
  .i_rst ( rst ),
-  .i_dat_a ( in_if.dat[0 +: 256]   ),
+  .i_dat_a ( in_if.dat[0 +: 264]   ),
-  .i_dat_b ( in_if.dat[256 +: 256] ),
+  .i_dat_b ( in_if.dat[264 +: 264] ),
  .i_val   ( in_if.val             ),
  .o_rdy ( in_if.rdy ),
  .o_dat ( out_if.dat ),
@ -72,20 +72,20 @@ task test_loop();
 begin
  integer signed get_len;
  logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected,  get_dat;
-  logic [255:0] in_a, in_b;
+  logic [263:0] in_a, in_b;
  integer i, max;
-  
+  get_dat = 0;
  $display("Running test_loop...");
  i = 0;
  max = 10000;
  while (i < max) begin
-    in_a = random_vector(256/8);
+    in_a = random_vector(264/8);
-    in_b = random_vector(256/8);
+    in_b = random_vector(264/8);
    expected = (in_a * in_b);
    fork
-      in_if.put_stream({in_b, in_a}, 512/8);
+      in_if.put_stream({in_b, in_a}, 528/8);
      out_if.get_stream(get_dat, get_len);
    join
--- a/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv
+++ b/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv
@ -24,10 +24,9 @@ localparam CLK_PERIOD = 100;
 logic clk, rst;
-if_axi_stream #(.DAT_BYTS(512/8)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(512/8), .CTL_BITS(8)) in_if(clk);
-if_axi_stream #(.DAT_BYTS(512/8)) out_if(clk);
+if_axi_stream #(.DAT_BYTS(512/8), .CTL_BITS(8)) out_if(clk);
 logic [511:0] test; 
 initial begin
  rst = 0;
  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
@ -50,33 +49,26 @@ always_ff @ (posedge clk)
  if (out_if.val && out_if.err)
    $error(1, "%m %t ERROR: output .err asserted", $time);
-localparam LEVEL = 3;
+localparam LEVEL = 2;
 logic [LEVEL-1:0] val;
 karatsuba_ofman_mult # (
  .BITS     ( 256   ),
  .CTL_BITS ( 8     ),
  .LEVEL    ( LEVEL )
 )
 karatsuba_ofman_mult (
  .i_clk  ( clk                   ),
  .i_dat_a( in_if.dat[0 +: 256]   ),
  .i_dat_b( in_if.dat[256 +: 256] ),
  .i_val  ( in_if.val  ),
  .o_val  ( out_if.val ),
  .i_ctl  ( in_if.ctl  ),
  .o_ctl  ( out_if.ctl ),
  .i_rdy  ( out_if.rdy ),
  .o_rdy  ( in_if.rdy  ),
  .o_dat  ( out_if.dat )
 );
 always_ff @ (posedge clk) begin
  if (rst) begin
    val <= 0;
  end else begin
    val <= {val, in_if.val};
  end  
 end
 always_comb begin
  out_if.val = val[LEVEL-1];
  in_if.rdy = out_if.rdy;
 end
 task test_loop();
 begin
  integer signed get_len;
@ -94,7 +86,7 @@ begin
    expected = (in_a * in_b);
    fork
-      in_if.put_stream({in_b, in_a}, 512/8);
+      in_if.put_stream({in_b, in_a}, 512/8, i);
      out_if.get_stream(get_dat, get_len);
    join
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_mod.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_mod.sv
@ -24,16 +24,19 @@
 */
 module secp256k1_mod #(
-  parameter USE_MULT = 0   // Set to 1 to use multiple operation (should infer DSP and use less LUTs)
+  parameter USE_MULT = 0,   // Set to 1 to use multiple operation (should infer DSP and use less LUTs)
  parameter CTL_BITS = 8
 )(
  input i_clk, i_rst,
  // Input value
  input [256*2-1:0]    i_dat,
  input                i_val,
  input                i_err,
  input [CTL_BITS-1:0] i_ctl,
  output logic         o_rdy,
  // output
  output logic [255:0]        o_dat,
  output logic [CTL_BITS-1:0] o_ctl,
  input                       i_rdy,
  output logic                o_val,
  output logic                o_err // Will go high if after 1 reduction we are still >= p
@ -43,6 +46,7 @@ import secp256k1_pkg::*;
 logic [256*2-1:0] res0, res1;
 logic [1:0] val, err;
 logic [1:0][CTL_BITS-1:0] ctl;
 generate
  if (USE_MULT == 1) begin: GEN_MULT
@ -74,16 +78,20 @@ always_ff @ (posedge i_clk) begin
    val <= 0;
    err <= 0;
    o_val <= 0;
    ctl <= 0;
    o_err <= 0;
  end else begin
    o_val <= 0;
    val <= val << 1;
    ctl <= {ctl, i_ctl};
    err <= err << 1;
    val[0] <= i_val;
    err[0] <= i_err;
    o_dat <= res1 >= p_eq ? res1 - p_eq : res1;
    o_err <= err[1] || (res1 >= 2*p_eq);
    o_val <= val[1];
    o_ctl <= ctl[1];
  end
 end
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_mult_mod.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_mult_mod.sv
@ -24,16 +24,20 @@
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
-module secp256k1_mult_mod (
+module secp256k1_mult_mod #(
  parameter CTL_BITS = 8
 )(
  input i_clk, i_rst,
  // Input value
  input [255:0]        i_dat_a,
  input [255:0]        i_dat_b,
  input [CTL_BITS-1:0] i_ctl,
  input                i_val,
  input                i_err,
  output logic         o_rdy,
  // output
  output logic [255:0]        o_dat,
  output logic [CTL_BITS-1:0] o_ctl,
  input                       i_rdy,
  output logic                o_val,
  output logic                o_err 
@ -45,51 +49,55 @@ import common_pkg::*;
 localparam KARATSUBA_LEVEL = 2;
 if_axi_stream #(.DAT_BYTS(512/8)) int_if(i_clk);
-always_comb o_rdy = int_if.rdy;
+logic [KARATSUBA_LEVEL-1:0] err;
 logic [KARATSUBA_LEVEL-1:0] val, err;
 karatsuba_ofman_mult # (
  .BITS     ( 256             ),
-  .LEVEL ( KARATSUBA_LEVEL )
+  .LEVEL    ( KARATSUBA_LEVEL ),
  .CTL_BITS ( CTL_BITS        )
 )
 karatsuba_ofman_mult (
  .i_clk  ( i_clk      ),
  .i_ctl  ( i_ctl      ),
  .i_dat_a( i_dat_a    ),
  .i_dat_b( i_dat_b    ),
-  .o_dat  ( int_if.dat )
+  .i_val  ( i_val      ),
  .o_rdy  ( o_rdy      ),
  .o_dat  ( int_if.dat ),
  .o_val  ( int_if.val ),
  .i_rdy  ( int_if.rdy ),
  .o_ctl  ( int_if.ctl )
 );
 always_ff @ (posedge i_clk) begin
  if (i_rst) begin
    val <= 0;
    err <= 0;
  end else begin
    val <= {val, i_val};
    err <= {err, i_err};
  end
 end
 always_comb begin
  int_if.val = val[KARATSUBA_LEVEL-1];
  int_if.err = err[KARATSUBA_LEVEL-1];
  int_if.mod = 0;
  int_if.sop = 0;
  int_if.eop = 0;
  int_if.ctl = 0;
 end
 secp256k1_mod #(
-  .USE_MULT ( 0 )
+  .USE_MULT ( 0        ),
  .CTL_BITS ( CTL_BITS )
 )
 secp256k1_mod (
  .i_clk( i_clk       ),
  .i_rst( i_rst       ),
  .i_dat( int_if.dat  ),
  .i_val( int_if.val  ),
  .i_ctl( int_if.ctl  ),
  .i_err( int_if.err  ),
  .o_rdy( int_if.rdy  ),
  .o_dat( o_dat ),
  .o_ctl( o_ctl ),
  .o_err( o_err ),
  .i_rdy( i_rdy ),
  .o_val( o_val )
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv
@ -30,4 +30,51 @@ package secp256k1_pkg;
  parameter [255:0] p_eq =  (1 << 256) - (1 << 32) - (1 << 9) - (1 << 8) - (1 << 7) - (1 << 6) - (1 << 4) - 1;
  // Use register map for debug, holds information on current operation
  parameter REGISTER_SIZE = 64;
  // The mapping to index
  parameter CURR_CMD = 0;     // What command are we processing
  parameter CURR_STATE = 1;   // What state are we in
  // If it is processing a signature verification, these bits will be populated:
  parameter SIG_VER_HASH = 8; // 256 bits
  parameter SIG_VER_S = 12;   // 256 bits
  parameter SIG_VER_R = 16;   // 256 bits
  parameter SIG_VER_Q = 20;   // 512 bits
  parameter SIG_VER_W = 28;   // 256 bits - Result of invert(s)
  // Expected to be in Jacobian coordinates
  typedef struct packed {
    logic [255:0] x, y, z;
  } jb_point_t;
  typedef struct packed {
    logic [5:0] padding;
    logic X_INFINITY_POINT;
    logic OUT_OF_RANGE_S;
    logic OUT_OF_RANGE_R;
  } secp256k1_ver_t; 
  // Returns 1 when p matches the encoding (x = 0, y = 0, z = 1), which this
  // package treats as the zero point.
  function is_zero(jb_point_t p);
    return (p.z == 1) && (p.y == 0) && (p.x == 0);
  endfunction
  // Function to double point in Jacobian coordinates (for comparison in testbench)
  // Here a is 0, and we also mod p the result.
  //
  // All intermediate values are held in 1024-bit temporaries so that no product
  // is truncated before the reduction, and every subtraction has a multiple of
  // p_eq added first so the unsigned arithmetic can never wrap around.
  function jb_point_t dbl_jb_point(jb_point_t p);
    logic [1023:0] A, B, C, D, X, Y, Z;
    A = (p.y*p.y) % p_eq;
    B = (4*p.x*A) % p_eq;
    C = (8*A*A) % p_eq;
    D = (3*p.x*p.x) % p_eq;
    // X = D^2 - 2*B. B < p_eq, so adding 2*p_eq keeps the value non-negative.
    X = (D*D + 2*p_eq - 2*B) % p_eq;
    // Y = D*(B - X) - C. B, X and C are all < p_eq, so one p_eq per subtraction is enough.
    Y = (D*((B + p_eq - X) % p_eq) + p_eq - C) % p_eq;
    // Z = 2*y*z. Assigning into a 1024-bit temporary makes the product full width;
    // writing straight into the 256-bit struct field would truncate it before the mod.
    Z = (2*p.y*p.z) % p_eq;
    dbl_jb_point.x = X[255:0];
    dbl_jb_point.y = Y[255:0];
    dbl_jb_point.z = Z[255:0];
  endfunction
  // Simulation helper: prints the x, y and z coordinates of p in hex, one per line.
  // No return value is assigned.
  function print_jb_point(jb_point_t p);
    $display("x:%h", p.x);
    $display("y:%h", p.y);
    $display("z:%h", p.z);
  endfunction
 endpackage
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_dbl.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_dbl.sv
@ -0,0 +1,258 @@
 /*
  This performs point doubling.
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 module secp256k1_point_dbl
  import secp256k1_pkg::*;
 #(
 )(
  input i_clk, i_rst,
  // Input point
  input jb_point_t i_p,
  input logic   i_val,
  output logic  o_rdy,
  // Output point
  output jb_point_t o_p,
  input logic    i_rdy,
  output logic   o_val,
  output logic   o_err,
  // Interface to 256bit multiplier (mod p)
  // o_* carries requests out of this module, i_* carries results back in.
  // The result side is a sink: this module drives .rdy and reads .val/.dat/.ctl.
  if_axi_stream.source o_mult_if,
  if_axi_stream.sink   i_mult_if,
  // Interface to only mod reduction block
  if_axi_stream.source o_mod_if,
  if_axi_stream.sink   i_mod_if
 );
 /*
 * These are the equations that need to be computed, they are issued as variables
 * become valid. We have a bitmask to track what equation results are valid which
 * will trigger other equations. [] show what equations must be valid before this starts.
 *
 * eq_wait[n] is set when equation n has been issued, eq_val[n] when its result
 * has been stored. The equation number travels with the request as .ctl and
 * comes back with the result so it can be routed to the right variable.
 *
 * 0.    A = (i_p.y)^2 mod p
 * 1.    B = (i_p.x)*A mod p [eq0]
 * 2.    B = 4*B mod p [eq1]
 * 3.    C = A^2 mod p [eq0]
 * 4.    C = C*8 mod p [eq3]
 * 5.    D = (i_p.x)^2 mod p
 * 6.    D = 3*D mod p [eq5]
 * 7.   (o_p.x) = D^2 mod p[eq6]
 * 8.    E = 2*B mod p [eq2]
 * 9.   (o_p.x) = o_p.x - E mod p [eq8, eq7]
 * 10   (o_p.y) =  B - o_p.x mod p [eq9, eq2]
 * 11.   (o_p.y) = D*(o_p.y) [eq10, eq6]
 * 12.   (o_p.y) = (o_p.y) - C mod p [eq11]
 * 13.   (o_p.z) = 2*(i_p.y) mod p
 * 14.   (o_p.z) = o_p.z * i_p.z mod p [eq13]
 */
 logic [14:0] eq_val, eq_wait;
 // Temporary variables
 logic [255:0] A, B, C, D, E;
 jb_point_t i_p_l;
 enum {IDLE, START, FINISHED} state;
 always_ff @ (posedge i_clk) begin
  if (i_rst) begin
    o_val <= 0;
    o_rdy <= 0;
    o_p <= 0;
    o_mult_if.reset_source();
    o_mod_if.reset_source();
    i_mult_if.rdy <= 0;
    i_mod_if.rdy <= 0;
    eq_val <= 0;
    state <= IDLE;
    eq_wait <= 0;
    i_p_l <= 0;
    o_err <= 0;
    A <= 0;
    B <= 0;
    C <= 0;
    D <= 0;
    E <= 0;
  end else begin
    // Drop a request once the downstream block has taken it. The multiply() /
    // modulo() tasks below may re-assert val in the same cycle for a new request.
    if (o_mult_if.rdy)
      o_mult_if.val <= 0;
    if (o_mod_if.rdy)
      o_mod_if.val <= 0;
    case(state)
      IDLE: begin
        o_rdy <= 1;
        eq_val <= 0;
        eq_wait <= 0;
        o_err <= 0;
        i_mult_if.rdy <= 1;
        i_p_l <= i_p;
        A <= 0;
        B <= 0;
        C <= 0;
        D <= 0;
        E <= 0;
        if (i_val && o_rdy) begin
          state <= START;
          o_rdy <= 0;
          if (i_p.z == 0) begin
            // Reject the input. Park in FINISHED (not IDLE) so that o_err stays
            // asserted until it has been presented together with o_val and
            // accepted, and so that no new input is taken in the meantime.
            o_err <= 1;
            state <= FINISHED;
          end
        end
      end
      // Just a big if tree where we issue equations if the required inputs
      // are valid
      START: begin
        i_mod_if.rdy <= 1;
        i_mult_if.rdy <= 1;
        // Check any results from modulo reducer
        if (i_mod_if.val && i_mod_if.rdy) begin
          eq_val[i_mod_if.ctl] <= 1;
          case(i_mod_if.ctl)
            2: B <= i_mod_if.dat;
            4: C <= i_mod_if.dat;
            8: E <= i_mod_if.dat;
            13: o_p.z <= i_mod_if.dat;
            default: o_err <= 1;
          endcase
        end
        // Check any results from multiplier
        if (i_mult_if.val && i_mult_if.rdy) begin
          eq_val[i_mult_if.ctl] <= 1;
          case(i_mult_if.ctl) inside
            0: A <= i_mult_if.dat;
            1: B <= i_mult_if.dat;
            3: C <= i_mult_if.dat;
            5: D <= i_mult_if.dat;
            6: D <= i_mult_if.dat;
            7: o_p.x <= i_mult_if.dat;
            11: o_p.y <= i_mult_if.dat;
            14: o_p.z <= i_mult_if.dat;
            default: o_err <= 1;
          endcase
        end
        // Issue new multiplies (at most one per cycle, first match wins)
        if (~eq_wait[0]) begin              //0.    A = (i_p.y)^2 mod p
          multiply(0, i_p_l.y, i_p_l.y);
        end else
        if (eq_val[0] && ~eq_wait[1]) begin //1.    B = (i_p.x)*A mod p [eq0]
          multiply(1, i_p_l.x, A);
        end else
        if (eq_val[0] && ~eq_wait[3]) begin //3.    C = A^2 mod p [eq0]
          multiply(3, A, A);
        end else
        if (~eq_wait[5]) begin              //5.    D = (i_p.x)^2 mod p
          multiply(5, i_p_l.x, i_p_l.x);
        end else
        if (eq_val[5] && ~eq_wait[6]) begin //6.    D = 3*D mod p [eq5]
          multiply(6, 256'd3, D);
        end else
        if (eq_val[6] && ~eq_wait[7]) begin //7.   (o_p.x) = D^2 mod p[eq6]
          multiply(7, D, D);
        end else
        if (eq_val[10] && eq_val[6] && ~eq_wait[11]) begin //11.   (o_p.y) = D*(o_p.y) [eq10, eq6]
          multiply(11, D, o_p.y);
        end else
        if (eq_val[13] && ~eq_wait[14]) begin //14.   (o_p.z) = o_p.z * i_p.z mod p [eq13]
          multiply(14, i_p_l.z, o_p.z);
        end
        // Issue new modulo reductions (at most one per cycle, first match wins)
        if (eq_val[1] && ~eq_wait[2]) begin //2.    B = 4*B mod p [eq1]
          modulo(2, B*4);
        end else
        if (eq_val[3] && ~eq_wait[4]) begin //4.    C = C*8 mod p [eq3]
          modulo(4, C*8);
        end else
        if (eq_val[2] && ~eq_wait[8]) begin //8.    E = 2*B mod p [eq2]
          modulo(8, B*2);
        end else
        if (~eq_wait[13]) begin            //13.   (o_p.z) = 2*(i_p.y) mod p
          modulo(13, 2*i_p_l.y);
        end
        // Additions / subtractions we do in-module
        // Each adds p first when the subtrahend is larger, so the 256-bit result
        // is the non-negative residue.
        if (eq_val[8] && eq_val[7] && ~eq_wait[9]) begin //9.   (o_p.x) = o_p.x - E mod p [eq8, eq7]
          eq_wait[9] <= 1;
          eq_val[9] <= 1;
          o_p.x <= o_p.x + (E > o_p.x ? secp256k1_pkg::p : 0) - E;
        end
        if (eq_val[9] && eq_val[2] && ~eq_wait[10]) begin //10.   (o_p.y) =  B - o_p.x mod p [eq9, eq2]
          eq_wait[10] <= 1;
          eq_val[10] <= 1;
          o_p.y <= B + (o_p.x > B ? secp256k1_pkg::p : 0) - o_p.x;
        end
        if (eq_val[11] && ~eq_wait[12]) begin //12.   (o_p.y) = (o_p.y) - C mod p [eq11]
          eq_wait[12] <= 1;
          eq_val[12] <= 1;
          o_p.y <= o_p.y + (C > o_p.y ? secp256k1_pkg::p : 0) - C;
        end
        // Every equation has produced its result
        if (&eq_val) begin
          state <= FINISHED;
          o_val <= 1;
        end
      end
      FINISHED: begin
        if (o_val && i_rdy) begin
          state <= IDLE;
          o_val <= 0;
          o_rdy <= 1;
        end
      end
    endcase
    // Error reporting: present o_err together with o_val and hold both until
    // the consumer accepts. This block comes after the case statement so its
    // assignments take priority over anything assigned above.
    if (o_err) begin
      o_val <= 1;
      if (o_val && i_rdy) begin
        // Clear o_val as well, otherwise it would stay high after returning to
        // IDLE (IDLE never clears it) and signal a bogus result.
        o_val <= 0;
        o_err <= 0;
        state <= IDLE;
      end
    end
  end
 end
 // Task for using multiplies
 // Queues a*b on the multiplier request interface, tagged with equation number
 // ctl. Does nothing (and leaves eq_wait[ctl] clear so the caller retries) when
 // a previous request is still waiting to be accepted.
 task multiply(input int unsigned ctl, input logic [255:0] a, b);
  if (~o_mult_if.val || (o_mult_if.val && o_mult_if.rdy)) begin
    o_mult_if.val <= 1;
    o_mult_if.dat[0 +: 256] <= a;
    o_mult_if.dat[256 +: 256] <= b;
    o_mult_if.ctl <= ctl;
    eq_wait[ctl] <= 1;
  end
 endtask
 // Task for using modulo
 // Queues a for reduction on the modulo request interface, tagged with equation
 // number ctl. Same back-pressure behaviour as multiply().
 // NOTE(review): 512 bits assumes the connected interface is built with
 // DAT_BYTS = 512/8 - confirm at the instantiation site.
 task modulo(input int unsigned ctl, input logic [511:0] a);
  if (~o_mod_if.val || (o_mod_if.val && o_mod_if.rdy)) begin
    o_mod_if.val <= 1;
    o_mod_if.dat <= a;
    o_mod_if.ctl <= ctl;
    eq_wait[ctl] <= 1;
  end
 endtask
 endmodule
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv
@ -0,0 +1,180 @@
 /*
  This performs point multiplication. We use the standard double
  and add algorithm.
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 module secp256k1_point_mult
  import secp256k1_pkg::*;
 #(
 )(
  input i_clk, i_rst,
  // Input point and value to multiply
  input jb_point_t    i_p,
  input logic [255:0] i_k,
  input logic   i_val,
  output logic  o_rdy,
  // Output point
  output jb_point_t o_p,
  input logic    i_rdy,
  output logic   o_val,
  output logic   o_err
 );
 // Request (*_in_if) and result (*_out_if) streams between the point doubler
 // and the multiplier / modulo blocks instantiated at the bottom of this module
 if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mult_in_if(i_clk);
 if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mult_out_if(i_clk);
 if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mod_in_if(i_clk);
 if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mod_out_if(i_clk);
 logic [255:0] k_l;          // Remaining scalar bits, shifted right after each doubling
 jb_point_t p_n, p_q, p_dbl; // Point being doubled, accumulator, doubler output
 logic p_dbl_in_val, p_dbl_in_rdy, p_dbl_out_err, p_dbl_out_val, p_dbl_out_rdy;
 enum {IDLE, DOUBLE, ADD, FINISHED} state;
 always_ff @ (posedge i_clk) begin
  if (i_rst) begin
    o_val <= 0;
    o_err <= 0;
    o_rdy <= 0;
    k_l <= 0;
    p_q <= 0;
    p_dbl_in_val <= 0;
    p_dbl_out_rdy <= 0;
    state <= IDLE;
    o_p <= 0;
    p_n <= 0;
  end else begin
    // p_dbl_in_val is deliberately NOT defaulted to 0 here: a request must stay
    // asserted until the doubler accepts it (see DOUBLE), otherwise it would be
    // lost whenever the doubler is not ready in the cycle it is raised.
    p_dbl_out_rdy <= 1;
    case (state)
      IDLE: begin
        o_rdy <= 1;
        o_err <= 0;
        p_q <= '{x:0, y:0, z:1};  // p_q starts at 0
        if (o_rdy && i_val) begin
          k_l <= i_k;
          p_n <= i_p;
          // Regardless of i_k[0] we skip the first add since it would set p_q to i_p
          if (i_k[0]) begin
            p_q <= i_p;
          end
          state <= DOUBLE;
          p_dbl_in_val <= 1;
        end
      end
      DOUBLE: begin
        // Request accepted by the doubler
        if(p_dbl_in_val && p_dbl_in_rdy) begin
          p_dbl_in_val <= 0;
        end
        if (p_dbl_out_val && p_dbl_out_rdy) begin
          p_n <= p_dbl;
          k_l <= k_l >> 1;
          if (k_l[1] == 1) begin
            state <= ADD;
          end else if (k_l[255:1] == 0) begin
            state <= FINISHED;
            o_p <= p_dbl;
            o_val <= 1;
          end else begin
            state <= DOUBLE;
            p_dbl_in_val <= 1;
          end
        end
      end
      // NOTE(review): no point addition is performed here yet; the accumulator
      // is simply overwritten with the current doubled point.
      ADD: begin
        state <= DOUBLE;
        p_q <= p_n;
        p_dbl_in_val <= 1;
      end
      FINISHED: begin
        if (i_rdy && o_val) begin
          o_val <= 0;
          state <= IDLE;
        end
      end
    endcase
    // An error from the doubler aborts the operation. Placed after the case
    // statement so it overrides any state / o_val assignment made above.
    if (p_dbl_out_err) begin
      o_err <= 1;
      o_val <= 1;
      state <= FINISHED;
      // Make sure no further doubling is requested while we report the error
      p_dbl_in_val <= 0;
    end
  end
 end
 secp256k1_point_dbl secp256k1_point_dbl(
  .i_clk ( i_clk ),
  .i_rst ( i_rst ),
  // Input point
  .i_p   ( p_n           ),
  .i_val ( p_dbl_in_val  ),
  .o_rdy ( p_dbl_in_rdy  ),
  // Output point
  .o_p   ( p_dbl         ),
  .o_err ( p_dbl_out_err ),
  .i_rdy ( p_dbl_out_rdy ),
  .o_val ( p_dbl_out_val ),
  // Interfaces to shared multipliers / modulo blocks
  .o_mult_if ( mult_in_if  ),
  .i_mult_if ( mult_out_if ),
  .o_mod_if  ( mod_in_if   ),
  .i_mod_if  ( mod_out_if  )
 );
 secp256k1_mult_mod #(
  .CTL_BITS ( 8 )
 )
 secp256k1_mult_mod (
  .i_clk ( i_clk ),
  .i_rst ( i_rst ),
  .i_dat_a ( mult_in_if.dat[0 +: 256] ),
  .i_dat_b ( mult_in_if.dat[256 +: 256] ),
  .i_val ( mult_in_if.val ),
  .i_err ( mult_in_if.err ),
  .i_ctl ( mult_in_if.ctl ),
  .o_rdy ( mult_in_if.rdy ),
  .o_dat ( mult_out_if.dat ),
  .i_rdy ( mult_out_if.rdy ),
  .o_val ( mult_out_if.val ),
  .o_ctl ( mult_out_if.ctl ),
  .o_err ( mult_out_if.err )
 );
 secp256k1_mod #(
  .USE_MULT ( 0 ),
  .CTL_BITS ( 8 )
 )
 secp256k1_mod (
  .i_clk( i_clk     ),
  .i_rst( i_rst     ),
  .i_dat( mod_in_if.dat  ),
  .i_val( mod_in_if.val  ),
  .i_err( mod_in_if.err  ),
  .i_ctl( mod_in_if.ctl  ),
  .o_rdy( mod_in_if.rdy  ),
  .o_dat( mod_out_if.dat ),
  .o_ctl( mod_out_if.ctl ),
  .o_err( mod_out_if.err ),
  .i_rdy( mod_out_if.rdy ),
  .o_val( mod_out_if.val )
 );
 endmodule
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_top.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_top.sv
@ -1,12 +1,241 @@
 module secp256k1_top (
  input          i_clk,
  input          i_rst,
-  input                i_val,
+  // Command interface
-  output logic         o_rdy,
+  if_axi_stream.sink if_cmd_rx,
-  output logic         o_val
+  if_axi_stream.source if_cmd_tx,
-  
+  // Memory map interface for debug
-  
+  if_axi_mm.sink if_axi_mm        
 );
 import secp256k1_pkg::*;
 import zcash_fpga_pkg::*;
 // Register map is used for storing command data
 logic [REGISTER_SIZE/64-1:0][63:0] register_map;
 if_ram #(.RAM_WIDTH(64), .RAM_DEPTH(REGISTER_SIZE)) register_file_a (i_clk, i_rst);
 if_ram #(.RAM_WIDTH(64), .RAM_DEPTH(REGISTER_SIZE)) register_file_b (i_clk, i_rst);
 // 256 multiplier (karatsuba)
 logic [255:0] mult_dat_a, mult_dat_b;
 logic mult_dat_val;
 if_axi_stream #(.DAT_BYTS(512/8)) mult_out_if(i_clk);
 // 256 bit inverse calculation
 if_axi_stream #(.DAT_BYTS(256/8)) bin_inv_in_if(i_clk);
 if_axi_stream #(.DAT_BYTS(256/8)) bin_inv_out_if(i_clk);
 // TODO just have one multiplier (unless double & add is parallel)
 // one multiplier that barret reduction can share?
 // Can avoid final inversion converting from projected coord by some check in c++ code
 // Controlling state machine
 typedef enum {IDLE,
              VERIFY_SECP256K1_SIG_PARSE,           // Parse inputs
              CALC_S_INV,
              POINT_DBL,
              POINT_ADD,
              IGNORE,
              FINISHED} secp256k1_state_t;
 secp256k1_state_t secp256k1_state;
 header_t header, header_l;
 secp256k1_ver_t secp256k1_ver;
 // Other temporary values
 logic [255:0] r, w;
 logic [5:0] cnt; // Counter for parsing command inputs
 logic if_axi_mm_rd;
 // View the current command-stream data word as a header_t so the state
 // machine can read header.cmd directly.
 always_comb begin
  header = if_cmd_rx.dat;
 end
 // Command state machine: takes a command from if_cmd_rx, stores the signature
 // verification operands in the register file (port A) and starts the inverse
 // calculation on S as soon as all of its words have arrived.
 always_ff @ (posedge i_clk) begin
  if (i_rst) begin
    secp256k1_state <= IDLE;
    if_cmd_tx.reset_source();
    if_cmd_rx.reset_sink();
    cnt <= 0;
    mult_out_if.rdy <= 0;
    register_file_a.reset_source();
    mult_dat_a <= 0;
    mult_dat_b <= 0;
    mult_dat_val <= 0;
    w <= 0;
    r <= 0;
    bin_inv_in_if.reset_source();
    bin_inv_out_if.rdy <= 0;
    secp256k1_ver <= 0;
  end else begin
    // Defaults for this cycle, overridden below where needed
    register_file_a.en <= 1;
    register_file_a.wr <= 0;
    register_file_a.rd <= 1;
    mult_out_if.rdy <= 1;
    bin_inv_out_if.rdy <= 1;
    mult_dat_val <= 0;
    case(secp256k1_state)
      IDLE: begin
        secp256k1_ver <= 0;
        if_cmd_rx.rdy <= 1;
        header_l <= header;
        cnt <= 0;
        if (if_cmd_rx.val && if_cmd_rx.rdy) begin
          case(header.cmd)
            {VERIFY_SECP256K1_SIG}: begin
              register_map[CURR_CMD] <= header;
              secp256k1_state <= VERIFY_SECP256K1_SIG_PARSE;
            end
            default: begin
              // Unknown command: drop the rest of the packet, if there is any
              if (~if_cmd_rx.eop) begin
                if_cmd_rx.rdy <= 1;
                secp256k1_state <= IGNORE;
              end
            end
          endcase
        end
      end
      // cnt counts accepted 64-bit words: 0-3 = S, 4-7 = R, 8-11 = hash, 12-19 = Q
      VERIFY_SECP256K1_SIG_PARSE: begin
        if_cmd_rx.rdy <= 1;
        if (if_cmd_rx.val && if_cmd_rx.rdy) begin
          register_file_a.wr <= 1;
          cnt <= cnt + 1;
          if (cnt == 19) secp256k1_state <= CALC_S_INV;
        end
        if (bin_inv_in_if.val && bin_inv_in_if.rdy)
          bin_inv_in_if.val <= 0;
        case(cnt) inside
          [0:3]: begin
            register_file_a.a <= SIG_VER_S + (cnt % 4);
            register_file_a.d <= if_cmd_rx.dat;
            // Can start calculating the inverse here
            bin_inv_in_if.dat[(cnt % 4)*64 +: 64] <= if_cmd_rx.dat;
            if (cnt == 3) begin
              bin_inv_in_if.val <= 1;
            end
            end
          [4:7]: begin
            // We can load R into the karatsuba multiplier
            register_file_a.a <= SIG_VER_R + (cnt % 4);
            register_file_a.d <= if_cmd_rx.dat;
            mult_dat_a[(cnt % 4)*64 +: 64] <= if_cmd_rx.dat;
          end
          [8:11]: begin
            register_file_a.a <= SIG_VER_HASH + (cnt % 4);
            register_file_a.d <= if_cmd_rx.dat;
          end
          [12:19]: begin
            // Offset from the first Q word. (cnt % 8) would give 4..7,0..3 for
            // cnt = 12..19 and store the two halves of Q swapped.
            register_file_a.a <= SIG_VER_Q + (cnt - 12);
            register_file_a.d <= if_cmd_rx.dat;
          end
        endcase
      end
      CALC_S_INV: begin
        // Wait until bin_inv_out_if.val
        if (bin_inv_in_if.dat >= secp256k1_pkg::n) secp256k1_ver.OUT_OF_RANGE_S <= 1;
        if (mult_dat_a >= secp256k1_pkg::n) secp256k1_ver.OUT_OF_RANGE_R <= 1;
        if (bin_inv_out_if.val && bin_inv_out_if.rdy) begin
          w <= bin_inv_out_if.dat;
          // TODO also write this to RAM
          // need to do 2 multiplications % n to get u1 and u2
        end
      end
      // Discard words until the end of the current packet
      IGNORE: begin
        if_cmd_rx.rdy <= 1;
        if (if_cmd_rx.rdy && if_cmd_rx.val && if_cmd_rx.eop)
          secp256k1_state <= IDLE;
      end
    endcase
  end
 end
 // TODO could provide write access
 always_comb begin
 end
 // Debug read path: every cycle the memory-mapped address (divided by 8) is
 // presented on register file port B, and a read strobe is answered one cycle
 // later with whatever port B is outputting at that point.
 always_ff @ (posedge i_clk) begin
  if (!i_rst) begin
    // Port B is permanently enabled for reading
    register_file_b.rd <= 1;
    register_file_b.en <= 1;
    register_file_b.a <= if_axi_mm.addr/8;
    // Delay the read strobe by one cycle
    if_axi_mm_rd <= if_axi_mm.rd;
    // Response valid is low unless a delayed read strobe is present
    if_axi_mm.rd_dat_val <= 0;
    if (if_axi_mm_rd) begin
      if_axi_mm.rd_dat <= register_file_b.q;
      if_axi_mm.rd_dat_val <= 1;
    end
  end else begin
    if_axi_mm.reset_sink();
    register_file_b.reset_source();
  end
 end
 // BRAM for storing parsed inputs
 bram #(
  .RAM_WIDTH       ( 64                 ),
  .RAM_DEPTH       ( REGISTER_SIZE      ),
  .RAM_PERFORMANCE ( "HIGH_PERFORMANCE" )
 ) register_file (
  .a ( register_file_a ),
  .b ( register_file_b )
 );
 // Calculate binary inverse mod n
 // Input is fed from bin_inv_in_if, the result is returned on bin_inv_out_if.
 begin: BINARY_INVERSE_MOD_N
  bin_inv #(
    .BITS ( 256              ),
    .P    ( secp256k1_pkg::n )
  )
  bin_inv ( // A module instantiation must have an instance name
    .i_clk ( i_clk ),
    .i_rst ( i_rst) ,
    .i_dat ( bin_inv_in_if.dat ),
    .i_val ( bin_inv_in_if.val ),
    .o_rdy ( bin_inv_in_if.rdy ),
    .o_dat ( bin_inv_out_if.dat ),
    .o_val ( bin_inv_out_if.val ),
    .i_rdy ( bin_inv_out_if.rdy )
  );
 end
 // 256 bit Karatsuba_ofman multiplier
 // Multiplies mult_dat_a by mult_dat_b, product goes to mult_out_if.dat.
 begin: KARATSUBA_OFMAN_MULT
  localparam KARATSUBA_LEVEL = 2;
  // Shift register that delays mult_dat_val alongside the multiplier
  logic [KARATSUBA_LEVEL-1:0] val;
  // NOTE(review): only clock and data ports are connected here; any valid /
  // control / ready ports the multiplier has are left unconnected - confirm
  // this is intended.
  karatsuba_ofman_mult # (
    .BITS  ( 256             ),
    .LEVEL ( KARATSUBA_LEVEL )
  )
  karatsuba_ofman_mult (
    .i_clk  ( i_clk           ),
    .i_dat_a( mult_dat_a      ),
    .i_dat_b( mult_dat_b      ),  
    .o_dat  ( mult_out_if.dat )
  );
  // NOTE(review): mult_out_if.val is only ever assigned in reset and val is
  // never reset, so the delayed valid in val never reaches mult_out_if.val.
  // Looks unfinished - confirm.
  always_ff @ (posedge i_clk) begin
    if (i_rst) begin
      mult_out_if.val <= 0;
    end else begin
      val <= {val, mult_dat_val};
    end
  end
 end
 // Modulo p reducer (shared with arbitrator)
 // Modulo n reducer (output from karatsuba multiplier)
 // 256 bit Karatsuba_ofman multiplier (shared with arbitrator)
 // Point double module or Point multiply module
 endmodule
--- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
+++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
@ -24,7 +24,7 @@ package zcash_fpga_pkg;
  import equihash_pkg::N;
  import equihash_pkg::K;
-  parameter FPGA_VERSION = 32'h0;
+  parameter FPGA_VERSION = 32'h01_00_00;  //v1.0.0
  localparam [63:0] FPGA_CMD_CAP = {{62'd0},
                                    (equihash_pkg::N == 144 && equihash_pkg::K == 5),       // N = 144, K = 5 for VERIFY_EQUIHASH command
                                    (equihash_pkg::N == 200 && equihash_pkg::K == 9)};      // N = 200, K = 9 for VERIFY_EQUIHASH command
@ -36,6 +36,7 @@ package zcash_fpga_pkg;
    RESET_FPGA            = 'h0000_00_00,
    FPGA_STATUS           = 'h0000_00_01,
    VERIFY_EQUIHASH       = 'h0000_01_00,
    VERIFY_SECP256K1_SIG  = 'h0000_01_01,
    // Replies from the FPGA
    RESET_FPGA_RPL      = 'h80_00_00_00,
--- a/zcash_fpga/src/tb/secp256k1_point_dbl_tb.sv
+++ b/zcash_fpga/src/tb/secp256k1_point_dbl_tb.sv
@ -0,0 +1,165 @@
 /*
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 `timescale 1ps/1ps
 // Testbench for secp256k1_point_dbl.
 // Streams a single point into the DUT, reads the result back from the output
 // interface and compares it against the dbl_jb_point() model from
 // secp256k1_pkg. The DUT's multiplier and modulo ports are served by
 // dedicated secp256k1_mult_mod and secp256k1_mod instances.
 module secp256k1_point_dbl_tb ();
 import common_pkg::*;
 import secp256k1_pkg::*;
 // Delay between clock toggles, so one full clock cycle is 2*CLK_PERIOD
 localparam CLK_PERIOD = 100;
 logic clk, rst;
 if_axi_stream #(.DAT_BYTS(256*3/8)) in_if(clk); // Point is X, Y, Z
 if_axi_stream #(.DAT_BYTS(256*3/8)) out_if(clk);
 if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mult_in_if(clk);
 if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mult_out_if(clk);
 if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mod_in_if(clk);
 if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mod_out_if(clk);
 jb_point_t in_p, out_p;
 // Convert between the flat stream data and the point type used by the DUT
 always_comb begin
  in_p = in_if.dat;
  out_if.dat = out_p;
 end
 // Reset is low for 20*CLK_PERIOD, high for 20*CLK_PERIOD, then released
 initial begin
  rst = 0;
  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
 end
 initial begin
  clk = 0;
  forever #CLK_PERIOD clk = ~clk;
 end
 // The DUT output is a single beat, so frame it as a complete packet
 always_comb begin
  out_if.sop = 1;
  out_if.eop = 1;
  out_if.ctl = 0;
  out_if.mod = 0;
 end
 // Check for errors
 // $error takes no finish-number argument (only $fatal does), so the
 // message is passed directly.
 always_ff @ (posedge clk)
  if (out_if.val && out_if.err)
    $error("%m %t ERROR: output .err asserted", $time);
 secp256k1_point_dbl secp256k1_point_dbl(
  .i_clk ( clk ),
  .i_rst ( rst ),
    // Input point
  .i_p   ( in_p      ),
  .i_val ( in_if.val ),
  .o_rdy ( in_if.rdy ),
    // Output point
  .o_p   ( out_p     ),
  .o_err ( out_if.err ),
  .i_rdy ( out_if.rdy ),
  .o_val  ( out_if.val ) ,
    // Interfaces to the external multiplier and modulo units
  .o_mult_if ( mult_in_if ),
  .i_mult_if ( mult_out_if ),
  .o_mod_if ( mod_in_if ),
  .i_mod_if ( mod_out_if )
 );
 // Attach a mod reduction unit and multiply - mod unit
 // In full design these could use dedicated multipliers or be arbitrated
 secp256k1_mult_mod #(
  .CTL_BITS ( 8 )
 )
 secp256k1_mult_mod (
  .i_clk ( clk ),
  .i_rst ( rst ),
  .i_dat_a ( mult_in_if.dat[0 +: 256] ),
  .i_dat_b ( mult_in_if.dat[256 +: 256] ),
  .i_val ( mult_in_if.val ),
  .i_err ( mult_in_if.err ),
  .i_ctl ( mult_in_if.ctl ),
  .o_rdy ( mult_in_if.rdy ),
  .o_dat ( mult_out_if.dat ),
  .i_rdy ( mult_out_if.rdy ),
  .o_val ( mult_out_if.val ),
  .o_ctl ( mult_out_if.ctl ),
  .o_err ( mult_out_if.err )
 );
 secp256k1_mod #(
  .USE_MULT ( 0 ),
  .CTL_BITS ( 8 )
 )
 secp256k1_mod (
  .i_clk( clk       ),
  .i_rst( rst       ),
  .i_dat( mod_in_if.dat  ),
  .i_val( mod_in_if.val  ),
  .i_err( mod_in_if.err  ),
  .i_ctl( mod_in_if.ctl  ),
  .o_rdy( mod_in_if.rdy  ),
  .o_dat( mod_out_if.dat ),
  .o_ctl( mod_out_if.ctl ),
  .o_err( mod_out_if.err ),
  .i_rdy( mod_out_if.rdy ),
  .o_val( mod_out_if.val )
 );
 // Doubles the fixed point (x=2, y=3, z=1) in the DUT and checks the result
 // against dbl_jb_point(); ends the simulation with $fatal on a mismatch.
 task test_0();
 begin
  integer signed get_len;
  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
  jb_point_t p_in, p_exp, p_out;
  $display("Running test_0...");
  // Assignment pattern ('{...}) is required for named struct members;
  // a plain {...} is a concatenation and does not accept member keys.
  p_in = '{z:1, x:2, y:3};
  p_exp = dbl_jb_point(p_in);
  // Drive the input and collect the output concurrently
  fork
    in_if.put_stream(p_in, 256*3/8);
    out_if.get_stream(get_dat, get_len);
  join
  // get_dat is wider than a point; the assignment keeps the low bits
  p_out = get_dat;
  if (p_exp != p_out) begin
    $display("Expected:");
    print_jb_point(p_exp);
    $display("Was:");
    print_jb_point(p_out);
    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
  end
  $display("test_0 PASSED");
 end
 endtask
 // Placeholder, not implemented and not called
 function compare_point();
 endfunction
 initial begin
  out_if.rdy = 0;
  in_if.val = 0;
  // Wait for the reset sequence above to complete before starting
  #(40*CLK_PERIOD);
  test_0();
  #1us $finish();
 end
 endmodule
--- a/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv
+++ b/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv
@ -0,0 +1,119 @@
 /*
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 `timescale 1ps/1ps
 // Testbench for secp256k1_point_mult.
 // Streams a single point and a scalar k into the DUT and waits for the
 // result on the output interface.
 // TODO: the result is not yet compared against a reference model, so
 // test_0 only proves that the DUT produces an output.
 module secp256k1_point_mult_tb ();
 import common_pkg::*;
 import secp256k1_pkg::*;
 // Delay between clock toggles, so one full clock cycle is 2*CLK_PERIOD
 localparam CLK_PERIOD = 100;
 logic clk, rst;
 if_axi_stream #(.DAT_BYTS(256*3/8)) in_if(clk);
 if_axi_stream #(.DAT_BYTS(256*3/8)) out_if(clk);
 jb_point_t in_p, out_p;
 logic [255:0] k;
 // Convert between the flat stream data and the point type used by the DUT
 always_comb begin
  in_p = in_if.dat;
  out_if.dat = out_p;
 end
 // Reset is low for 20*CLK_PERIOD, high for 20*CLK_PERIOD, then released
 initial begin
  rst = 0;
  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
 end
 initial begin
  clk = 0;
  forever #CLK_PERIOD clk = ~clk;
 end
 // The DUT output is a single beat, so frame it as a complete packet
 always_comb begin
  out_if.sop = 1;
  out_if.eop = 1;
  out_if.ctl = 0;
  out_if.mod = 0;
 end
 // Check for errors
 // This process only reports. out_if.rdy is written by the initial block
 // below (and handed to out_if.get_stream), and a variable assigned in
 // always_ff must not be written by any other process, so it is not
 // touched here. $error takes no finish-number argument (only $fatal does).
 always_ff @ (posedge clk)
  if (out_if.val && out_if.err) begin
    $error("%m %t ERROR: output .err asserted", $time);
  end
 secp256k1_point_mult secp256k1_point_mult (
  .i_clk ( clk ),
  .i_rst ( rst ),
  .i_p   ( in_p       ),
  .i_k   ( k          ),
  .i_val ( in_if.val  ),
  .o_rdy ( in_if.rdy  ),
  .o_p   ( out_p      ),
  .i_rdy ( out_if.rdy ),
  .o_val ( out_if.val ),
  .o_err ( out_if.err )
 );
 // Multiplies the fixed point (x=2, y=3, z=1) by k=100 in the DUT and waits
 // for the output. The comparison against an expected point is disabled.
 task test_0();
 begin
  integer signed get_len;
  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
  jb_point_t p_in, p_exp, p_out;
  $display("Running test_0...");
  // Assignment pattern ('{...}) is required for named struct members;
  // a plain {...} is a concatenation and does not accept member keys.
  p_in = '{z:1, x:2, y:3};
  k = 100;
  //p_exp = dbl_jb_point(p_in);
  // Drive the input and collect the output concurrently
  fork
    in_if.put_stream(p_in, 256*3/8);
    out_if.get_stream(get_dat, get_len);
  join
  // TODO: re-enable once a reference point multiplication is available to
  // compute p_exp; until then the PASSED message below is unconditional.
  /*p_out = get_dat;
  if (p_exp != p_out) begin
    $display("Expected:");
    print_jb_point(p_exp);
    $display("Was:");
    print_jb_point(p_out);
    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
  end */
  $display("test_0 PASSED");
 end
 endtask
 // Placeholder, not implemented and not called
 function compare_point();
 endfunction
 initial begin
  out_if.rdy = 0;
  in_if.val = 0;
  // Wait for the reset sequence above to complete before starting
  #(40*CLK_PERIOD);
  test_0();
  #1us $finish();
 end
 endmodule