New files for point multiplication

2019-03-20 23:16:13 -04:00 · 2019-03-20 23:16:13 -04:00 · ee603cbf0e
parent 54d09f1744
commit ee603cbf0e
15 changed files with 1165 additions and 104 deletions
--- a/ip_cores/common/src/rtl/common_if.sv
+++ b/ip_cores/common/src/rtl/common_if.sv
@ -97,7 +97,9 @@ interface if_axi_stream # (
  endtask
    
  // Task used in simulation to drive data on a source interface
-  task automatic put_stream(input logic [common_pkg::MAX_SIM_BYTS*8-1:0] data, input integer signed len);
+  task automatic put_stream(input logic [common_pkg::MAX_SIM_BYTS*8-1:0] data,
+                            input integer signed len,
+                            input logic [CTL_BITS-1:0] ctl_in = 0);
    logic sop_l=0;
    
    reset_source();
@ -105,6 +107,7 @@ interface if_axi_stream # (
    
    while (len > 0) begin
      sop = ~sop_l;
+      ctl = ctl_in;
      eop = len - DAT_BYTS <= 0;
      val = 1;
      dat = data;
@ -154,18 +157,29 @@ interface if_axi_mm # (
  input i_clk
 );
  
-  logic [A_BITS-1:0] raddr;
-  logic [A_BITS-1:0] waddr;
-  logic [D_BITS-1:0] rdat;
-  logic [D_BITS-1:0] wdat;
-  logic              rval;
-  logic              wval;
-  logic              rrdy;
-  logic              wrdy;
+  logic [A_BITS-1:0] addr;
+  logic [D_BITS-1:0] rd_dat;
+  logic [D_BITS-1:0] wr_dat;
+  logic              wr;
+  logic              rd;
+  logic              rd_dat_val;
+  logic              wait_rq;
  
-  modport sink (input raddr, waddr, wdat, wval, rrdy, i_clk, output rdat, rval, wrdy);
-  modport source (input rdat, rval, wrdy , i_clk, output raddr, waddr, wdat, wval, rrdy);
+  modport sink (input addr, wr_dat, wr, rd, i_clk, output rd_dat, rd_dat_val, wait_rq, import task reset_sink());
+  modport source (input rd_dat, rd_dat_val, wait_rq , i_clk, output addr, wr_dat, wr, rd, import task reset_source());
 
+  task reset_source();
+    addr <= 0;
+    wr_dat <= 0;
+    wr <= 0;
+    rd <= 0;
+  endtask
+  
+  task reset_sink();
+    rd_dat <= 0;
+    rd_dat_val <= 0;
+    wait_rq <= 0;
+  endtask
    
 endinterface

--- a/ip_cores/util/src/rtl/accum_mult.sv
+++ b/ip_cores/util/src/rtl/accum_mult.sv
@ -2,6 +2,9 @@
  Accumulating multiplier. Inputs can be of different bit size and the
  level each is accumulated over can be different.
  
+  If using Xilinx FPGA it is best to have a 1.5:1 ratio on BITS,
+  since the multiplier is 27x18
+  
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation

  This program is free software: you can redistribute it and/or modify
--- a/ip_cores/util/src/rtl/barret_mod.sv
+++ b/ip_cores/util/src/rtl/barret_mod.sv
@ -120,12 +120,13 @@ always_ff @ (posedge i_clk) begin
  end
 end

-//To do the multiplications
+// Do the multiplications
 generate 
  if (MULTIPLIER == "ACCUM_MULT") begin: MULTIPLIER_GEN
    accum_mult # (
      .BITS_A  ( OUT_BITS +8 ),
-      .LEVEL_A ( 4           ) // 32 bit multiply
+      .LEVEL_A ( 6           ), // 32 bit multiply
+      .LEVEL_B ( 4           )
    ) 
    accum_mult (
      .i_clk ( i_clk ),
--- a/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv
+++ b/ip_cores/util/src/rtl/karatsuba_ofman_mult.sv
@ -2,7 +2,7 @@
  Multiplication using Karatsuba-Ofman algorithm.
  
  Multiple of these can be instantiated, each one takes 2 clocks cycles
-  per level.
+  per level. Fully pipelined so can accept a new input every clock.
  
  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation

@ -22,11 +22,18 @@

 module karatsuba_ofman_mult # (
  parameter BITS = 256,
+  parameter CTL_BITS = 8,
  parameter LEVEL = 1
 ) (
  input                       i_clk,
  input [BITS-1:0]            i_dat_a,
  input [BITS-1:0]            i_dat_b,
+  input                       i_val,
+  input [CTL_BITS-1:0]        i_ctl,
+  input                       i_rdy,
+  output logic                o_rdy,
+  output logic                o_val,
+  output logic [CTL_BITS-1:0] o_ctl,
  output logic [BITS*2-1:0]   o_dat
 );

@ -36,6 +43,8 @@ logic [BITS-1:0] m0, m1, m2;
 logic [BITS*2-1:0] q;
 logic [HBITS-1:0] a0, a1;
 logic sign, sign_;
+logic val;
+logic [CTL_BITS-1:0] ctl;

 generate
  always_comb begin
@ -52,8 +61,12 @@ generate
      m2 = i_dat_a[0 +: HBITS] * i_dat_b[0 +: HBITS];    
      m1 = (a0 * a1);
      sign = sign_;
+      o_rdy = i_rdy;
+      val = i_val;
+      ctl = i_ctl;
    end
    
+
  end else begin 
    // pipeline the other non-mult values x clock cycles and add them after multipliers
    logic [LEVEL-2:0] sign_r;
@ -68,34 +81,55 @@ generate
    
    karatsuba_ofman_mult # (
      .BITS     ( HBITS    ),
+      .CTL_BITS ( CTL_BITS ),
      .LEVEL    ( LEVEL-1  )
    )
    karatsuba_ofman_mult_m0 (
      .i_clk   ( i_clk                   ),
      .i_dat_a ( i_dat_a[HBITS +: HBITS] ),
      .i_dat_b ( i_dat_b[HBITS +: HBITS] ),
+      .i_val   ( i_val                   ),
+      .o_val   ( val                     ),
+      .i_ctl   ( i_ctl                   ),
+      .o_ctl   ( ctl                     ),
+      .i_rdy   ( i_rdy                   ),
+      .o_rdy   ( o_rdy                   ),
      .o_dat   ( m0                      )
    );
    
    karatsuba_ofman_mult # (
      .BITS     ( HBITS   ),
+      .CTL_BITS ( 1       ),
      .LEVEL    ( LEVEL-1 )
    )
    karatsuba_ofman_mult_m2 (
      .i_clk   ( i_clk               ),
      .i_dat_a ( i_dat_a[0 +: HBITS] ),
      .i_dat_b ( i_dat_b[0 +: HBITS] ),
+      .i_val   ( i_val               ),
+      .o_val   (),
+      .i_ctl   ( 1'd0                ),
+      .o_ctl   (),
+      .i_rdy   ( i_rdy               ),
+      .o_rdy   (),      
      .o_dat   ( m2                  )
    );
    
    karatsuba_ofman_mult # (
      .BITS     ( HBITS   ),
+      .CTL_BITS ( 1       ),
      .LEVEL    ( LEVEL-1 )
    )
    karatsuba_ofman_mult_m1 (
      .i_clk   ( i_clk ),
      .i_dat_a ( a0    ),
      .i_dat_b ( a1    ),
+      .i_val   ( i_val ),
+      .o_val   (),
+      .i_ctl   ( 1'd0  ),
+      .o_ctl   (),
+      .i_rdy   ( i_rdy ),
+      .o_rdy   (),            
      .o_dat   ( m1    )
    );
    
@ -105,6 +139,8 @@ endgenerate

 always_ff @ (posedge i_clk) begin
  o_dat <= q;
+  o_val <= val;
+  o_ctl <= ctl;
 end

 endmodule
--- a/ip_cores/util/src/tb/accum_mult_tb.sv
+++ b/ip_cores/util/src/tb/accum_mult_tb.sv
@ -23,8 +23,8 @@ localparam CLK_PERIOD = 100;

 logic clk, rst;

-if_axi_stream #(.DAT_BYTS(512/8)) in_if(clk);
-if_axi_stream #(.DAT_BYTS(512/8)) out_if(clk);
+if_axi_stream #(.DAT_BYTS(66)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(66)) out_if(clk);

 initial begin
  rst = 0;
@ -50,15 +50,15 @@ always_ff @ (posedge clk)
    $error(1, "%m %t ERROR: output .err asserted", $time);

 accum_mult # (
-  .BITS_A  ( 256 ),
+  .BITS_A  ( 264 ),
  .LEVEL_A ( 4   ),
-  .LEVEL_B ( 4   )
+  .LEVEL_B ( 6   )
 ) 
 accum_mult (
  .i_clk ( clk ),
  .i_rst ( rst ),
-  .i_dat_a ( in_if.dat[0 +: 256]   ),
-  .i_dat_b ( in_if.dat[256 +: 256] ),
+  .i_dat_a ( in_if.dat[0 +: 264]   ),
+  .i_dat_b ( in_if.dat[264 +: 264] ),
  .i_val   ( in_if.val             ),
  .o_rdy ( in_if.rdy ),
  .o_dat ( out_if.dat ),
@ -72,20 +72,20 @@ task test_loop();
 begin
  integer signed get_len;
  logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected,  get_dat;
-  logic [255:0] in_a, in_b;
+  logic [263:0] in_a, in_b;
  integer i, max;
-  
+  get_dat = 0;
  $display("Running test_loop...");
  i = 0;
  max = 10000;
  
  while (i < max) begin
-    in_a = random_vector(256/8);
-    in_b = random_vector(256/8);
+    in_a = random_vector(264/8);
+    in_b = random_vector(264/8);
    expected = (in_a * in_b);
    
    fork
-      in_if.put_stream({in_b, in_a}, 512/8);
+      in_if.put_stream({in_b, in_a}, 528/8);
      out_if.get_stream(get_dat, get_len);
    join
    
--- a/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv
+++ b/ip_cores/util/src/tb/karatsuba_ofman_mult_tb.sv
@ -24,10 +24,9 @@ localparam CLK_PERIOD = 100;

 logic clk, rst;

-if_axi_stream #(.DAT_BYTS(512/8)) in_if(clk);
-if_axi_stream #(.DAT_BYTS(512/8)) out_if(clk);
+if_axi_stream #(.DAT_BYTS(512/8), .CTL_BITS(8)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(512/8), .CTL_BITS(8)) out_if(clk);

-logic [511:0] test; 
 initial begin
  rst = 0;
  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
@ -50,33 +49,26 @@ always_ff @ (posedge clk)
  if (out_if.val && out_if.err)
    $error(1, "%m %t ERROR: output .err asserted", $time);

-localparam LEVEL = 3;
-logic [LEVEL-1:0] val;
-
+localparam LEVEL = 2;
 karatsuba_ofman_mult # (
  .BITS     ( 256   ),
+  .CTL_BITS ( 8     ),
  .LEVEL    ( LEVEL )
 )
 karatsuba_ofman_mult (
  .i_clk  ( clk                   ),
  .i_dat_a( in_if.dat[0 +: 256]   ),
  .i_dat_b( in_if.dat[256 +: 256] ),
+  .i_val  ( in_if.val  ),
+  .o_val  ( out_if.val ),
+  .i_ctl  ( in_if.ctl  ),
+  .o_ctl  ( out_if.ctl ),
+  .i_rdy  ( out_if.rdy ),
+  .o_rdy  ( in_if.rdy  ),
+  
  .o_dat  ( out_if.dat )
 );

-always_ff @ (posedge clk) begin
-  if (rst) begin
-    val <= 0;
-  end else begin
-    val <= {val, in_if.val};
-  end  
-end
-
-always_comb begin
-  out_if.val = val[LEVEL-1];
-  in_if.rdy = out_if.rdy;
-end
-
 task test_loop();
 begin
  integer signed get_len;
@ -94,7 +86,7 @@ begin
    expected = (in_a * in_b);
    
    fork
-      in_if.put_stream({in_b, in_a}, 512/8);
+      in_if.put_stream({in_b, in_a}, 512/8, i);
      out_if.get_stream(get_dat, get_len);
    join
  
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_mod.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_mod.sv
@ -24,16 +24,19 @@
 */

 module secp256k1_mod #(
-  parameter USE_MULT = 0   // Set to 1 to use multiple operation (should infer DSP and use less LUTs)
+  parameter USE_MULT = 0,   // Set to 1 to use multiple operation (should infer DSP and use less LUTs)
+  parameter CTL_BITS = 8
 )(
  input i_clk, i_rst,
  // Input value
  input [256*2-1:0]    i_dat,
  input                i_val,
  input                i_err,
+  input [CTL_BITS-1:0] i_ctl,
  output logic         o_rdy,
  // output
  output logic [255:0]        o_dat,
+  output logic [CTL_BITS-1:0] o_ctl,
  input                       i_rdy,
  output logic                o_val,
  output logic                o_err // Will go high if after 1 reduction we are still >= p
@ -43,6 +46,7 @@ import secp256k1_pkg::*;
  
 logic [256*2-1:0] res0, res1;
 logic [1:0] val, err;
+logic [1:0][CTL_BITS-1:0] ctl;

 generate
  if (USE_MULT == 1) begin: GEN_MULT
@ -74,16 +78,20 @@ always_ff @ (posedge i_clk) begin
    val <= 0;
    err <= 0;
    o_val <= 0;
+    ctl <= 0;
    o_err <= 0;
  end else begin
    o_val <= 0;
    val <= val << 1;
+    ctl <= {ctl, i_ctl};
    err <= err << 1;
    val[0] <= i_val;
    err[0] <= i_err;
+  
    o_dat <= res1 >= p_eq ? res1 - p_eq : res1;
    o_err <= err[1] || (res1 >= 2*p_eq);
    o_val <= val[1];
+    o_ctl <= ctl[1];
  end
 end

--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_mult_mod.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_mult_mod.sv
@ -24,16 +24,20 @@
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

-module secp256k1_mult_mod (
+module secp256k1_mult_mod #(
+  parameter CTL_BITS = 8
+)(
  input i_clk, i_rst,
  // Input value
  input [255:0]        i_dat_a,
  input [255:0]        i_dat_b,
+  input [CTL_BITS-1:0] i_ctl,
  input                i_val,
  input                i_err,
  output logic         o_rdy,
  // output
  output logic [255:0]        o_dat,
+  output logic [CTL_BITS-1:0] o_ctl,
  input                       i_rdy,
  output logic                o_val,
  output logic                o_err 
@ -45,51 +49,55 @@ import common_pkg::*;
 localparam KARATSUBA_LEVEL = 2;
 if_axi_stream #(.DAT_BYTS(512/8)) int_if(i_clk);

-always_comb o_rdy = int_if.rdy;
-
-logic [KARATSUBA_LEVEL-1:0] val, err;
+logic [KARATSUBA_LEVEL-1:0] err;

 karatsuba_ofman_mult # (
  .BITS     ( 256             ),
-  .LEVEL ( KARATSUBA_LEVEL )
+  .LEVEL    ( KARATSUBA_LEVEL ),
+  .CTL_BITS ( CTL_BITS        )
 )
 karatsuba_ofman_mult (
  .i_clk  ( i_clk      ),
+  .i_ctl  ( i_ctl      ),
  .i_dat_a( i_dat_a    ),
  .i_dat_b( i_dat_b    ),
-  .o_dat  ( int_if.dat )
+  .i_val  ( i_val      ),
+  .o_rdy  ( o_rdy      ),
+  .o_dat  ( int_if.dat ),
+  .o_val  ( int_if.val ),
+  .i_rdy  ( int_if.rdy ),
+  .o_ctl  ( int_if.ctl )
 );
  
 always_ff @ (posedge i_clk) begin
  if (i_rst) begin
-    val <= 0;
    err <= 0;
  end else begin
-    val <= {val, i_val};
    err <= {err, i_err};
  end
 end

 always_comb begin
-  int_if.val = val[KARATSUBA_LEVEL-1];
  int_if.err = err[KARATSUBA_LEVEL-1];
  int_if.mod = 0;
  int_if.sop = 0;
  int_if.eop = 0;
-  int_if.ctl = 0;
 end

 secp256k1_mod #(
-  .USE_MULT ( 0 )
+  .USE_MULT ( 0        ),
+  .CTL_BITS ( CTL_BITS )
 )
 secp256k1_mod (
  .i_clk( i_clk       ),
  .i_rst( i_rst       ),
  .i_dat( int_if.dat  ),
  .i_val( int_if.val  ),
+  .i_ctl( int_if.ctl  ),
  .i_err( int_if.err  ),
  .o_rdy( int_if.rdy  ),
  .o_dat( o_dat ),
+  .o_ctl( o_ctl ),
  .o_err( o_err ),
  .i_rdy( i_rdy ),
  .o_val( o_val )
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_pkg.sv
@ -30,4 +30,51 @@ package secp256k1_pkg;
  
  parameter [255:0] p_eq =  (1 << 256) - (1 << 32) - (1 << 9) - (1 << 8) - (1 << 7) - (1 << 6) - (1 << 4) - 1;
  
+  // Use register map for debug, holds information on current operation
+  parameter REGISTER_SIZE = 64;
+  // The mapping to index
+  parameter CURR_CMD = 0;     // What command are we processing
+  parameter CURR_STATE = 1;   // What state are we in
+  // If it is processing a signature verification, these bits will be populated:
+  parameter SIG_VER_HASH = 8; // 256 bits
+  parameter SIG_VER_S = 12;   // 256 bits
+  parameter SIG_VER_R = 16;   // 256 bits
+  parameter SIG_VER_Q = 20;   // 512 bits
+  parameter SIG_VER_W = 28;   // 256 bits - Result of invert(s)
+  
+  // Expected to be in Jacobian coordinates
+  typedef struct packed {
+    logic [255:0] x, y, z;
+  } jb_point_t;
+  
+  typedef struct packed {
+    logic [5:0] padding;
+    logic X_INFINITY_POINT;
+    logic OUT_OF_RANGE_S;
+    logic OUT_OF_RANGE_R;
+  } secp256k1_ver_t; 
+  
+  function is_zero(jb_point_t p);
+    is_zero = (p.x == 0 && p.y == 0 && p.z == 1);
+  endfunction
+  
+  // Reference model (testbench use): doubles Jacobian point p on a curve with a = 0 and
+  // returns the result with x, y, z each reduced mod p_eq. Differences are biased by p_eq so they never wrap.
+  function jb_point_t dbl_jb_point(jb_point_t p);
+    logic [1023:0] A, B, C, D; // wide enough to hold products of reduced values before the next reduction
+    A = (p.y*p.y) % p_eq;
+    B = (4*p.x*A) % p_eq;
+    C = (8*A*A) % p_eq;
+    D = (3*p.x*p.x) % p_eq;
+    dbl_jb_point.x = (D*D + 2*p_eq - 2*B) % p_eq;                        // B < p_eq, so 2*p_eq - 2*B > 0
+    dbl_jb_point.y = (D*(B + p_eq - dbl_jb_point.x) + p_eq - C) % p_eq;  // x < p_eq and C < p_eq, so no wrap
+    dbl_jb_point.z = (2*p.y*p.z) % p_eq;
+  endfunction
+  
+  function print_jb_point(jb_point_t p);
+    $display("x:%h", p.x);
+    $display("y:%h", p.y);
+    $display("z:%h", p.z);
+  endfunction
+  
 endpackage
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_dbl.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_dbl.sv
@ -0,0 +1,258 @@
+/*
+  This performs point doubling.
+ 
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module secp256k1_point_dbl
+  import secp256k1_pkg::*;
+#(
+)(
+  input i_clk, i_rst,
+  // Input point
+  input jb_point_t i_p,
+  input logic   i_val,
+  output logic  o_rdy,
+  // Output point
+  output jb_point_t o_p,
+  input logic    i_rdy,
+  output logic   o_val,
+  output logic   o_err,
+  // Interface to 256bit multiplier (mod p): o_* sends operands out, i_* brings results back
+  if_axi_stream.source o_mult_if,
+  if_axi_stream.sink   i_mult_if, // result side: this module reads val/dat/ctl and drives rdy
+  // Interface to only mod reduction block
+  if_axi_stream.source o_mod_if,
+  if_axi_stream.sink   i_mod_if   // result side: this module reads val/dat/ctl and drives rdy
+);
+
+/*
+ * These are the equations that need to be computed, they are issued as variables
+ * become valid. We have a bitmask to track what equation results are valid which
+ * will trigger other equations. [] show what equations must be valid before this starts.
+ * 
+ * 0.    A = (i_p.y)^2 mod p
+ * 1.    B = (i_p.x)*A mod p [eq0]
+ * 2.    B = 4*B mod p [eq1]
+ * 3.    C = A^2 mod p [eq0]
+ * 4.    C = C*8 mod p [eq3]
+ * 5.    D = (i_p.x)^2 mod p
+ * 6.    D = 3*D mod p [eq5]
+ * 7.   (o_p.x) = D^2 mod p[eq6]
+ * 8.    E = 2*B mod p [eq2]
+ * 9.   (o_p.x) = o_p.x - E mod p [eq8, eq7]
+ * 10   (o_p.y) =  B - o_p.x mod p [eq9, eq2]
+ * 11.   (o_p.y) = D*(o_p.y) [eq10, eq6]
+ * 12.   (o_p.y) = (o_p.y) - C mod p [eq11]
+ * 13.   (o_p.z) = 2*(i_p.y) mod p
+ * 14.   (o_p.z) = o_p.z * i_p.z mod p [eq13]
+ */
+logic [14:0] eq_val, eq_wait;
+
+// Temporary variables
+logic [255:0] A, B, C, D, E;
+jb_point_t i_p_l;
+
+enum {IDLE, START, FINISHED} state;
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_val <= 0;
+    o_rdy <= 0;
+    o_p <= 0;
+    o_mult_if.reset_source();
+    o_mod_if.reset_source();
+    i_mult_if.rdy <= 0;
+    i_mod_if.rdy <= 0;
+    eq_val <= 0;
+    state <= IDLE;
+    eq_wait <= 0;
+    i_p_l <= 0;
+    o_err <= 0;
+    A <= 0;
+    B <= 0;
+    C <= 0;
+    D <= 0;
+    E <= 0;
+  end else begin
+    if (o_mult_if.rdy) 
+      o_mult_if.val <= 0;
+    if (o_mod_if.rdy)
+      o_mod_if.val <= 0;
+    case(state)
+      {IDLE}: begin
+        o_rdy <= 1;
+        eq_val <= 0;
+        eq_wait <= 0;
+        o_err <= 0;
+        i_mult_if.rdy <= 1;
+        i_p_l <= i_p;
+        A <= 0;
+        B <= 0;
+        C <= 0;
+        D <= 0;
+        E <= 0;
+        if (i_val && o_rdy) begin
+          state <= START;
+          o_rdy <= 0;
+          if (i_p.z == 0) begin
+            o_err <= 1;
+            state <= IDLE;
+          end
+        end
+      end
+      // Just a big if tree where we issue equations if the required inputs
+      // are valid
+      {START}: begin
+        i_mod_if.rdy <= 1;
+        i_mult_if.rdy <= 1;
+
+        // Check any results from modulo reduction block
+        if (i_mod_if.val && i_mod_if.rdy) begin
+          eq_val[i_mod_if.ctl] <= 1;
+          case(i_mod_if.ctl)
+            2: B <= i_mod_if.dat;
+            4: C <= i_mod_if.dat;
+            8: E <= i_mod_if.dat;
+            13: o_p.z <= i_mod_if.dat;
+            default: o_err <= 1;
+          endcase
+        end
+        
+        // Check any results from multiplier
+        if (i_mult_if.val && i_mult_if.rdy) begin
+          eq_val[i_mult_if.ctl] <= 1;
+          case(i_mult_if.ctl) inside
+            0: A <= i_mult_if.dat;
+            1: B <= i_mult_if.dat;
+            3: C <= i_mult_if.dat;
+            5: D <= i_mult_if.dat;
+            6: D <= i_mult_if.dat;
+            7: o_p.x <= i_mult_if.dat;
+            11: o_p.y <= i_mult_if.dat;
+            14: o_p.z <= i_mult_if.dat;
+            default: o_err <= 1;
+          endcase
+        end      
+         
+        
+        // Issue new multiplies
+        if (~eq_wait[0]) begin              //0.    A = (i_p.y)^2 mod p
+          multiply(0, i_p_l.y, i_p_l.y);
+        end else
+        if (eq_val[0] && ~eq_wait[1]) begin //1.    B = (i_p.x)*A mod p [eq0]
+          multiply(1, i_p_l.x, A);
+        end else
+        if (eq_val[0] && ~eq_wait[3]) begin //3.    C = A^2 mod p [eq0]
+          multiply(3, A, A);
+        end else
+        if (~eq_wait[5]) begin              //5.    D = (i_p.x)^2 mod p
+          multiply(5, i_p_l.x, i_p_l.x);
+        end else
+        if (eq_val[5] && ~eq_wait[6]) begin //6.    D = 3*D mod p [eq5]
+          multiply(6, 256'd3, D);
+        end else
+        if (eq_val[6] && ~eq_wait[7]) begin //7.   (o_p.x) = D^2 mod p[eq6]
+          multiply(7, D, D);
+        end else
+        if (eq_val[10] && eq_val[6] && ~eq_wait[11]) begin //11.   (o_p.y) = D*(o_p.y) [eq10, eq6]
+          multiply(11, D, o_p.y);
+        end else
+        if (eq_val[13] && ~eq_wait[14]) begin //14.   (o_p.z) = o_p.z * i_p.z mod p [eq13]
+          multiply(14, i_p_l.z, o_p.z);
+        end
+                
+        // Issue new modulo reductions
+        if (eq_val[1] && ~eq_wait[2]) begin //2.    B = 4*B mod p [eq1]
+          modulo(2, B*4);
+        end else
+        if (eq_val[3] && ~eq_wait[4]) begin //4.    C = C*8 mod p [eq3]
+          modulo(4, C*8);
+        end else
+        if (eq_val[2] && ~eq_wait[8]) begin //8.    E = 2*B mod p [eq2]
+          modulo(8, B*2);
+        end else
+        if (~eq_wait[13]) begin            //13.   (o_p.z) = 2*(i_p.y) mod p
+          modulo(13, 2*i_p_l.y);
+        end
+        
+        // Additions / subtractions we do in-module
+        if (eq_val[8] && eq_val[7] && ~eq_wait[9]) begin //9.   (o_p.x) = o_p.x - E mod p [eq8, eq7]
+          eq_wait[9] <= 1;
+          eq_val[9] <= 1;
+          o_p.x <= o_p.x + (E > o_p.x ? secp256k1_pkg::p : 0) - E;
+        end
+        
+        if (eq_val[9] && eq_val[2] && ~eq_wait[10]) begin //10.   (o_p.y) =  B - o_p.x mod p [eq9, eq2]
+          eq_wait[10] <= 1;
+          eq_val[10] <= 1;
+          o_p.y <= B + (o_p.x > B ? secp256k1_pkg::p : 0) - o_p.x;
+        end
+        
+        
+        if (eq_val[11] && ~eq_wait[12]) begin //12.   (o_p.y) = (o_p.y) - C mod p [eq11]
+          eq_wait[12] <= 1;
+          eq_val[12] <= 1;
+          o_p.y <= o_p.y + (C > o_p.y ? secp256k1_pkg::p : 0) - C;
+        end
+        
+        if (&eq_val) begin
+          state <= FINISHED;
+          o_val <= 1;
+        end
+      end
+      {FINISHED}: begin
+        if (o_val && i_rdy) begin
+          state <= IDLE;
+          o_val <= 0;
+          o_rdy <= 1;
+        end
+      end
+    endcase
+    
+    if (o_err) begin
+      o_val <= 1;
+      if (o_val && i_rdy) begin
+        o_err <= 0;
+        state <= IDLE;
+      end
+    end
+    
+  end
+end
+
+// Task for using multiplies: loads operands a, b onto o_mult_if, tagged with equation index ctl
+task multiply(input int unsigned ctl, input logic [255:0] a, b);
+  if (~o_mult_if.val || (o_mult_if.val && o_mult_if.rdy)) begin // only when nothing is pending, or the pending word is accepted
+    o_mult_if.val <= 1;
+    o_mult_if.dat[0 +: 256] <= a;   // operand a in the low 256 bits
+    o_mult_if.dat[256 +: 256] <= b; // operand b in the high 256 bits
+    o_mult_if.ctl <= ctl;
+    eq_wait[ctl] <= 1;              // record that this equation has been issued
+  end
+endtask
+
+// Task for using modulo: loads value a onto o_mod_if, tagged with equation index ctl
+task modulo(input int unsigned ctl, input logic [511:0] a); // 512 bits, the width of secp256k1_mod's i_dat
+  if (~o_mod_if.val || (o_mod_if.val && o_mod_if.rdy)) begin // only when nothing is pending, or the pending word is accepted
+    o_mod_if.val <= 1;
+    o_mod_if.dat <= a;
+    o_mod_if.ctl <= ctl;
+    eq_wait[ctl] <= 1; // record that this equation has been issued
+  end
+endtask
+
+
+endmodule
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_point_mult.sv
@ -0,0 +1,180 @@
+/*
+  This performs point multiplication. We use the standard double
+  and add algorithm.
+ 
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module secp256k1_point_mult
+  import secp256k1_pkg::*;
+#(
+)(
+  input i_clk, i_rst,
+  // Input point and value to multiply
+  input jb_point_t    i_p,
+  input logic [255:0] i_k,
+  input logic   i_val,
+  output logic  o_rdy,
+  // Output point
+  output jb_point_t o_p,
+  input logic    i_rdy,
+  output logic   o_val,
+  output logic   o_err
+);
+
+if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mult_in_if(i_clk);
+if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mult_out_if(i_clk);
+
+if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mod_in_if(i_clk);
+if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mod_out_if(i_clk);
+
+logic [255:0] k_l;
+jb_point_t p_n, p_q, p_dbl;
+logic p_dbl_in_val, p_dbl_in_rdy, p_dbl_out_err, p_dbl_out_val, p_dbl_out_rdy;
+
+enum {IDLE, DOUBLE, ADD, FINISHED} state;
+
+always_ff @ (posedge i_clk) begin // Control FSM: walks k_l bit by bit, requesting a point double each step
+  if (i_rst) begin
+    o_val <= 0;
+    o_err <= 0;
+    o_rdy <= 0;
+    k_l <= 0;
+    p_q <= 0;
+    p_dbl_in_val <= 0;
+    p_dbl_out_rdy <= 0;
+    state <= IDLE;
+    o_p <= 0;
+    p_n <= 0;
+  end else begin
+    p_dbl_in_val <= 0;   // default: a request is a single-cycle pulse unless set again below
+    p_dbl_out_rdy <= 1;  // always willing to take a result from the doubler
+    case (state)
+      {IDLE}: begin
+        o_rdy <= 1;
+        o_err <= 0;
+        p_q <= '{x:0, y:0, z:1};  // p_q starts at 0 (assignment pattern needs the leading apostrophe)
+        if (o_rdy && i_val) begin
+          k_l <= i_k;
+          p_n <= i_p;
+          // When bit 0 of i_k is set the first add is folded in here by seeding p_q with i_p
+          if (i_k[0]) begin
+            p_q <= i_p;
+          end
+          state <= DOUBLE;
+          p_dbl_in_val <= 1;
+        end
+      end
+      {DOUBLE}: begin
+        if(p_dbl_in_val && p_dbl_in_rdy) begin
+          p_dbl_in_val <= 0;
+        end
+        if (p_dbl_out_val && p_dbl_out_rdy) begin
+          p_n <= p_dbl;
+          k_l <= k_l >> 1;  // consume one bit of the scalar
+          if (k_l[1] == 1) begin  // k_l[1] becomes bit 0 after the shift above
+            state <= ADD;
+          end else if (k_l[255:1] == 0) begin  // no bits left above bit 0
+            state <= FINISHED;
+            o_p <= p_dbl;
+            o_val <= 1;
+          end else begin
+            state <= DOUBLE;
+            p_dbl_in_val <= 1;
+          end        
+        end
+      end
+      {ADD}: begin  // NOTE(review): only copies p_n into p_q; no point addition is performed in this block
+        state <= DOUBLE;
+        p_q <= p_n;
+        p_dbl_in_val <= 1;
+      end
+      {FINISHED}: begin
+        if (i_rdy && o_val) begin
+          o_val <= 0;
+          state <= IDLE;
+        end
+      end      
+    endcase
+    
+    if (p_dbl_out_err) begin  // placed after the case so it overrides any state update made above
+      o_err <= 1;
+      o_val <= 1;
+      state <= FINISHED;
+    end  
+    
+  end  
+end
+
+secp256k1_point_dbl secp256k1_point_dbl(
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  // Input point
+  .i_p   ( p_n           ),
+  .i_val ( p_dbl_in_val  ),
+  .o_rdy ( p_dbl_in_rdy  ),
+  // Output point
+  .o_p   ( p_dbl         ),
+  .o_err ( p_dbl_out_err ),
+  .i_rdy ( p_dbl_out_rdy ),
+  .o_val ( p_dbl_out_val ),
+  // Interfaces to shared multipliers / modulo blocks
+  .o_mult_if ( mult_in_if  ),
+  .i_mult_if ( mult_out_if ),
+  .o_mod_if  ( mod_in_if   ),
+  .i_mod_if  ( mod_out_if  )
+);
+
+
+secp256k1_mult_mod #(
+  .CTL_BITS ( 8 )
+)
+secp256k1_mult_mod (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_dat_a ( mult_in_if.dat[0 +: 256] ),
+  .i_dat_b ( mult_in_if.dat[256 +: 256] ),
+  .i_val ( mult_in_if.val ),
+  .i_err ( mult_in_if.err ),
+  .i_ctl ( mult_in_if.ctl ),
+  .o_rdy ( mult_in_if.rdy ),
+  .o_dat ( mult_out_if.dat ),
+  .i_rdy ( mult_out_if.rdy ),
+  .o_val ( mult_out_if.val ),
+  .o_ctl ( mult_out_if.ctl ),
+  .o_err ( mult_out_if.err ) 
+);
+
+secp256k1_mod #(
+  .USE_MULT ( 0 ),
+  .CTL_BITS ( 8 )
+)
+secp256k1_mod (
+  .i_clk( i_clk     ),
+  .i_rst( i_rst     ),
+  .i_dat( mod_in_if.dat  ),
+  .i_val( mod_in_if.val  ),
+  .i_err( mod_in_if.err  ),
+  .i_ctl( mod_in_if.ctl  ),
+  .o_rdy( mod_in_if.rdy  ),
+  .o_dat( mod_out_if.dat ),
+  .o_ctl( mod_out_if.ctl ),
+  .o_err( mod_out_if.err ),
+  .i_rdy( mod_out_if.rdy ),
+  .o_val( mod_out_if.val )
+);
+
+endmodule
--- a/zcash_fpga/src/rtl/secp256k1/secp256k1_top.sv
+++ b/zcash_fpga/src/rtl/secp256k1/secp256k1_top.sv
@ -1,12 +1,241 @@
 module secp256k1_top (
  input          i_clk,
  input          i_rst,
-  input                i_val,
-  output logic         o_rdy,
-  output logic         o_val
-  
-  
+  // Command interface
+  if_axi_stream.sink if_cmd_rx,
+  if_axi_stream.source if_cmd_tx,
+  // Memory map interface for debug
+  if_axi_mm.sink if_axi_mm        
 );

+import secp256k1_pkg::*;
+import zcash_fpga_pkg::*;
+
+// Register map is used for storing command data
+logic [REGISTER_SIZE/64-1:0][63:0] register_map;
+
+if_ram #(.RAM_WIDTH(64), .RAM_DEPTH(REGISTER_SIZE)) register_file_a (i_clk, i_rst);
+if_ram #(.RAM_WIDTH(64), .RAM_DEPTH(REGISTER_SIZE)) register_file_b (i_clk, i_rst);
+
+// 256 multiplier (karatsuba)
+logic [255:0] mult_dat_a, mult_dat_b;
+logic mult_dat_val;
+if_axi_stream #(.DAT_BYTS(512/8)) mult_out_if(i_clk);
+
+// 256 bit inverse calculation
+if_axi_stream #(.DAT_BYTS(256/8)) bin_inv_in_if(i_clk);
+if_axi_stream #(.DAT_BYTS(256/8)) bin_inv_out_if(i_clk);
+
+// TODO just have one multiplier (unless double & add is parallel)
+// one multiplier that barret reduction can share?
+  
+// Can avoid final inversion converting from projected coord by some check in c++ code
+
+// Controlling state machine
+typedef enum {IDLE,
+              VERIFY_SECP256K1_SIG_PARSE,           // Parse inputs
+              CALC_S_INV,
+              POINT_DBL,
+              POINT_ADD,
+              IGNORE,
+              FINISHED} secp256k1_state_t;
+
+secp256k1_state_t secp256k1_state;
+header_t header, header_l;
+secp256k1_ver_t secp256k1_ver;
+// Other temporary values
+logic [255:0] r, w;
+
+logic [5:0] cnt; // Counter for parsing command inputs
+logic if_axi_mm_rd;
+
+always_comb begin
+  header = if_cmd_rx.dat;
+end
+
+always_ff @ (posedge i_clk) begin // Command parser: stores signature fields in the register file and starts the inverse
+  if (i_rst) begin
+    secp256k1_state <= IDLE;
+    if_cmd_tx.reset_source();
+    if_cmd_rx.reset_sink();
+    cnt <= 0;
+    mult_out_if.rdy <= 0;
+    register_file_a.reset_source();
+    mult_dat_a <= 0;
+    mult_dat_b <= 0;
+    mult_dat_val <= 0;
+    w <= 0;
+    r <= 0;
+    bin_inv_in_if.reset_source();
+    bin_inv_out_if.rdy <= 0;
+    secp256k1_ver <= 0;
+  end else begin
+  
+    register_file_a.en <= 1;
+    register_file_a.wr <= 0;  // default: no write unless a word is accepted below
+    register_file_a.rd <= 1;
+    mult_out_if.rdy <= 1;
+    bin_inv_out_if.rdy <= 1;
+    mult_dat_val <= 0;
+    
+    case(secp256k1_state)
+      {IDLE}: begin
+        secp256k1_ver <= 0;
+        if_cmd_rx.rdy <= 1;
+        header_l <= header;
+        cnt <= 0;
+        if (if_cmd_rx.val && if_cmd_rx.rdy) begin
+          case(header.cmd)
+            {VERIFY_SECP256K1_SIG}: begin
+              register_map[CURR_CMD] <= header;
+              secp256k1_state <= VERIFY_SECP256K1_SIG_PARSE;
+            end
+            default: begin
+              if (~if_cmd_rx.eop) begin  // unknown command spanning several words: drain it
+                if_cmd_rx.rdy <= 1;
+                secp256k1_state <= IGNORE;
+              end
+            end
+          endcase
+        end
+      end
+      {VERIFY_SECP256K1_SIG_PARSE}: begin
+        if_cmd_rx.rdy <= 1;
+        if (if_cmd_rx.val && if_cmd_rx.rdy) begin
+          register_file_a.wr <= 1;
+          cnt <= cnt + 1;
+          if (cnt == 19) secp256k1_state <= CALC_S_INV;  // 20 words: S, R, hash (4 each) and Q (8)
+        end
+        
+        if (bin_inv_in_if.val && bin_inv_in_if.rdy)
+          bin_inv_in_if.val <= 0;
+        
+        case(cnt) inside
+          [0:3]: begin
+            register_file_a.a <= SIG_VER_S + (cnt % 4);
+            register_file_a.d <= if_cmd_rx.dat;
+            // Can start calculating the inverse here
+            bin_inv_in_if.dat[(cnt % 4)*64 +: 64] <= if_cmd_rx.dat;
+            if (cnt == 3) begin
+              bin_inv_in_if.val <= 1;
+            end
+            end
+          [4:7]: begin
+            // We can load R into the karatsuba multiplier
+            register_file_a.a <= SIG_VER_R + (cnt % 4);
+            register_file_a.d <= if_cmd_rx.dat;
+            mult_dat_a[(cnt % 4)*64 +: 64] <= if_cmd_rx.dat;
+          end
+          [8:11]: begin
+            register_file_a.a <= SIG_VER_HASH + (cnt % 4);
+            register_file_a.d <= if_cmd_rx.dat;
+          end
+          [12:19]: begin
+            register_file_a.a <= SIG_VER_Q + (cnt - 12);  // offset 0..7 in arrival order (cnt % 8 would start at 4)
+            register_file_a.d <= if_cmd_rx.dat;
+          end
+        endcase
+      end
+      {CALC_S_INV}: begin
+        // Wait until bin_inv_out_if.val
+        if (bin_inv_in_if.dat >= secp256k1_pkg::n) secp256k1_ver.OUT_OF_RANGE_S <= 1;
+        if (mult_dat_a >= secp256k1_pkg::n) secp256k1_ver.OUT_OF_RANGE_R <= 1;
+        if (bin_inv_out_if.val && bin_inv_out_if.rdy) begin
+          w <= bin_inv_out_if.dat;
+          // TODO also write this to RAM
+          // need to do 2 multiplications % n to get u1 and u2
+        end
+      end
+      {IGNORE}: begin
+        if_cmd_rx.rdy <= 1;
+        if (if_cmd_rx.rdy && if_cmd_rx.val && if_cmd_rx.eop)
+          secp256k1_state <= IDLE;
+      end
+    endcase
+  end
+end
+
+// TODO could provide write access
+
+always_comb begin
+
+end
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    if_axi_mm.reset_sink();
+    register_file_b.reset_source();
+  end else begin
+    if_axi_mm.rd_dat_val <= 0;         
+    register_file_b.en <= 1;
+    register_file_b.rd <= 1;
+    register_file_b.a <= if_axi_mm.addr/8;
+    if_axi_mm_rd <= if_axi_mm.rd;
+    if (if_axi_mm_rd) begin
+      if_axi_mm.rd_dat_val <= 1;
+      if_axi_mm.rd_dat <= register_file_b.q;    
+    end
+  end
+end
+
+// BRAM for storing parsed inputs
+bram #(
+  .RAM_WIDTH       ( 64                 ),
+  .RAM_DEPTH       ( REGISTER_SIZE      ),
+  .RAM_PERFORMANCE ( "HIGH_PERFORMANCE" )
+) register_file (
+  .a ( register_file_a ),
+  .b ( register_file_b )
+);
+
+// Calculate binary inverse mod n
+begin: BINARY_INVERSE_MOD_N
+  bin_inv #(
+    .BITS ( 256              ),
+    .P    ( secp256k1_pkg::n )
+  ) bin_inv (  // module instances must be named; same instance == module name habit as the rest of the file
+    .i_clk ( i_clk ),
+    .i_rst ( i_rst) ,
+    .i_dat ( bin_inv_in_if.dat ),
+    .i_val ( bin_inv_in_if.val ),
+    .o_rdy ( bin_inv_in_if.rdy ),
+    .o_dat ( bin_inv_out_if.dat ),
+    .o_val ( bin_inv_out_if.val ),
+    .i_rdy ( bin_inv_out_if.rdy )
+  );
+end
+
+// 256 bit Karatsuba_ofman multiplier
+begin: KARATSUBA_OFMAN_MULT
+  localparam KARATSUBA_LEVEL = 2;
+  
+  // The multiplier registers i_val through to o_val along with the data, so the
+  // valid flag comes straight from the core instead of a local shift register.
+  karatsuba_ofman_mult # (
+    .BITS     ( 256             ),
+    .CTL_BITS ( 1               ),  // ctl is not used on this path
+    .LEVEL    ( KARATSUBA_LEVEL )
+  )
+  karatsuba_ofman_mult (
+    .i_clk  ( i_clk           ),
+    .i_dat_a( mult_dat_a      ),
+    .i_dat_b( mult_dat_b      ),
+    .i_val  ( mult_dat_val    ),
+    .o_val  ( mult_out_if.val ),
+    .i_ctl  ( 1'd0            ),
+    .o_ctl  (),
+    .i_rdy  ( mult_out_if.rdy ),
+    .o_rdy  (),
+    .o_dat  ( mult_out_if.dat )
+  );
+end
+
+// Modulo p reducer (shared with arbitrator)
+
+// Modulo n reducer (output from karatsuba multiplier)
+
+// 256 bit Karatsuba_ofman multiplier (shared with arbitrator)
+
+// Point double module or Point multiply module
+
  
 endmodule
--- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
+++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
@ -24,7 +24,7 @@ package zcash_fpga_pkg;
  import equihash_pkg::N;
  import equihash_pkg::K;
  
-  parameter FPGA_VERSION = 32'h0;
+  parameter FPGA_VERSION = 32'h01_00_00;  //v1.0.0
  localparam [63:0] FPGA_CMD_CAP = {{62'd0},
                                    (equihash_pkg::N == 144 && equihash_pkg::K == 5),       // N = 144, K = 5 for VERIFY_EQUIHASH command
                                    (equihash_pkg::N == 200 && equihash_pkg::K == 9)};      // N = 200, K = 9 for VERIFY_EQUIHASH command
@ -36,6 +36,7 @@ package zcash_fpga_pkg;
    RESET_FPGA            = 'h0000_00_00,
    FPGA_STATUS           = 'h0000_00_01,
    VERIFY_EQUIHASH       = 'h0000_01_00,
+    VERIFY_SECP256K1_SIG  = 'h0000_01_01,
    
    // Replies from the FPGA
    RESET_FPGA_RPL      = 'h80_00_00_00,
--- a/zcash_fpga/src/tb/secp256k1_point_dbl_tb.sv
+++ b/zcash_fpga/src/tb/secp256k1_point_dbl_tb.sv
@ -0,0 +1,165 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+// Testbench for secp256k1_point_dbl.
+// Streams a single point into the DUT, reads the result back and compares it
+// against the dbl_jb_point() reference function. The DUT's multiplier and
+// modular reduction interfaces are served by dedicated units instantiated here.
+module secp256k1_point_dbl_tb ();
+import common_pkg::*;
+import secp256k1_pkg::*;
+
+// Delay between clock toggles (the full clock cycle is 2*CLK_PERIOD)
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+if_axi_stream #(.DAT_BYTS(256*3/8)) in_if(clk); // Point is X, Y, Z
+if_axi_stream #(.DAT_BYTS(256*3/8)) out_if(clk);
+
+if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mult_in_if(clk);
+if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mult_out_if(clk);
+
+if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mod_in_if(clk);
+if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mod_out_if(clk);
+
+
+jb_point_t in_p, out_p;
+
+// Map the stream data buses to / from the point type used on the DUT ports
+always_comb begin
+  in_p = in_if.dat; 
+  out_if.dat = out_p;
+end
+
+// Reset: low at time 0, toggled twice at 20*CLK_PERIOD intervals (high, then low)
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+// Free running clock
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+// The DUT only provides dat/val/err, so tie off the remaining output stream
+// signals; every output beat is marked as both start and end of packet
+always_comb begin
+  out_if.sop = 1;
+  out_if.eop = 1;
+  out_if.ctl = 0;
+  out_if.mod = 0;
+end
+
+// Check for errors
+// ($error takes only display-style arguments; a finish number is a $fatal argument)
+always_ff @ (posedge clk)
+  if (out_if.val && out_if.err)
+    $error("%m %t ERROR: output .err asserted", $time);
+
+secp256k1_point_dbl secp256k1_point_dbl(
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+    // Input point
+  .i_p   ( in_p      ),
+  .i_val ( in_if.val ),
+  .o_rdy ( in_if.rdy ),
+    // Output point
+  .o_p   ( out_p     ),
+  .o_err ( out_if.err ),
+  .i_rdy ( out_if.rdy ),
+  .o_val  ( out_if.val ) ,
+    // Interfaces to the multiplier and modular reduction units below
+  .o_mult_if ( mult_in_if ),
+  .i_mult_if ( mult_out_if ),
+  .o_mod_if ( mod_in_if ),
+  .i_mod_if ( mod_out_if )
+);
+
+// Attach a mod reduction unit and multiply - mod unit
+// In full design these could use dedicated multipliers or be arbitrated
+secp256k1_mult_mod #(
+  .CTL_BITS ( 8 )
+)
+secp256k1_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  // The two 256 bit operands are packed into the single 512 bit data bus
+  .i_dat_a ( mult_in_if.dat[0 +: 256] ),
+  .i_dat_b ( mult_in_if.dat[256 +: 256] ),
+  .i_val ( mult_in_if.val ),
+  .i_err ( mult_in_if.err ),
+  .i_ctl ( mult_in_if.ctl ),
+  .o_rdy ( mult_in_if.rdy ),
+  .o_dat ( mult_out_if.dat ),
+  .i_rdy ( mult_out_if.rdy ),
+  .o_val ( mult_out_if.val ),
+  .o_ctl ( mult_out_if.ctl ),
+  .o_err ( mult_out_if.err ) 
+);
+
+secp256k1_mod #(
+  .USE_MULT ( 0 ),
+  .CTL_BITS ( 8 )
+)
+secp256k1_mod (
+  .i_clk( clk       ),
+  .i_rst( rst       ),
+  .i_dat( mod_in_if.dat  ),
+  .i_val( mod_in_if.val  ),
+  .i_err( mod_in_if.err  ),
+  .i_ctl( mod_in_if.ctl  ),
+  .o_rdy( mod_in_if.rdy  ),
+  .o_dat( mod_out_if.dat ),
+  .o_ctl( mod_out_if.ctl ),
+  .o_err( mod_out_if.err ),
+  .i_rdy( mod_out_if.rdy ),
+  .o_val( mod_out_if.val )
+);
+
+// Doubles the point (x=2, y=3, z=1) in the DUT and checks the result against
+// dbl_jb_point(). Ends the simulation with $fatal on a mismatch.
+task test_0();
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  jb_point_t p_in, p_exp, p_out;
+  $display("Running test_0...");
+  // Assignment pattern ('{...}) - a plain concatenation cannot name struct members
+  p_in = '{z:1, x:2, y:3};
+  p_exp = dbl_jb_point(p_in);
+  
+  // Drive the input and collect the output concurrently
+  fork
+    in_if.put_stream(p_in, 256*3/8);
+    out_if.get_stream(get_dat, get_len);
+  join
+  
+  p_out = get_dat;
+  
+  // Case inequality so that an X/Z in the DUT result is reported as a mismatch
+  // instead of silently evaluating the condition as false
+  if (p_exp !== p_out) begin
+    $display("Expected:");
+    print_jb_point(p_exp);
+    $display("Was:");
+    print_jb_point(p_out);
+    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
+  end 
+  
+  $display("test_0 PASSED");
+end
+endtask
+
+// Placeholder - not implemented and not called in this testbench
+function compare_point();
+  
+endfunction
+
+// Main test sequence: idle the stream handshakes, wait 40*CLK_PERIOD (reset
+// is released at that time), then run the test
+initial begin
+  out_if.rdy = 0;
+  in_if.val = 0;
+  #(40*CLK_PERIOD);
+  
+  test_0();
+
+  #1us $finish();
+end
+endmodule
--- a/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv
+++ b/zcash_fpga/src/tb/secp256k1_point_mult_tb.sv
@ -0,0 +1,119 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+// Testbench for secp256k1_point_mult.
+// Streams a single point and a scalar k into the DUT and waits for the result.
+// NOTE(review): the comparison against an expected point is still commented
+// out in test_0, so the test currently only checks that an output is produced.
+module secp256k1_point_mult_tb ();
+import common_pkg::*;
+import secp256k1_pkg::*;
+
+// Delay between clock toggles (the full clock cycle is 2*CLK_PERIOD)
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+if_axi_stream #(.DAT_BYTS(256*3/8)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(256*3/8)) out_if(clk);
+
+jb_point_t in_p, out_p;
+logic [255:0] k;  // Scalar applied to the DUT i_k port, set by the test task
+
+// Map the stream data buses to / from the point type
+always_comb begin
+  in_p = in_if.dat; 
+  out_if.dat = out_p;
+end
+
+// Reset: low at time 0, toggled twice at 20*CLK_PERIOD intervals (high, then low)
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+// Free running clock
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+// The DUT only provides dat/val/err, so tie off the remaining output stream
+// signals; every output beat is marked as both start and end of packet
+always_comb begin
+  out_if.sop = 1;
+  out_if.eop = 1;
+  out_if.ctl = 0;
+  out_if.mod = 0;
+end
+
+// Check for errors
+// Plain always (not always_ff): out_if.rdy is also written by the initial
+// block below, and a variable assigned in an always_ff procedure must not be
+// written by any other process.
+// ($error takes only display-style arguments; a finish number is a $fatal argument)
+always @ (posedge clk)
+  if (out_if.val && out_if.err) begin
+    out_if.rdy = 1;
+    $error("%m %t ERROR: output .err asserted", $time);
+  end
+
+
+secp256k1_point_mult secp256k1_point_mult (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_p   ( in_if.dat  ),
+  .i_k   ( k          ),
+  .i_val ( in_if.val  ),
+  .o_rdy ( in_if.rdy  ),
+  .o_p   ( out_p      ),
+  .i_rdy ( out_if.rdy ),
+  .o_val ( out_if.val ),
+  .o_err ( out_if.err )
+);
+
+// Sends the point (x=2, y=3, z=1) with k = 100 to the DUT and waits for the
+// output stream. The result is not checked yet (see the commented-out code).
+task test_0();
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  jb_point_t p_in, p_exp, p_out;
+  $display("Running test_0...");
+  // Assignment pattern ('{...}) - a plain concatenation cannot name struct members
+  p_in = '{z:1, x:2, y:3};
+  k = 100;
+  //p_exp = dbl_jb_point(p_in);
+  
+  // Drive the input and collect the output concurrently
+  fork
+    in_if.put_stream(p_in, 256*3/8);
+    out_if.get_stream(get_dat, get_len);
+  join
+  
+  /*p_out = get_dat;
+  
+  if (p_exp != p_out) begin
+    $display("Expected:");
+    print_jb_point(p_exp);
+    $display("Was:");
+    print_jb_point(p_out);
+    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
+  end */
+
+  // NOTE(review): printed unconditionally while the check above is disabled
+  $display("test_0 PASSED");
+end
+endtask
+
+// Placeholder - not implemented and not called in this testbench
+function compare_point();
+  
+endfunction
+
+// Main test sequence: idle the stream handshakes, wait 40*CLK_PERIOD (reset
+// is released at that time), then run the test
+initial begin
+  out_if.rdy = 0;
+  in_if.val = 0;
+  #(40*CLK_PERIOD);
+  
+  test_0();
+
+  #1us $finish();
+end
+endmodule