updates to structure for eliptical curve blocks

2019-05-22 14:53:06 +08:00 · 2019-05-22 14:53:06 +08:00 · 9d2c9340a3
parent 9145fd8533
commit 9d2c9340a3
6 changed files with 562 additions and 32 deletions
--- a/README.md
+++ b/README.md
@ -6,16 +6,16 @@ Repo for Zcash FPGA projects code and documents.

 These have been designed targetted for Xilinx boards (US+) and therefore contain Xilinx-specific IP.

-## zcash_verif
+## zcash_fpga_top

-This is the top level for the Zcash verification FPGA board. It targets both Xilinx Virtex UltraScale+ FPGA VCU118 Evaluation Kit, and Amazon EC2 F1 Instances.
+This is the top level for the Zcash FPGA. It targets both Xilinx Virtex UltraScale+ FPGA VCU118 Evaluation Kit, and Amazon EC2 F1 Instances.

 Architecture document is [here]()

-Contains the following top-level engine:
-### Equihash verification engine
-### EC secp256k1 signature verification engine
-### EC bls12-381 co-processor
+It optionally contains the following top-level engines (you can optionally include in a build via parameters):
+* Equihash verification engine
+* EC secp256k1 signature verification engine
+* EC bls12-381 co-processor


 ## ip_cores
@ -25,4 +25,7 @@ These contain custom IP cores used in the projects in this repo.
 * blake2b - A simple implementation of blake2b and a pipline-unrolled version for high performance.
 * common - Packages and interfaces that are shared.
 * fifo - Fifo implementations
-* parsing - Blocks for parsing/processing streams, as well as testbench files.
+* parsing - Blocks for parsing/processing streams, as well as testbench files.
+* Karabutsa multiplier
+* Barret reduction
+* Resource arbitrators
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_fp_mult_mod.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_fp_mult_mod.sv
@ -1,7 +1,6 @@
 /*
  This performs a multiplication followed by modular reduction
-  using karabusa multiplier and barrets algorithm, for the bls381-12
-  curve on Fp elements.
+  using karabusa multiplier and barrets algorithm on Fp elements.

  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation

@ -19,41 +18,48 @@
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

-module bls12_381_fp_mult_mod #(
-  parameter CTL_BITS = 16
+module ec_fp_mult_mod #(
+  parameter                DAT_BITS,
+  parameter [DAT_BITS-1:0] P,
+  parameter                KARATSUBA_LVL,
+  parameter                CTL_BITS = 16
 )(
  input i_clk, i_rst,
  // Input value
-  input [380:0]        i_dat_a,
-  input [380:0]        i_dat_b,
+  input [DAT_BITS-1:0] i_dat_a,
+  input [DAT_BITS-1:0] i_dat_b,
+  input
  input                i_val,
  input                i_err,
  input [CTL_BITS-1:0] i_ctl,
  output logic         o_rdy,
  // output
-  output logic [380:0]        o_dat,
+  output logic [DAT_BITS-1:0] o_dat,
  output logic [CTL_BITS-1:0] o_ctl,
  input                       i_rdy,
  output logic                o_val
 );

-localparam ARB_BIT = 8;
-localparam KARATSUBA_LEVEL = 3;
-import bls12_381_pkg::*;
+// The reduction mod takes DAT_BITS + 1 bits, but we also need to make sure we are a multiple of KARATSUBA_LVL*2
+parameter MLT_BITS = DAT_BITS + 1 + (KARATSUBA_LVL*2 - (DAT_BITS + 1) % (KARATSUBA_LVL*2));

-if_axi_stream #(.DAT_BYTS(384*2/8), .CTL_BITS(CTL_BITS)) mult_if_in [3:0] (i_clk);
-if_axi_stream #(.DAT_BYTS(384*2/8), .CTL_BITS(CTL_BITS)) mult_if_out [3:0] (i_clk);
+localparam ARB_BIT = 8;
+
+if_axi_stream #(.DAT_BITS(MLT_BITS*2), .CTL_BITS(CTL_BITS)) mult_if_in [3:0] (i_clk);
+if_axi_stream #(.DAT_BITS(MLT_BITS*2), .CTL_BITS(CTL_BITS)) mult_if_out [3:0] (i_clk);

 always_comb begin
  mult_if_in[0].mod = 0;
  mult_if_in[0].sop = 1;
  mult_if_in[0].eop = 1;
-  mult_if_in[0].dat = {3'd0, i_dat_b, 3'd0, i_dat_a};
+  mult_if_in[0].dat = 0;
+  mult_if_in[0].dat[0 +: DAT_BITS] = i_dat_a;
+  mult_if_in[0].dat[MLT_BITS +: DAT_BITS] = i_dat_b;
  mult_if_in[0].val = i_val;
  mult_if_in[0].err = i_err;
  mult_if_in[0].ctl = i_ctl;
  o_rdy = mult_if_in[0].rdy;
-  
+
  mult_if_out[3].sop = 1;
  mult_if_out[3].eop = 1;
  mult_if_out[3].mod = 0;
@ -77,16 +83,16 @@ resource_share_mod (
 );

 karatsuba_ofman_mult # (
-  .BITS     ( 384             ),
-  .LEVEL    ( KARATSUBA_LEVEL ),
-  .CTL_BITS ( CTL_BITS        )
+  .BITS     ( MLT_BITS      ),
+  .LEVEL    ( KARATSUBA_LVL ),
+  .CTL_BITS ( CTL_BITS      )
 )
 karatsuba_ofman_mult (
  .i_clk  ( i_clk ),
  .i_rst  ( i_rst ),
  .i_ctl  ( mult_if_in[3].ctl ),
-  .i_dat_a( mult_if_in[3].dat[0 +: 384]   ),
-  .i_dat_b( mult_if_in[3].dat[384 +: 384] ),
+  .i_dat_a( mult_if_in[3].dat[0 +: MLT_BITS]   ),
+  .i_dat_b( mult_if_in[3].dat[MLT_BITS +: MLT_BITS] ),
  .i_val  ( mult_if_in[3].val ),
  .o_rdy  ( mult_if_in[3].rdy ),
  .o_dat  ( mult_if_out[3].dat ),
@ -95,10 +101,12 @@ karatsuba_ofman_mult (
  .o_ctl  ( mult_if_out[3].ctl )
 );

- barret_mod_pipe #(
-  .DAT_BITS ( 384      ),
+
+
+barret_mod_pipe #(
+  .DAT_BITS ( MLT_BITS ),
  .CTL_BITS ( CTL_BITS ),
-  .P        ( bls12_381_pkg::P )
+  .P        ( P        )
 )
 barret_mod_pipe (
  .i_clk ( i_clk ),
--- a/ip_cores/ec/src/rtl/ec_fp_pkg.sv
+++ b/ip_cores/ec/src/rtl/ec_fp_pkg.sv
@ -0,0 +1,109 @@
+/*
+  Package for Fp fields
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package ec_fp_pkg;
+
+  // Expected to be in Jacobian coordinates
+  typedef struct packed {
+    logic [255:0] x, y, z;
+  } jb_point_t;
+
+  function is_zero(jb_point_t p);
+    is_zero = (p.x == 0 && p.y == 0 && p.z == 1);
+    return is_zero;
+  endfunction
+
+  // Function to double point in Jacobian coordinates (for comparison in testbench)
+  // Here a is 0, and we also mod the result
+  function jb_point_t dbl_jb_point(jb_point_t p, input logic [1023:0] mod);
+    logic signed [1023:0] I_X, I_Y, I_Z, A, B, C, D, X, Y, Z;
+
+    if (p.z == 0) return p;
+
+    I_X = p.x;
+    I_Y = p.y;
+    I_Z = p.z;
+    A = (I_Y*I_Y) % mod;
+    B = (((4*I_X) % mod)*A) % mod;
+    C = (((8*A) % mod)*A) % mod;
+    D = (((3*I_X)% mod)*I_X) % mod;
+    X = (D*D)% mod;
+    X = X + ((2*B) % mod > X ? mod : 0) - (2*B) % mod;
+
+    Y = (D*((B + (X > B ? mod : 0)-X) % mod)) % mod;
+    Y = Y + (C > Y ? mod : 0) - C;
+    Z = (((2*I_Y)% mod)*I_Z) % mod;
+
+    dbl_jb_point.x = X;
+    dbl_jb_point.y = Y;
+    dbl_jb_point.z = Z;
+    return dbl_jb_point;
+  endfunction
+
+  function jb_point_t add_jb_point(jb_point_t p1, p2, input logic [1023:0] mod);
+    logic signed [1023:0] A, U1, U2, S1, S2, H, H3, R;
+
+    if (p1.z == 0) return p2;
+    if (p2.z == 0) return p1;
+
+    if (p1.y == p2.y && p1.x == p2.x)
+      return (dbl_jb_point(p1));
+
+    U1 = p1.x*p2.z % mod;
+    U1 = U1*p2.z % mod;
+
+    U2 = p2.x*p1.z % mod;
+    U2 = U2 *p1.z % mod;
+    S1 = p1.y *p2.z % mod;
+    S1 = (S1*p2.z % mod) *p2.z % mod;
+    S2 = p2.y * p1.z % mod;
+    S2 = (S2*p1.z  % mod) *p1.z % mod;
+
+    H = U2 + (U1 > U2 ? mod : 0) -U1;
+    R = S2 + (S1 > S2 ? mod : 0) -S1;
+    H3 = ((H * H %mod ) * H ) % mod;
+    A = (((2*U1 % mod) *H % mod) * H % mod);
+
+    add_jb_point.z = ((H * p1.z % mod) * p2.z) % mod;
+    add_jb_point.x = R*R % mod;
+
+    add_jb_point.x = add_jb_point.x + (H3 > add_jb_point.x ? mod : 0) - H3;
+    add_jb_point.x = add_jb_point.x + (A > add_jb_point.x ? mod : 0) - A;
+
+    A = (U1*H % mod) * H % mod;
+    A = A + (add_jb_point.x > A ? mod : 0) - add_jb_point.x;
+    A = A*R % mod;
+    add_jb_point.y = S1*H3 % mod;
+
+    add_jb_point.y = A + (add_jb_point.y > A ? mod : 0) - add_jb_point.y;
+
+  endfunction
+
+  function on_curve(jb_point_t p);
+    return (p.y*p.y - p.x*p.x*p.x - secp256k1_pkg::a*p.x*p.z*p.z*p.z*p.z - secp256k1_pkg::b*p.z*p.z*p.z*p.z*p.z*p.z);
+  endfunction
+
+  function print_jb_point(jb_point_t p);
+    $display("x:%h", p.x);
+    $display("y:%h", p.y);
+    $display("z:%h", p.z);
+    return;
+  endfunction
+
+endpackage
--- a/ip_cores/ec/src/rtl/ec_fp_point_dbl.sv
+++ b/ip_cores/ec/src/rtl/ec_fp_point_dbl.sv
@ -0,0 +1,253 @@
+/*
+  This performs point doubling on a prime field Fp using jacobian points.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fp_point_dbl
+#(
+  parameter                DAT_BITS,
+  parameter [DAT_BITS-1:0] P
+)(
+  input i_clk, i_rst,
+  // Input point
+  input jb_point_t i_p,
+  input logic      i_val,
+  output logic     o_rdy,
+  // Output point
+  output jb_point_t o_p,
+  input logic      i_rdy,
+  output logic     o_val,
+  output logic     o_err,
+  // Interface to multiplier (mod p)
+  if_axi_stream.source o_mult_if,
+  if_axi_stream.sink   i_mult_if
+);
+
+/*
+ * These are the equations that need to be computed, they are issued as variables
+ * become valid. We have a bitmask to track what equation results are valid which
+ * will trigger other equations. [] show what equations must be valid before this starts.
+ *
+ * 0.    A = (i_p.y)^2 mod p
+ * 1.    B = (i_p.x)*A mod p [eq0]
+ * 2.    B = 4*B mod p [eq1]
+ * 3.    C = A^2 mod p [eq0]
+ * 4.    C = C*8 mod p [eq3]
+ * 5.    D = (i_p.x)^2 mod p
+ * 6.    D = 3*D mod p [eq5]
+ * 7.   (o_p.x) = D^2 mod p[eq6]
+ * 8.    E = 2*B mod p [eq2]
+ * 9.   (o_p.x) = o_p.x - E mod p [eq8, eq7]
+ * 10   (o_p.y) =  B - o_p.x mod p [eq9, eq2]
+ * 11.   (o_p.y) = D*(o_p.y) [eq10, eq6]
+ * 12.   (o_p.y) = (o_p.y) - C mod p [eq11, eq4]
+ * 13.   (o_p.z) = 2*(i_p.y) mod p
+ * 14.   (o_p.z) = o_p.y * i_p.z mod p [eq14]
+ */
+logic [14:0] eq_val, eq_wait;
+
+// Temporary variables
+logic [DAT_BITS-1:0] A, B, C, D, E;
+jb_point_t i_p_l;
+
+always_comb begin
+  o_mult_if.sop = 1;
+  o_mult_if.eop = 1;
+  o_mult_if.err = 0;
+  o_mult_if.mod = 0;
+end
+
+enum {IDLE, START, FINISHED} state;
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_val <= 0;
+    o_rdy <= 0;
+    o_p <= 0;
+    o_mult_if.val <= 0;
+    o_mult_if.ctl <= 0;
+    o_mult_if.dat <= 0;
+    i_mult_if.rdy <= 0;
+    i_mod_if.rdy <= 0;
+    eq_val <= 0;
+    state <= IDLE;
+    eq_wait <= 0;
+    i_p_l <= 0;
+    o_err <= 0;
+    A <= 0;
+    B <= 0;
+    C <= 0;
+    D <= 0;
+    E <= 0;
+  end else begin
+
+    if (o_mult_if.rdy) o_mult_if.val <= 0;
+
+    case(state)
+      {IDLE}: begin
+        o_rdy <= 1;
+        eq_val <= 0;
+        eq_wait <= 0;
+        o_err <= 0;
+        i_mult_if.rdy <= 1;
+        i_p_l <= i_p;
+        A <= 0;
+        B <= 0;
+        C <= 0;
+        D <= 0;
+        E <= 0;
+        o_val <= 0;
+        if (i_val && o_rdy) begin
+          state <= START;
+          o_rdy <= 0;
+          if (i_p.z == 0) begin
+            o_p <= i_p;
+            o_val <= 1;
+            state <= FINISHED;
+          end
+        end
+      end
+      // Just a big if tree where we issue equations if the required inputs
+      // are valid
+      {START}: begin
+        i_mult_if.rdy <= 1;
+
+        // Check any results from multiplier
+        if (i_mult_if.val && i_mult_if.rdy) begin
+          eq_val[i_mult_if.ctl[5:0]] <= 1;
+          case(i_mult_if.ctl[5:0]) inside
+            0: A <= i_mult_if.dat;
+            1: B <= i_mult_if.dat;
+            2: B <= i_mult_if.dat;
+            3: C <= i_mult_if.dat;
+            4: C <= i_mult_if.dat;
+            5: D <= i_mult_if.dat;
+            6: D <= i_mult_if.dat;
+            7: o_p.x <= i_mult_if.dat;
+            11: o_p.y <= i_mult_if.dat;
+            14: o_p.z <= i_mult_if.dat;
+            default: o_err <= 1;
+          endcase
+        end
+
+        // Issue new multiplies
+        if (~eq_wait[0]) begin              //0.    A = (i_p.y)^2 mod p
+          multiply(0, i_p_l.y, i_p_l.y);
+        end else
+        if (eq_val[0] && ~eq_wait[1]) begin //1.    B = (i_p.x)*A mod p [eq0]
+          multiply(1, i_p_l.x, A);
+        end else
+        if (eq_val[0] && ~eq_wait[3]) begin //3.    C = A^2 mod p [eq0]
+          multiply(3, A, A);
+        end else
+        if (~eq_wait[5]) begin              //5.    D = (i_p.x)^2 mod p
+          multiply(5, i_p_l.x, i_p_l.x);
+        end else
+        if (eq_val[5] && ~eq_wait[6]) begin //6.    D = 3*D mod p [eq5]
+          multiply(6, 256'd3, D);
+        end else
+        if (eq_val[6] && ~eq_wait[7]) begin //7.   (o_p.x) = D^2 mod p[eq6]
+          multiply(7, D, D);
+        end else
+        if (eq_val[10] && eq_val[6] && ~eq_wait[11]) begin //11.   (o_p.y) = D*(o_p.y) [eq10, eq6]
+          multiply(11, D, o_p.y);
+        end else
+        if (eq_val[13] && ~eq_wait[14]) begin //14.   (o_p.z) = o_p.z * i_p.z mod p [eq13]
+          multiply(14, i_p_l.z, o_p.z);
+        end else
+        if (eq_val[1] && ~eq_wait[2]) begin //2.    B = 4*B mod p [eq1]
+          multiply(2, B, 4);
+        end else
+        if (eq_val[3] && ~eq_wait[4]) begin //4.    C = C*8 mod p [eq3]
+          multiply(4, C, 8);
+        end
+
+
+        // Additions / subtractions we do in-module
+        if (eq_val[8] && eq_val[7] && ~eq_wait[9]) begin //9.   (o_p.x) = o_p.x - E mod p [eq8, eq7]
+          o_p.x <= subtract(9, o_p.x, E);
+        end
+
+        if (eq_val[9] && eq_val[2] && ~eq_wait[10]) begin //10.   (o_p.y) =  B - o_p.x mod p [eq9, eq2]
+          o_p.y <= subtract(10, B, o_p.x);
+        end
+
+        if (eq_val[2] && ~eq_wait[8]) begin //8.    E = 2*B mod p [eq2]
+          E <= addition(8, B, B);
+        end
+
+        if (~eq_wait[13]) begin            //13.   (o_p.z) = 2*(i_p.y) mod p
+          o_p.z <= addition(13, i_p_l.y, i_p_l.y);
+        end
+
+
+        if (eq_val[4] && eq_val[11] && ~eq_wait[12]) begin //12.   (o_p.y) = (o_p.y) - C mod p [eq11, eq4]
+          o_p.y <= subtract(12, o_p.y, C);
+        end
+
+        if (&eq_val) begin
+          state <= FINISHED;
+          o_val <= 1;
+        end
+      end
+      {FINISHED}: begin
+        if (o_val && i_rdy) begin
+          state <= IDLE;
+          o_val <= 0;
+          o_rdy <= 1;
+        end
+      end
+    endcase
+
+    if (o_err) begin
+      o_val <= 1;
+      if (o_val && i_rdy) begin
+        o_err <= 0;
+        state <= IDLE;
+      end
+    end
+
+  end
+end
+
+// Task for subtractions
+function logic [DAT_BITS-1:0] subtract(input int unsigned ctl, input logic [DAT_BITS-1:0] a, b);
+  eq_wait[ctl] <= 1;
+  eq_val[ctl] <= 1;
+  return (a + (b > a ? P : 0) - b);
+endfunction
+
+// Task for addition
+function logic [DAT_BITS-1:0] addition(input int unsigned ctl, input logic [DAT_BITS-1:0] a, b);
+  eq_wait[ctl] <= 1;
+  eq_val[ctl] <= 1;
+  return (a + b - (b + a > P ? P : 0));
+endfunction
+
+// Task for using multiplies
+task multiply(input int unsigned ctl, input logic [DAT_BITS-1:0] a, b);
+  if (~o_mult_if.val || (o_mult_if.val && o_mult_if.rdy)) begin
+    o_mult_if.val <= 1;
+    o_mult_if.dat[0 +: DAT_BITS] <= a;
+    o_mult_if.dat[DAT_BITS +: DAT_BITS] <= b;
+    o_mult_if.ctl[5:0] <= ctl;
+    eq_wait[ctl] <= 1;
+  end
+endtask
+
+
+
+endmodule
--- a/zcash_fpga/src/tb/bls12_381_fp_mult_mod_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_fp_mult_mod_tb.sv
@ -16,8 +16,10 @@
 */
 `timescale 1ps/1ps

-module bls12_381_fp_mult_mod_tb ();
+module ec_fp_mult_mod_tb ();
 import common_pkg::*;
+import ec_fp_pkg::*;
+
 import bls12_381_pkg::*;

 localparam CLK_PERIOD = 100;
@ -53,8 +55,13 @@ logic [380:0] out_dat;

 always_comb out_if.dat = {3'd0, out_dat};

-bls12_381_fp_mult_mod
-bls12_381_fp_mult_mod (
+ec_fp_mult_mod #(
+  .DAT_BITS      ( 381              ),
+  .P             ( bls12_381_pkg::P ),
+  .KARATSUBA_LVL ( 3                ),
+  .CTL_BITS      ( 16               )
+)
+ec_fp_mult_mod (
  .i_clk( clk         ),
  .i_rst( rst         ),
  .i_ctl ( 16'd0       ),
--- a/ip_cores/ec/src/tb/ec_fp_point_dbl_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_dbl_tb.sv
@ -0,0 +1,150 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+module ec_fp_point_dbl_tb ();
+
+import common_pkg::*;
+import secp256k1_pkg::*;
+import ec_fp_pkg::*;
+
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+if_axi_stream #(.DAT_BYTS(256*3/8)) in_if(clk); // Point is X, Y, Z
+if_axi_stream #(.DAT_BYTS(256*3/8)) out_if(clk);
+
+if_axi_stream #(.DAT_BYTS(256*2/8), .CTL_BITS(8)) mult_in_if(clk);
+if_axi_stream #(.DAT_BYTS(256/8), .CTL_BITS(8)) mult_out_if(clk);
+
+jb_point_t in_p, out_p;
+
+always_comb begin
+  in_p = in_if.dat;
+  out_if.dat = out_p;
+end
+
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+always_comb begin
+  out_if.sop = 1;
+  out_if.eop = 1;
+  out_if.ctl = 0;
+  out_if.mod = 0;
+end
+
+// Check for errors
+always_ff @ (posedge clk)
+  if (out_if.val && out_if.err)
+    $error(1, "%m %t ERROR: output .err asserted", $time);
+
+ec_point_dbl #(
+  .DAT_BITS (256),
+  .P(secp256k1_pkg::p)
+)
+ec_point_dbl(
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+    // Input point
+  .i_p   ( in_p      ),
+  .i_val ( in_if.val ),
+  .o_rdy ( in_if.rdy ),
+  .o_p   ( out_p     ),
+  .o_err ( out_if.err ),
+  .i_rdy ( out_if.rdy ),
+  .o_val  ( out_if.val ) ,
+  .o_mult_if ( mult_in_if ),
+  .i_mult_if ( mult_out_if )
+);
+
+// Attach a mod reduction unit and multiply - mod unit
+// In full design these could use dedicated multipliers or be arbitrated
+secp256k1_mult_mod #(
+  .CTL_BITS ( 8 )
+)
+secp256k1_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_dat_a ( mult_in_if.dat[0 +: 256] ),
+  .i_dat_b ( mult_in_if.dat[256 +: 256] ),
+  .i_val ( mult_in_if.val ),
+  .i_err ( mult_in_if.err ),
+  .i_ctl ( mult_in_if.ctl ),
+  .i_cmd ( 1'd0           ),
+  .o_rdy ( mult_in_if.rdy ),
+  .o_dat ( mult_out_if.dat ),
+  .i_rdy ( mult_out_if.rdy ),
+  .o_val ( mult_out_if.val ),
+  .o_ctl ( mult_out_if.ctl ),
+  .o_err ( mult_out_if.err )
+);
+
+
+task test_0();
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected,  get_dat;
+  logic [255:0] in_a, in_b;
+  jb_point_t p_in, p_exp, p_out;
+  $display("Running test_0...");
+  //p_in = {z:1, x:4, y:2};
+  //p_in = {z:10, x:64, y:23};
+  p_in = {x:secp256k1_pkg::Gx, y:secp256k1_pkg::Gx, z:1};
+  p_exp = dbl_jb_point(p_in, secp256k1_pkg::p);
+
+  fork
+    in_if.put_stream(p_in, 256*3/8);
+    out_if.get_stream(get_dat, get_len);
+  join
+
+  p_out = get_dat;
+
+  if (p_exp != p_out) begin
+    $display("Expected:");
+    print_jb_point(p_exp);
+    $display("Was:");
+    print_jb_point(p_out);
+    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
+  end
+
+  $display("test_0 PASSED");
+end
+endtask;
+
+function compare_point();
+
+endfunction
+
+initial begin
+  out_if.rdy = 0;
+  in_if.val = 0;
+  #(40*CLK_PERIOD);
+
+  test_0();
+
+  #1us $finish();
+end
+endmodule