Updates to logic for point multiplication in Fp2

2019-06-13 17:57:03 +08:00 · 2019-06-13 17:57:03 +08:00 · 8ebfdeb734
parent c3e527b87e
commit 8ebfdeb734
13 changed files with 757 additions and 781 deletions
--- a/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv
+++ b/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv
@ -1,6 +1,8 @@
 /*
  This provides the interface to perform
-  Fp^2 point logic (adding, subtracting, multiplication)
+  Fp^2 point logic (adding, subtracting, multiplication).
+
+  A possible improvement would be a control to bypass the Fp2 logic so that plain Fp operations run faster.

  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation

@ -21,7 +23,8 @@
 module ec_fe2_arithmetic
 #(
  parameter type FE_TYPE,
-  parameter type FE2_TYPE
+  parameter type FE2_TYPE,
+  parameter CTL_BIT = 8        // From this bit 2 bits are used for control
 )(
  input i_clk, i_rst,
  // Interface to FE_TYPE multiplier (mod P)
@ -44,7 +47,6 @@ module ec_fe2_arithmetic
  if_axi_stream.sink   i_sub_fe2_if
 );

-localparam ADD_CTL_BIT = 8;
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16))   add_if_fe_i [2] (i_clk);
 if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) add_if_fe_o [2] (i_clk);

@ -76,7 +78,7 @@ always_ff @ (posedge i_clk) begin
          add_if_fe_o[0].copy_if({i_add_fe2_if.dat[0 +: $bits(FE_TYPE)],
                                  i_add_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]},
                                  i_add_fe2_if.val, 1, 1, i_add_fe2_if.err, i_add_fe2_if.mod, i_add_fe2_if.ctl);
-          add_if_fe_o[0].ctl[ADD_CTL_BIT] <= 0;
+          add_if_fe_o[0].ctl[CTL_BIT] <= 0;
          if (i_add_fe2_if.val) add_state <= ADD1;
        end
      end
@ -85,7 +87,7 @@ always_ff @ (posedge i_clk) begin
          add_if_fe_o[0].copy_if({i_add_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
                                i_add_fe2_if.dat[$bits(FE2_TYPE)+$bits(FE_TYPE) +: $bits(FE_TYPE)]},
                                i_add_fe2_if.val, 1, 1, i_add_fe2_if.err, i_add_fe2_if.mod, i_add_fe2_if.ctl);
-          add_if_fe_o[0].ctl[ADD_CTL_BIT] <= 1;
+          add_if_fe_o[0].ctl[CTL_BIT] <= 1;
          if (i_add_fe2_if.val) add_state <= ADD0;
        end
      end
@ -94,7 +96,7 @@ always_ff @ (posedge i_clk) begin
    // One process to assign outputs
    if (~o_add_fe2_if.val || (o_add_fe2_if.val && o_add_fe2_if.rdy)) begin
      o_add_fe2_if.ctl <= add_if_fe_i[0].ctl;
-      if (add_if_fe_i[0].ctl[ADD_CTL_BIT] == 0) begin
+      if (add_if_fe_i[0].ctl[CTL_BIT] == 0) begin
        if (add_if_fe_i[0].val)
          o_add_fe2_if.dat[0 +: $bits(FE_TYPE)] <= add_if_fe_i[0].dat;
      end else begin
@ -129,7 +131,7 @@ always_ff @ (posedge i_clk) begin
          sub_if_fe_o[0].copy_if({i_sub_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)],
                                  i_sub_fe2_if.dat[0 +: $bits(FE_TYPE)]},
                                  i_sub_fe2_if.val, 1, 1, i_sub_fe2_if.err, i_sub_fe2_if.mod, i_sub_fe2_if.ctl);
-          sub_if_fe_o[0].ctl[ADD_CTL_BIT] <= 0;
+          sub_if_fe_o[0].ctl[CTL_BIT] <= 0;
          if (i_sub_fe2_if.val) sub_state <= SUB1;
        end
      end
@ -138,7 +140,7 @@ always_ff @ (posedge i_clk) begin
          sub_if_fe_o[0].copy_if({i_sub_fe2_if.dat[$bits(FE_TYPE) + $bits(FE2_TYPE) +: $bits(FE_TYPE)],
                                  i_sub_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)]},
                                  i_sub_fe2_if.val, 1, 1, i_sub_fe2_if.err, i_sub_fe2_if.mod, i_sub_fe2_if.ctl);
-          sub_if_fe_o[0].ctl[ADD_CTL_BIT] <= 1;
+          sub_if_fe_o[0].ctl[CTL_BIT] <= 1;
          if (i_sub_fe2_if.val) sub_state <= SUB0;
        end
      end
@ -147,7 +149,7 @@ always_ff @ (posedge i_clk) begin
    // One process to assign outputs
    if (~o_sub_fe2_if.val || (o_sub_fe2_if.val && o_sub_fe2_if.rdy)) begin
      o_sub_fe2_if.ctl <= sub_if_fe_i[0].ctl;
-      if (sub_if_fe_i[0].ctl[ADD_CTL_BIT] == 0) begin
+      if (sub_if_fe_i[0].ctl[CTL_BIT] == 0) begin
        if (sub_if_fe_i[0].val)
          o_sub_fe2_if.dat[0 +: $bits(FE_TYPE)] <= sub_if_fe_i[0].dat;
      end else begin
@ -165,7 +167,7 @@ logic [1:0] add_sub_val;
 always_comb begin
  mul_if_fe2_i.rdy = mul_state == MUL3 && (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy));

-  i_mul_fe_if.rdy = (i_mul_fe_if.ctl[ADD_CTL_BIT +: 2] == 0 || i_mul_fe_if.ctl[ADD_CTL_BIT +: 2] == 1) ?
+  i_mul_fe_if.rdy = (i_mul_fe_if.ctl[CTL_BIT +: 2] == 0 || i_mul_fe_if.ctl[CTL_BIT +: 2] == 1) ?
                  (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) :
                  (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy));

@ -202,28 +204,28 @@ always_ff @ (posedge i_clk) begin
          o_mul_fe_if.copy_if({mul_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
                            mul_if_fe2_i.dat[$bits(FE2_TYPE)  +: $bits(FE_TYPE)]},
                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
-          o_mul_fe_if.ctl[ADD_CTL_BIT +: 2] <= 0;
+          o_mul_fe_if.ctl[CTL_BIT +: 2] <= 0;
          if (mul_if_fe2_i.val) mul_state <= MUL1;
        end
        MUL1: begin
          o_mul_fe_if.copy_if({mul_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
                            mul_if_fe2_i.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]},
                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
-          o_mul_fe_if.ctl[ADD_CTL_BIT +: 2] <= 1;
+          o_mul_fe_if.ctl[CTL_BIT +: 2] <= 1;
          if (mul_if_fe2_i.val) mul_state <= MUL2;
        end
        MUL2: begin
          o_mul_fe_if.copy_if({mul_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
                            mul_if_fe2_i.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]},
                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
-          o_mul_fe_if.ctl[ADD_CTL_BIT +: 2] <= 2;
+          o_mul_fe_if.ctl[CTL_BIT +: 2] <= 2;
          if (mul_if_fe2_i.val) mul_state <= MUL3;
        end
        MUL3: begin
          o_mul_fe_if.copy_if({mul_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
                            mul_if_fe2_i.dat[$bits(FE2_TYPE)  +: $bits(FE_TYPE)]},
                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
-          o_mul_fe_if.ctl[ADD_CTL_BIT +: 2] <= 3;
+          o_mul_fe_if.ctl[CTL_BIT +: 2] <= 3;
          if (mul_if_fe2_i.val) mul_state <= MUL0;
        end
      endcase
@ -231,10 +233,10 @@ always_ff @ (posedge i_clk) begin

    // Process multiplications and do subtraction
    if (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) begin
-      if (i_mul_fe_if.ctl[ADD_CTL_BIT +: 2] == 0) begin
+      if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 0) begin
        if (i_mul_fe_if.val) sub_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
      end
-      if (i_mul_fe_if.ctl[ADD_CTL_BIT +: 2] == 1) begin
+      if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 1) begin
        sub_if_fe_o[1].val <= i_mul_fe_if.val;
        sub_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
      end
@ -243,10 +245,10 @@ always_ff @ (posedge i_clk) begin

    // Process multiplications and do addition
    if (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy)) begin
-      if (i_mul_fe_if.ctl[ADD_CTL_BIT +: 2] == 2) begin
+      if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 2) begin
        if (i_mul_fe_if.val) add_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
      end
-      if (i_mul_fe_if.ctl[ADD_CTL_BIT +: 2] == 3) begin
+      if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 3) begin
        add_if_fe_o[1].val <= i_mul_fe_if.val;
        add_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
      end
--- a/ip_cores/ec/src/rtl/ec_fp_mult_mod.sv
+++ b/ip_cores/ec/src/rtl/ec_fp_mult_mod.sv
@ -27,17 +27,8 @@ module ec_fp_mult_mod #(
  parameter                CTL_BITS = 16
 )(
  input i_clk, i_rst,
-  // Input value
-  input [DAT_BITS-1:0] i_dat_a,
-  input [DAT_BITS-1:0] i_dat_b,
-  input                i_val,
-  input [CTL_BITS-1:0] i_ctl,
-  output logic         o_rdy,
-  // output
-  output logic [DAT_BITS-1:0] o_dat,
-  output logic [CTL_BITS-1:0] o_ctl,
-  input                       i_rdy,
-  output logic                o_val
+  if_axi_stream.sink   i_mul,
+  if_axi_stream.source o_mul
 );

 // The reduction mod takes DAT_BITS + 1 bits, but we also need to make sure we are a multiple of KARATSUBA_LVL*2
@ -54,12 +45,12 @@ karatsuba_ofman_mult # (
 karatsuba_ofman_mult_0 (
  .i_clk  ( i_clk ),
  .i_rst  ( i_rst ),
-  .i_ctl  ( i_ctl ),
-  .i_dat_a( {3'd0, i_dat_a}  ),
-  .i_dat_b( {3'd0, i_dat_b} ),
-  .i_val  ( i_val ),
-  .o_rdy  ( o_rdy ),
-  .o_dat  ( mult_if[0].dat ),
+  .i_ctl  ( i_mul.ctl ),
+  .i_dat_a( {{(MLT_BITS-DAT_BITS){1'd0}}, i_mul.dat[0 +: DAT_BITS]}  ),
+  .i_dat_b( {{(MLT_BITS-DAT_BITS){1'd0}}, i_mul.dat[DAT_BITS +: DAT_BITS]} ),
+  .i_val  ( i_mul.val ),
+  .o_rdy  ( i_mul.rdy ),
+  .o_dat  ( mult_if[0].dat  ),
  .o_val  ( mult_if[0].val ),
  .i_rdy  ( mult_if[0].rdy ),
  .o_ctl  ( mult_if[0].ctl )
@ -114,11 +105,11 @@ barret_mod_pipe (
  .i_dat ( mult_if[0].dat ),
  .i_val ( mult_if[0].val ),
  .i_ctl ( mult_if[0].ctl ),
-  .o_ctl ( o_ctl ),
+  .o_ctl ( o_mul.ctl ),
  .o_rdy ( mult_if[0].rdy ),
-  .o_dat ( o_dat ),
-  .o_val ( o_val ),
-  .i_rdy ( i_rdy ),
+  .o_dat ( o_mul.dat ),
+  .o_val ( o_mul.val ),
+  .i_rdy ( o_mul.rdy ),
  .o_mult_if_0 ( mult_if[1]  ),
  .i_mult_if_0 ( mult_if[2] ),
  .o_mult_if_1 ( mult_if[3]  ),
--- a/ip_cores/ec/src/rtl/ec_fp_point_mult.sv
+++ b/ip_cores/ec/src/rtl/ec_fp_point_mult.sv
@ -1,394 +0,0 @@
-/*
-  This performs point multiplication. We use the standard double
-  and add algorithm.
-
-  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <https://www.gnu.org/licenses/>.
-*/
-
-module ec_fp_point_mult
-#(
-  parameter      P,
-  parameter type POINT_TYPE,
-  parameter      DAT_BITS = $clog2(P),
-  parameter      RESOURCE_SHARE = "NO"
-)(
-  input i_clk, i_rst,
-  // Input point and value to multiply
-  input POINT_TYPE           i_p,
-  input logic [DAT_BITS-1:0] i_k,
-  input logic                i_val,
-  output logic               o_rdy,
-  // Output point
-  output POINT_TYPE o_p,
-  input logic       i_rdy,
-  output logic      o_val,
-  output logic      o_err,
-  // Interface to shared logic (mod p) (if RESOURCE_SHARE == "YES")
-  if_axi_stream.source o_mult_if,
-  if_axi_stream.sink   i_mult_if,
-  if_axi_stream.source o_add_if,
-  if_axi_stream.sink   i_add_if,
-  if_axi_stream.source o_sub_if,
-  if_axi_stream.sink   i_sub_if,
-  // We provide another input so that the final point addition can be done
-  input POINT_TYPE i_p2,
-  input            i_p2_val
-);
-
-// [0] is connection from/to dbl block, [1] is add block, [2] is arbitrated value
-if_axi_stream #(.DAT_BITS(DAT_BITS*2), .CTL_BITS(16)) mult_in_if [2:0] (i_clk);
-if_axi_stream #(.DAT_BITS(DAT_BITS), .CTL_BITS(16)) mult_out_if [2:0] (i_clk);
-
-if_axi_stream #(.DAT_BITS(DAT_BITS*2), .CTL_BITS(16)) add_in_if [2:0] (i_clk);
-if_axi_stream #(.DAT_BITS(DAT_BITS), .CTL_BITS(16)) add_out_if [2:0] (i_clk);
-
-if_axi_stream #(.DAT_BITS(DAT_BITS*2), .CTL_BITS(16)) sub_in_if [2:0] (i_clk);
-if_axi_stream #(.DAT_BITS(DAT_BITS), .CTL_BITS(16)) sub_out_if [2:0] (i_clk);
-
-logic [DAT_BITS-1:0] k_l;
-POINT_TYPE p_n, p_q, p_dbl, p_add;
-logic p_dbl_in_val, p_dbl_in_rdy, p_dbl_out_err, p_dbl_out_val, p_dbl_out_rdy, p_dbl_done;
-logic p_add_in_val, p_add_in_rdy, p_add_out_err, p_add_out_val, p_add_out_rdy, p_add_done;
-logic special_dbl, lookahead_dbl;
-
-enum {IDLE, DOUBLE_ADD, ADD_ONLY, FINISHED} state;
-
-always_ff @ (posedge i_clk) begin
-  if (i_rst) begin
-    o_val <= 0;
-    o_err <= 0;
-    o_rdy <= 0;
-    k_l <= 0;
-    p_q <= 0;
-    p_dbl_in_val <= 0;
-    p_dbl_out_rdy <= 0;
-    p_add_in_val <= 0;
-    p_add_out_rdy <= 0;
-    state <= IDLE;
-    o_p <= 0;
-    p_n <= 0;
-    p_dbl_done <= 0;
-    p_add_done <= 0;
-    special_dbl <= 0;
-    lookahead_dbl <= 0;
-  end else begin
-
-    case (state)
-      {IDLE}: begin
-        p_dbl_out_rdy <= 1;
-        p_add_out_rdy <= 1;
-        p_dbl_done <= 1;
-        p_add_done <= 1;
-        special_dbl <= 0;
-        lookahead_dbl <= 0;
-        o_rdy <= 1;
-        o_err <= 0;
-        p_q <= 0;  // p_q starts at 0
-        p_n <= i_p;
-        k_l <= i_k;
-        if (o_rdy && i_val) begin
-          o_rdy <= 0;
-          state <= DOUBLE_ADD;
-        end
-        if (o_rdy && i_p2_val) begin
-          o_rdy <= 0;
-          p_n <= i_p;
-          p_q <= i_p2;
-          state <= ADD_ONLY;
-          // Check for special cases to determine double or add
-          if (i_p.x == i_p2.x && i_p.y == i_p2.y) begin
-            p_dbl_in_val <= 1;
-          end else begin
-            p_add_in_val <= 1;
-          end
-        end
-
-      end
-      {DOUBLE_ADD}: begin
-        p_dbl_in_val <= (p_dbl_in_val && p_dbl_in_rdy) ? 0 : p_dbl_in_val;
-        p_add_in_val <= (p_add_in_val && p_add_in_rdy) ? 0 : p_add_in_val;
-        if (p_dbl_out_val && p_dbl_out_rdy) begin
-          p_dbl_done <= 1;
-          if (special_dbl) begin
-            p_q <= p_dbl;
-            special_dbl <= 0;
-          end
-          p_n <= p_dbl;
-          // We can look ahead and start the next double
-          if ((k_l >> 1) != 0 && ~lookahead_dbl && ~p_add_done) begin
-            p_dbl_in_val <= 1;
-            lookahead_dbl <= 1;
-            p_dbl_out_rdy <= 0; // Want to make sure we don't output while still waiting for add
-          end
-        end
-        if (p_add_out_val && p_add_out_rdy) begin
-          p_add_done <= 1;
-          p_q <= p_add;
-        end
-
-        // Update variables and issue new commands
-        if (p_add_done && p_dbl_done) begin
-          lookahead_dbl <= 0;
-          p_dbl_out_rdy <= 1;
-          p_add_done <= 0;
-          p_dbl_done <= 0;
-          k_l <= k_l >> 1;
-          if (k_l[0]) begin
-            p_add_in_val <= 1;
-            // Need to check for special case where the x, y point is the same
-            if (p_q.x == p_n.x && p_q.y == p_n.y) begin
-              special_dbl <= 1;
-              p_add_in_val <= 0;
-              p_add_done <= 1;
-            end
-          end else begin
-            p_add_done <= 1;
-          end
-
-          // Don't need to double on the final bit
-          if ((k_l >> 1) != 0)
-            p_dbl_in_val <= ~lookahead_dbl; // Don't do if we already started
-          else
-            p_dbl_done <= 1;
-
-          if (k_l == 0) begin
-            state <= FINISHED;
-            o_p <= p_add;
-            o_val <= 1;
-            p_dbl_in_val <= 0;
-            p_add_in_val <= 0;
-          end
-        end
-
-      end
-      {ADD_ONLY}: begin
-        p_dbl_in_val <= (p_dbl_in_val && p_dbl_in_rdy) ? 0 : p_dbl_in_val;
-        p_add_in_val <= (p_add_in_val && p_add_in_rdy) ? 0 : p_add_in_val;
-
-        if (p_dbl_out_val && p_dbl_out_rdy) begin
-          state <= FINISHED;
-          o_p <= p_dbl;
-          o_val <= 1;
-        end
-        if (p_add_out_val && p_add_out_rdy) begin
-          state <= FINISHED;
-          o_p <= p_add;
-          o_val <= 1;
-        end
-      end
-      {FINISHED}: begin
-        if (i_rdy && o_val) begin
-          o_val <= 0;
-          state <= IDLE;
-        end
-      end
-    endcase
-
-    if (p_dbl_out_err || p_add_out_err) begin
-      o_err <= 1;
-      o_val <= 1;
-      state <= FINISHED;
-    end
-
-  end
-end
-
-ec_fp_point_dbl #(
-  .P          ( P          ),
-  .POINT_TYPE ( POINT_TYPE )
-)
-ec_fp_point_dbl (
-  .i_clk ( i_clk ),
-  .i_rst ( i_rst ),
-  .i_p   ( p_n           ),
-  .i_val ( p_dbl_in_val  ),
-  .o_rdy ( p_dbl_in_rdy  ),
-  // Output point
-  .o_p   ( p_dbl         ),
-  .o_err ( p_dbl_out_err ),
-  .i_rdy ( p_dbl_out_rdy ),
-  .o_val ( p_dbl_out_val ),
-  // Interfaces to shared logic
-  .o_mult_if ( mult_in_if[0]  ),
-  .i_mult_if ( mult_out_if[0] ),
-  .o_add_if  ( add_in_if[0]   ),
-  .i_add_if  ( add_out_if[0]  ),
-  .o_sub_if  ( sub_in_if[0]   ),
-  .i_sub_if  ( sub_out_if[0]  )
-);
-
-ec_fp_point_add #(
-  .P          ( P          ),
-  .POINT_TYPE ( POINT_TYPE )
-)
-ec_fp_point_add (
-  .i_clk ( i_clk ),
-  .i_rst ( i_rst ),
-  // Input points
-  .i_p1  ( p_q           ),
-  .i_p2  ( p_n           ),
-  .i_val ( p_add_in_val  ),
-  .o_rdy ( p_add_in_rdy  ),
-  // Output point
-  .o_p   ( p_add         ),
-  .o_err ( p_add_out_err ),
-  .i_rdy ( p_add_out_rdy ),
-  .o_val ( p_add_out_val ),
-  // Interfaces to shared logic
-  .o_mult_if ( mult_in_if[1]  ),
-  .i_mult_if ( mult_out_if[1] ),
-  .o_add_if  ( add_in_if[1]   ),
-  .i_add_if  ( add_out_if[1]  ),
-  .o_sub_if  ( sub_in_if[1]   ),
-  .i_sub_if  ( sub_out_if[1]  )
-);
-
-resource_share # (
-  .NUM_IN ( 2 ),
-  .OVR_WRT_BIT ( 8 ),
-  .PIPELINE_IN ( 0 ),
-  .PIPELINE_OUT ( 0 )
-)
-resource_share_mult (
-  .i_clk ( i_clk ),
-  .i_rst ( i_rst ),
-  .i_axi ( mult_in_if[1:0]  ),
-  .o_res ( mult_in_if[2]    ),
-  .i_res ( mult_out_if[2]   ),
-  .o_axi ( mult_out_if[1:0] )
-);
-
-resource_share # (
-  .NUM_IN ( 2 ),
-  .OVR_WRT_BIT ( 8 ),
-  .PIPELINE_IN ( 0 ),
-  .PIPELINE_OUT ( 0 )
-)
-resource_share_add (
-  .i_clk ( i_clk ),
-  .i_rst ( i_rst ),
-  .i_axi ( add_in_if[1:0]  ),
-  .o_res ( add_in_if[2]    ),
-  .i_res ( add_out_if[2]   ),
-  .o_axi ( add_out_if[1:0] )
-);
-
-resource_share # (
-  .NUM_IN ( 2 ),
-  .OVR_WRT_BIT ( 8 ),
-  .PIPELINE_IN ( 0 ),
-  .PIPELINE_OUT ( 0 )
-)
-resource_share_sub (
-  .i_clk ( i_clk ),
-  .i_rst ( i_rst ),
-  .i_axi ( sub_in_if[1:0]  ),
-  .o_res ( sub_in_if[2]    ),
-  .i_res ( sub_out_if[2]   ),
-  .o_axi ( sub_out_if[1:0] )
-);
-generate
-  if (RESOURCE_SHARE == "YES") begin: RESOURCE_GEN
-    always_comb begin
-      o_mult_if.copy_if_comb(mult_in_if[2].dat, mult_in_if[2].val, 1, 1, 0, 0, mult_in_if[2].ctl);
-      mult_in_if[2].rdy = o_mult_if.rdy;
-      mult_out_if[2].copy_if_comb(i_mult_if.dat, i_mult_if.val, 1, 1, 0, 0, i_mult_if.ctl);
-      i_mult_if.rdy = mult_out_if[2].rdy;
-    end
-    always_comb begin
-      o_add_if.copy_if_comb(add_in_if[2].dat, add_in_if[2].val, 1, 1, 0, 0, add_in_if[2].ctl);
-      add_in_if[2].rdy = o_add_if.rdy;
-      add_out_if[2].copy_if_comb(i_add_if.dat, i_add_if.val, 1, 1, 0, 0, i_add_if.ctl);
-      i_add_if.rdy = add_out_if[2].rdy;
-    end
-    always_comb begin
-      o_sub_if.copy_if_comb(sub_in_if[2].dat, sub_in_if[2].val, 1, 1, 0, 0, sub_in_if[2].ctl);
-      sub_in_if[2].rdy = o_sub_if.rdy;
-      sub_out_if[2].copy_if_comb(i_sub_if.dat, i_sub_if.val, 1, 1, 0, 0, i_sub_if.ctl);
-      i_sub_if.rdy = sub_out_if[2].rdy;
-    end
-  end else begin
-    always_comb begin
-      o_mult_if.reset_source();
-      i_mult_if.rdy = 0;
-      o_add_if.reset_source();
-      i_add_if.rdy = 0;
-      o_sub_if.reset_source();
-      i_sub_if.rdy = 0;
-    end
-
-
-    ec_fp_mult_mod #(
-      .P             ( P  ),
-      .KARATSUBA_LVL ( 2  ),
-      .CTL_BITS      ( 16 )
-    )
-    ec_fp_mult_mod (
-      .i_clk( i_clk ),
-      .i_rst( i_rst ),
-      .i_dat_a ( mult_in_if[2].dat[0 +: DAT_BITS]        ),
-      .i_dat_b ( mult_in_if[2].dat[DAT_BITS +: DAT_BITS] ),
-      .i_val ( mult_in_if[2].val  ),
-      .i_ctl ( mult_in_if[2].ctl  ),
-      .o_rdy ( mult_in_if[2].rdy  ),
-      .o_dat ( mult_out_if[2].dat ),
-      .i_rdy ( mult_out_if[2].rdy ),
-      .o_val ( mult_out_if[2].val ),
-      .o_ctl ( mult_out_if[2].ctl )
-    );
-
-    adder_pipe # (
-      .P        ( P   ),
-      .CTL_BITS ( 16  ),
-      .LEVEL    ( 2   )
-    )
-    adder_pipe (
-      .i_clk( i_clk ),
-      .i_rst( i_rst ),
-      .i_dat_a ( add_in_if[2].dat[0 +: DAT_BITS]        ),
-      .i_dat_b ( add_in_if[2].dat[DAT_BITS +: DAT_BITS] ),
-      .i_ctl ( add_in_if[2].ctl ),
-      .i_val ( add_in_if[2].val  ),
-      .o_rdy ( add_in_if[2].rdy  ),
-      .o_dat ( add_out_if[2].dat ),
-      .o_val ( add_out_if[2].val ),
-      .o_ctl ( add_out_if[2].ctl ),
-      .i_rdy ( add_out_if[2].rdy )
-    );
-
-    subtractor_pipe # (
-      .P        ( P   ),
-      .CTL_BITS ( 16  ),
-      .LEVEL    ( 2   )
-    )
-    subtractor_pipe (
-      .i_clk( i_clk ),
-      .i_rst( i_rst ),
-      .i_dat_a ( sub_in_if[2].dat[0 +: DAT_BITS]        ),
-      .i_dat_b ( sub_in_if[2].dat[DAT_BITS +: DAT_BITS] ),
-      .i_ctl ( sub_in_if[2].ctl ),
-      .i_val ( sub_in_if[2].val  ),
-      .o_rdy ( sub_in_if[2].rdy  ),
-      .o_dat ( sub_out_if[2].dat ),
-      .o_val ( sub_out_if[2].val ),
-      .o_ctl ( sub_out_if[2].ctl ),
-      .i_rdy ( sub_out_if[2].rdy )
-    );
-
-  end
-endgenerate
-
-endmodule
--- a/ip_cores/ec/src/rtl/ec_point_mult.sv
+++ b/ip_cores/ec/src/rtl/ec_point_mult.sv
@ -0,0 +1,165 @@
+/*
+  This performs point multiplication. We use the standard double
+  and add algorithm, with some look ahead so we can perform
+  adds or doubles as early as possible.
+
+  A possible optimization would be to use a NAF (non-adjacent form) encoding of the scalar.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_point_mult
+#(
+  parameter      P,                    // Only used here to derive the default DAT_BITS
+  parameter type FP_TYPE,              // Point type; $bits(FP_TYPE) is the width of one point operand
+  parameter      DAT_BITS = $clog2(P)  // Width of the scalar register k_l
+)(
+  input i_clk, i_rst,
+  // Point in on i_pt_mult.dat with the scalar on i_pt_mult.ctl, resulting point out on o_pt_mult.dat
+  if_axi_stream.source o_pt_mult,
+  if_axi_stream.sink   i_pt_mult,
+  // Interface to point adder / doubler
+  if_axi_stream.source o_dbl,
+  if_axi_stream.sink   i_dbl,
+  if_axi_stream.source o_add,
+  if_axi_stream.sink   i_add
+);
+// The running sum is held in o_add.dat[0 +: $bits(FP_TYPE)], the running double in o_dbl.dat.
+// k_l holds the scalar bits still to be processed and shifts right once per round.
+logic [DAT_BITS-1:0] k_l;
+logic p_dbl_done, p_add_done, special_dbl, lookahead_dbl;
+
+enum {IDLE, DOUBLE_ADD, FINISHED} state;
+// The second adder operand (upper half of o_add.dat) always mirrors the current doubled point
+always_comb begin
+  o_add.dat[$bits(FP_TYPE) +: $bits(FP_TYPE)] = o_dbl.dat;
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_dbl.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_add.val <= 0;  // o_add is reset field by field; its lower dat half is only cleared in IDLE
+    o_add.sop <= 1;
+    o_add.eop <= 1;
+    o_add.err <= 0;
+    o_add.ctl <= 0;
+    o_add.mod <= 0;
+    o_pt_mult.copy_if(0, 0, 1, 1, 0, 0, 0);
+    i_add.rdy <= 0;
+    i_dbl.rdy <= 0;
+    i_pt_mult.rdy <= 0;
+    k_l <= 0;
+    state <= IDLE;
+    p_dbl_done <= 0;
+    p_add_done <= 0;
+    special_dbl <= 0;
+    lookahead_dbl <= 0;
+  end else begin
+    // Control FSM: IDLE (load operands) -> DOUBLE_ADD (one round per scalar bit) -> FINISHED (hand over result)
+    case (state)
+      {IDLE}: begin
+        i_add.rdy <= 1;
+        i_dbl.rdy <= 1;
+        p_dbl_done <= 1;  // Both marked done so the first round is issued on entry to DOUBLE_ADD
+        p_add_done <= 1;
+        special_dbl <= 0;
+        lookahead_dbl <= 0;
+        i_pt_mult.rdy <= 1;
+        o_pt_mult.err <= 0;
+        o_add.dat[0 +: $bits(FP_TYPE)] <= 0;  // Running sum starts at 0
+        o_dbl.dat <= i_pt_mult.dat;           // Running double starts as the input point
+        k_l <= i_pt_mult.ctl;                 // Scalar arrives on the control bus
+        if (i_pt_mult.rdy && i_pt_mult.val) begin
+          i_pt_mult.rdy <= 0;
+          state <= DOUBLE_ADD;
+        end
+      end
+      {DOUBLE_ADD}: begin
+        if (o_dbl.val && o_dbl.rdy) o_dbl.val <= 0;  // Drop each request once it has been accepted
+        if (o_add.val && o_add.rdy) o_add.val <= 0;
+
+        if (i_dbl.val && i_dbl.rdy) begin
+          p_dbl_done <= 1;
+          if (special_dbl) begin  // The add was skipped this round, so the doubled point becomes the sum
+            o_add.dat[0 +: $bits(FP_TYPE)] <= i_dbl.dat;
+            special_dbl <= 0;
+          end
+
+          o_dbl.dat <= i_dbl.dat;
+          // We can look ahead and start the next double
+          if ((k_l >> 1) != 0 && ~lookahead_dbl && ~p_add_done) begin
+            o_dbl.val <= 1;
+            lookahead_dbl <= 1;
+            i_dbl.rdy <= 0; // Want to make sure we don't output while still waiting for add
+          end
+        end
+        if (i_add.val && i_add.rdy) begin
+          p_add_done <= 1;
+          o_add.dat[0 +: $bits(FP_TYPE)] <= i_add.dat;  // Latch the new running sum
+        end
+
+        // Update variables and issue new commands
+        if (p_add_done && p_dbl_done) begin
+          lookahead_dbl <= 0;
+          i_dbl.rdy <= 1;
+          p_add_done <= 0;
+          p_dbl_done <= 0;
+          k_l <= k_l >> 1;
+          if (k_l[0]) begin
+            o_add.val <= 1;
+            // Need to check for special case where the point coords are the same
+            if (o_add.dat[0 +: $bits(FP_TYPE)] == o_dbl.dat) begin
+              special_dbl <= 1;
+              o_add.val <= 0;
+              p_add_done <= 1;
+            end
+          end else begin
+            p_add_done <= 1;  // Current bit is 0, nothing to add this round
+          end
+
+          // Don't need to double on the final bit
+          if ((k_l >> 1) != 0)
+            o_dbl.val <= ~lookahead_dbl; // Don't do if we already started
+          else
+            p_dbl_done <= 1;
+          // Scalar exhausted: return the registered running sum (still 0 if the scalar was 0), not the live adder bus
+          if (k_l == 0) begin
+            state <= FINISHED;
+            o_pt_mult.dat <= o_add.dat[0 +: $bits(FP_TYPE)];
+            o_pt_mult.val <= 1;
+            o_dbl.val <= 0;
+            o_add.val <= 0;
+          end
+        end
+
+      end
+      {FINISHED}: begin
+        if (o_pt_mult.rdy && o_pt_mult.val) begin  // Result accepted downstream
+          o_pt_mult.val <= 0;
+          state <= IDLE;
+        end
+      end
+    endcase
+    // An error from either unit (sampled every cycle, not qualified by val) ends the operation with err set
+    if (i_dbl.err || i_add.err) begin
+      o_pt_mult.err <= 1;
+      o_pt_mult.val <= 1;
+      state <= FINISHED;
+    end
+
+  end
+end
+endmodule
--- a/ip_cores/ec/src/tb/ec_fp2_point_add_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_add_tb.sv
@ -91,24 +91,6 @@ ec_fp2_point_add (
  .i_sub_if ( sub_out_if )
 );

-always_comb begin
-  mult_out_if.sop = 1;
-  mult_out_if.eop = 1;
-  mult_out_if.err = 0;
-  mult_out_if.mod = 1;
-
-  add_out_if.sop = 1;
-  add_out_if.eop = 1;
-  add_out_if.err = 0;
-  add_out_if.mod = 1;
-
-  sub_out_if.sop = 1;
-  sub_out_if.eop = 1;
-  sub_out_if.err = 0;
-  sub_out_if.mod = 1;
-end
-
-
 // Attach a mod reduction unit and multiply - mod unit
 ec_fp_mult_mod #(
  .P             ( P   ),
@ -118,15 +100,8 @@ ec_fp_mult_mod #(
 ec_fp_mult_mod (
  .i_clk( clk         ),
  .i_rst( rst         ),
-  .i_dat_a ( mult_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( mult_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_val ( mult_in_if.val ),
-  .i_ctl ( mult_in_if.ctl ),
-  .o_rdy ( mult_in_if.rdy ),
-  .o_dat ( mult_out_if.dat ),
-  .i_rdy ( mult_out_if.rdy ),
-  .o_val ( mult_out_if.val ),
-  .o_ctl ( mult_out_if.ctl )
+  .i_mul ( mult_in_if  ),
+  .o_mul ( mult_out_if )
 );

 adder_pipe # (
@ -138,15 +113,8 @@ adder_pipe # (
 adder_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( add_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( add_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( add_in_if.ctl ),
-  .i_val ( add_in_if.val  ),
-  .o_rdy ( add_in_if.rdy  ),
-  .o_dat ( add_out_if.dat ),
-  .o_val ( add_out_if.val ),
-  .o_ctl ( add_out_if.ctl ),
-  .i_rdy ( add_out_if.rdy )
+  .i_add ( add_in_if  ),
+  .o_add ( add_out_if )
 );

 subtractor_pipe # (
@ -158,15 +126,8 @@ subtractor_pipe # (
 subtractor_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( sub_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( sub_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( sub_in_if.ctl ),
-  .i_val ( sub_in_if.val  ),
-  .o_rdy ( sub_in_if.rdy  ),
-  .o_dat ( sub_out_if.dat ),
-  .o_val ( sub_out_if.val ),
-  .o_ctl ( sub_out_if.ctl ),
-  .i_rdy ( sub_out_if.rdy )
+  .i_sub ( sub_in_if  ),
+  .o_sub ( sub_out_if )
 );

 task test(input fp2_jb_point_t p1, p2, p_exp);
--- a/ip_cores/ec/src/tb/ec_fp2_point_dbl_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_dbl_tb.sv
@ -89,25 +89,6 @@ ec_fp2_point_dbl (
  .i_sub_if ( sub_out_if )
 );

-always_comb begin
-  mult_out_if.sop = 1;
-  mult_out_if.eop = 1;
-  mult_out_if.err = 0;
-  mult_out_if.mod = 1;
-
-  add_out_if.sop = 1;
-  add_out_if.eop = 1;
-  add_out_if.err = 0;
-  add_out_if.mod = 1;
-
-  sub_out_if.sop = 1;
-  sub_out_if.eop = 1;
-  sub_out_if.err = 0;
-  sub_out_if.mod = 1;
-end
-
-
-// Attach a mod reduction unit and multiply - mod unit
 ec_fp_mult_mod #(
  .P             ( P   ),
  .KARATSUBA_LVL ( 3   ),
@ -116,15 +97,8 @@ ec_fp_mult_mod #(
 ec_fp_mult_mod (
  .i_clk( clk         ),
  .i_rst( rst         ),
-  .i_dat_a ( mult_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( mult_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_val ( mult_in_if.val ),
-  .i_ctl ( mult_in_if.ctl ),
-  .o_rdy ( mult_in_if.rdy ),
-  .o_dat ( mult_out_if.dat ),
-  .i_rdy ( mult_out_if.rdy ),
-  .o_val ( mult_out_if.val ),
-  .o_ctl ( mult_out_if.ctl )
+  .i_mul ( mult_in_if  ),
+  .o_mul ( mult_out_if )
 );

 adder_pipe # (
@ -136,15 +110,8 @@ adder_pipe # (
 adder_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( add_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( add_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( add_in_if.ctl ),
-  .i_val ( add_in_if.val  ),
-  .o_rdy ( add_in_if.rdy  ),
-  .o_dat ( add_out_if.dat ),
-  .o_val ( add_out_if.val ),
-  .o_ctl ( add_out_if.ctl ),
-  .i_rdy ( add_out_if.rdy )
+  .i_add ( add_in_if  ),
+  .o_add ( add_out_if )
 );

 subtractor_pipe # (
@ -156,15 +123,8 @@ subtractor_pipe # (
 subtractor_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( sub_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( sub_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( sub_in_if.ctl ),
-  .i_val ( sub_in_if.val  ),
-  .o_rdy ( sub_in_if.rdy  ),
-  .o_dat ( sub_out_if.dat ),
-  .o_val ( sub_out_if.val ),
-  .o_ctl ( sub_out_if.ctl ),
-  .i_rdy ( sub_out_if.rdy )
+  .i_sub ( sub_in_if  ),
+  .o_sub ( sub_out_if )
 );

 task test(input fp2_jb_point_t p1, p_exp);
--- a/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
@ -0,0 +1,267 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+module ec_fp2_point_mult_tb ();
+import common_pkg::*;
+import bls12_381_pkg::*;
+
+localparam CLK_PERIOD = 1000; // Clock period in timescale units; the clock generator below toggles every CLK_PERIOD/2
+
+logic clk, rst;
+
+parameter type FP_TYPE  = bls12_381_pkg::fp2_jb_point_t; // Point type carried on in_if / out_if and used by the reference model
+parameter type FE_TYPE  = bls12_381_pkg::fe_t;           // Element type that sizes the shared mult / add / sub buses
+parameter type FE2_TYPE = bls12_381_pkg::fe2_t;          // Handed to the Fp2 point add / double units as FE2_TYPE
+parameter KEY_BITS      = bls12_381_pkg::DAT_BITS;       // Width of the scalar k taken by test() and of in_if's ctl field
+parameter P             = bls12_381_pkg::P;              // Modulus passed to ec_point_mult and the arithmetic units
+
+// Reference-model hooks used by test(). Each macro expands to a bare call / identifier
+// with no trailing ';' - the call site supplies its own statement terminator, so the
+// macros no longer produce ';;' and can also be used inside an expression.
+`define MULT_FUNC(K, IN_POINT) fp2_point_mult(K, IN_POINT)
+`define PRINT_FUNC(IN_POINT)   print_fp2_jb_point(IN_POINT)
+`define G_POINT                bls12_381_pkg::g2_point
+
+if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8), .CTL_BITS(KEY_BITS)) in_if(clk); // Stimulus stream into ec_point_mult; ctl is KEY_BITS wide (test() passes k as the last put_stream argument)
+if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8)) out_if(clk); // Result stream from ec_point_mult, read back by test()
+
+
+
+if_axi_stream #(.DAT_BITS(2*$bits(FP_TYPE))) add_i_if(clk); // ec_point_mult -> point adder; wide enough for two points (sliced into i_p1 / i_p2 below)
+if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) add_o_if(clk); // point adder -> ec_point_mult
+if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) dbl_i_if(clk); // ec_point_mult -> point doubler
+if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) dbl_o_if(clk); // point doubler -> ec_point_mult
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) mult_in_if [2:0] (clk) ; // Index 0: point add, 1: point double, 2: shared multiplier input
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) mult_out_if [2:0](clk); // Same indexing, multiplier results
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) add_in_if [2:0] (clk); // Same indexing, adder operands
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) add_out_if [2:0] (clk); // Same indexing, adder results
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) sub_in_if [2:0] (clk); // Same indexing, subtractor operands
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) sub_out_if [2:0] (clk); // Same indexing, subtractor results
+
+initial begin // Reset pulse: rst starts low, is driven high after 20 clock periods and low again after 40
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin // Free-running clock with period CLK_PERIOD
+  clk = 0;
+  forever #(CLK_PERIOD/2) clk = ~clk;
+end
+
+always_comb begin // Constant sideband values assigned on the result stream
+  out_if.sop = 1;
+  out_if.eop = 1;
+  out_if.ctl = 0;
+  out_if.mod = 0;
+end
+
+// Check for errors: whenever the result stream presents a beat flagged .err, report it
+// and raise out_if.rdy so the beat is accepted.
+// NOTE(review): out_if.rdy is also written by the stimulus initial block; confirm the
+// simulator in use accepts a second writer on a variable assigned from always_ff.
+always_ff @ (posedge clk)
+  if (out_if.val && out_if.err) begin
+    out_if.rdy = 1;
+    // $error takes only a format string plus arguments; a leading finish number is a
+    // $fatal-only argument and would otherwise be printed as part of the message.
+    $error("%m %t ERROR: output .err asserted", $time);
+  end
+
+ec_point_mult #( // Point-multiplication block exercised by this testbench
+  .P       ( P ),
+  .FP_TYPE ( FP_TYPE )
+)
+ec_point_mult (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .o_pt_mult ( out_if ), // Result stream compared against the reference model in test()
+  .i_pt_mult ( in_if  ), // Stimulus stream driven by test()
+  // Interface to point adder / doubler
+  .o_dbl ( dbl_i_if ),
+  .i_dbl ( dbl_o_if ),
+  .o_add ( add_i_if ),
+  .i_add ( add_o_if )
+);
+
+ec_fp2_point_add #( // Fp2 point adder serving ec_point_mult's add requests
+  .FP2_TYPE ( FP_TYPE  ),
+  .FE_TYPE  ( FE_TYPE  ),
+  .FE2_TYPE ( FE2_TYPE )
+)
+ec_fp2_point_add (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+    // Input points: the two operands are the lower and upper halves of add_i_if.dat
+  .i_p1  ( add_i_if.dat[0 +: $bits(FP_TYPE)]              ),
+  .i_p2  ( add_i_if.dat[$bits(FP_TYPE) +: $bits(FP_TYPE)] ),
+  .i_val ( add_i_if.val ),
+  .o_rdy ( add_i_if.rdy ),
+  .o_p   ( add_o_if.dat ),
+  .o_err ( add_o_if.err ),
+  .i_rdy ( add_o_if.rdy ),
+  .o_val ( add_o_if.val ) ,
+  .o_mul_if ( mult_in_if[0] ), // Index 0 of every arithmetic bus belongs to the point adder
+  .i_mul_if ( mult_out_if[0] ),
+  .o_add_if ( add_in_if[0] ),
+  .i_add_if ( add_out_if[0] ),
+  .o_sub_if ( sub_in_if[0] ),
+  .i_sub_if ( sub_out_if[0] )
+);
+
+ec_fp2_point_dbl #( // Fp2 point doubler serving ec_point_mult's double requests
+ .FP2_TYPE ( FP_TYPE  ),
+ .FE_TYPE  ( FE_TYPE  ),
+ .FE2_TYPE ( FE2_TYPE )
+)
+ec_fp2_point_dbl (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_p  ( dbl_i_if.dat),
+  .i_val ( dbl_i_if.val ),
+  .o_rdy ( dbl_i_if.rdy ),
+  .o_p   ( dbl_o_if.dat ),
+  .o_err ( dbl_o_if.err ),
+  .i_rdy ( dbl_o_if.rdy ),
+  .o_val ( dbl_o_if.val ) ,
+  .o_mul_if ( mult_in_if[1] ), // Index 1 of every arithmetic bus belongs to the point doubler
+  .i_mul_if ( mult_out_if[1] ),
+  .o_add_if ( add_in_if[1] ),
+  .i_add_if ( add_out_if[1] ),
+  .o_sub_if ( sub_in_if[1] ),
+  .i_sub_if ( sub_out_if[1] )
+);
+
+resource_share # ( // Connects the two multiplier users (index 0 and 1) to the single multiplier at index 2
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 12 ), // NOTE(review): presumably the ctl bit position resource_share uses to tag the requester - confirm against resource_share
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_mul (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( mult_in_if[1:0]  ),
+  .o_res ( mult_in_if[2]    ),
+  .i_res ( mult_out_if[2]   ),
+  .o_axi ( mult_out_if[1:0] )
+);
+
+resource_share # ( // Same arrangement for the subtractor
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 12 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_sub (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( sub_in_if[1:0] ),
+  .o_res ( sub_in_if[2] ),
+  .i_res ( sub_out_if[2] ),
+  .o_axi ( sub_out_if[1:0] )
+);
+
+resource_share # ( // Same arrangement for the adder
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 12 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_add (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( add_in_if[1:0] ),
+  .o_res ( add_in_if[2] ),
+  .i_res ( add_out_if[2] ),
+  .o_axi ( add_out_if[1:0] )
+);
+
+ec_fp_mult_mod #( // Single multiplier instance, attached to index 2 of the mult buses
+  .P             ( P   ),
+  .KARATSUBA_LVL ( 3   ),
+  .CTL_BITS      ( 16  ) // Matches CTL_BITS of the mult_in_if / mult_out_if declarations
+)
+ec_fp_mult_mod (
+  .i_clk( clk         ),
+  .i_rst( rst         ),
+  .i_mul ( mult_in_if[2] ),
+  .o_mul ( mult_out_if[2] )
+);
+
+adder_pipe # ( // Single adder instance, attached to index 2 of the add buses
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+adder_pipe (
+  .i_clk ( clk           ),
+  .i_rst ( rst           ),
+  .i_add ( add_in_if[2]  ),
+  .o_add ( add_out_if[2] )
+);
+
+subtractor_pipe # ( // Single subtractor instance, attached to index 2 of the sub buses
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+subtractor_pipe (
+  .i_clk ( clk           ),
+  .i_rst ( rst           ),
+  .i_sub ( sub_in_if[2]  ),
+  .o_sub ( sub_out_if[2] )
+);
+
+// Test a point: send the generator point with scalar k into the DUT, read the result
+// stream back and compare it with the software reference model. A mismatch ends the
+// simulation through $fatal; otherwise the elapsed clock count is printed.
+task test(input logic [KEY_BITS-1:0] k);
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  // $time is 64 bits wide. A 32-bit integer would wrap after 2^31 time units (roughly
+  // 2.1M clocks at CLK_PERIOD = 1000), which long scalar multiplications can approach.
+  time start_time, finish_time;
+  FP_TYPE  p_out, p_exp;
+  $display("Running test with k= %d", k);
+  p_exp = `MULT_FUNC(k, `G_POINT);
+  start_time = $time;
+  // Drive the input and collect the output concurrently; join waits for both.
+  fork
+    in_if.put_stream(`G_POINT, ($bits(FP_TYPE)+7)/8, k);
+    out_if.get_stream(get_dat, get_len);
+  join
+  finish_time = $time;
+
+  // Only the low $bits(FP_TYPE) bits of the receive buffer are kept.
+  p_out = get_dat;
+
+  $display("Expected:");
+  `PRINT_FUNC(p_exp);
+  $display("Was:");
+  `PRINT_FUNC(p_out);
+
+  if (p_exp != p_out) begin
+    $fatal(1, "%m %t ERROR: output was wrong", $time);
+  end
+
+  $display("test PASSED in %d clocks", (finish_time-start_time)/CLK_PERIOD);
+end
+endtask;
+
+// Scratch scalar for the (currently disabled) large-key tests. Sized from KEY_BITS so
+// it always matches the argument width of test().
+logic [KEY_BITS-1:0] in_k;
+
+// Stimulus: idle both streams, wait 40 clock periods (the length of the reset
+// sequence above), then run the tests.
+initial begin
+  out_if.rdy = 0;
+  in_if.val = 0;
+  #(40*CLK_PERIOD);
+   test(4);
+   in_k = P-1;
+   //test(381'haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);
+   //test(in_k);
+
+  #1us $finish();
+end
+endmodule
--- a/ip_cores/ec/src/tb/ec_fp_point_dbl_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_dbl_tb.sv
@ -97,17 +97,10 @@ ec_fp_mult_mod #(
  .CTL_BITS      ( 8   )
 )
 ec_fp_mult_mod (
-  .i_clk( clk         ),
-  .i_rst( rst         ),
-  .i_dat_a ( mult_in_if.dat[0 +: 381] ),
-  .i_dat_b ( mult_in_if.dat[381 +: 381] ),
-  .i_val ( mult_in_if.val ),
-  .i_ctl ( mult_in_if.ctl ),
-  .o_rdy ( mult_in_if.rdy ),
-  .o_dat ( mult_out_if.dat ),
-  .i_rdy ( mult_out_if.rdy ),
-  .o_val ( mult_out_if.val ),
-  .o_ctl ( mult_out_if.ctl )
+  .i_clk( clk          ),
+  .i_rst( rst          ),
+  .i_mul ( mult_in_if  ),
+  .o_mul ( mult_out_if )
 );

 adder_pipe # (
@ -119,15 +112,8 @@ adder_pipe # (
 adder_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( add_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( add_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( add_in_if.ctl ),
-  .i_val ( add_in_if.val  ),
-  .o_rdy ( add_in_if.rdy  ),
-  .o_dat ( add_out_if.dat ),
-  .o_val ( add_out_if.val ),
-  .o_ctl ( add_out_if.ctl ),
-  .i_rdy ( add_out_if.rdy )
+  .i_add ( add_in_if  ),
+  .o_add ( add_out_if )
 );

 subtractor_pipe # (
@ -139,33 +125,10 @@ subtractor_pipe # (
 subtractor_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( sub_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( sub_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( sub_in_if.ctl ),
-  .i_val ( sub_in_if.val  ),
-  .o_rdy ( sub_in_if.rdy  ),
-  .o_dat ( sub_out_if.dat ),
-  .o_val ( sub_out_if.val ),
-  .o_ctl ( sub_out_if.ctl ),
-  .i_rdy ( sub_out_if.rdy )
+  .i_sub ( sub_in_if  ),
+  .o_sub ( sub_out_if )
 );

-always_comb begin
-  mult_out_if.sop = 1;
-  mult_out_if.eop = 1;
-  mult_out_if.err = 0;
-  mult_out_if.mod = 1;
-
-  add_out_if.sop = 1;
-  add_out_if.eop = 1;
-  add_out_if.err = 0;
-  add_out_if.mod = 1;
-
-  sub_out_if.sop = 1;
-  sub_out_if.eop = 1;
-  sub_out_if.err = 0;
-  sub_out_if.mod = 1;
-end

 task test_0();
 begin
--- a/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
@ -24,18 +24,31 @@ localparam CLK_PERIOD = 1000;

 logic clk, rst;

-if_axi_stream #(.DAT_BYTS(384*3/8)) in_if(clk);
-if_axi_stream #(.DAT_BYTS(384*3/8)) out_if(clk);
+parameter type FP_TYPE = bls12_381_pkg::jb_point_t;
+parameter type FE_TYPE = bls12_381_pkg::fe_t;
+parameter KEY_BITS     = bls12_381_pkg::DAT_BITS;
+parameter P            = bls12_381_pkg::P;

-if_axi_stream #(.DAT_BYTS(384*2/8), .CTL_BITS(16)) mult_in_if(clk);
-if_axi_stream #(.DAT_BYTS(384/8), .CTL_BITS(16)) mult_out_if(clk);
-if_axi_stream #(.DAT_BYTS(384*2/8), .CTL_BITS(16)) add_in_if(clk);
-if_axi_stream #(.DAT_BYTS(384/8), .CTL_BITS(16)) add_out_if(clk);
-if_axi_stream #(.DAT_BYTS(384*2/8), .CTL_BITS(16)) sub_in_if(clk);
-if_axi_stream #(.DAT_BYTS(384/8), .CTL_BITS(16)) sub_out_if(clk);
+`define MULT_FUNC(K, IN_POINT) point_mult(K, IN_POINT);
+`define PRINT_FUNC(IN_POINT)   print_jb_point(IN_POINT);
+`define G_POINT                bls12_381_pkg::g_point
+
+if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8), .CTL_BITS(KEY_BITS)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8)) out_if(clk);


-logic [DAT_BITS-1:0] k_in;
+
+if_axi_stream #(.DAT_BITS(2*$bits(FP_TYPE))) add_i_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) add_o_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) dbl_i_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) dbl_o_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) mult_in_if [2:0] (clk) ;
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) mult_out_if [2:0](clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) add_in_if [2:0] (clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) add_out_if [2:0] (clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) sub_in_if [2:0] (clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) sub_out_if [2:0] (clk);

 initial begin
  rst = 0;
@ -61,78 +74,172 @@ always_ff @ (posedge clk)
    $error(1, "%m %t ERROR: output .err asserted", $time);
  end

-always_comb begin
-  mult_out_if.sop = 1;
-  mult_out_if.eop = 1;
-  mult_out_if.val = 0;
-  mult_out_if.mod = 0;
-  mult_in_if.rdy = 1;
-  add_out_if.sop = 1;
-  add_out_if.eop = 1;
-  add_out_if.val = 0;
-  add_out_if.mod = 0;
-  add_in_if.rdy = 1;
-  sub_out_if.sop = 1;
-  sub_out_if.eop = 1;
-  sub_out_if.val = 0;
-  sub_out_if.mod = 0;
-  sub_in_if.rdy = 1;
-end
-
-
-ec_fp_point_mult #(
-  .P          ( P ),
-  .POINT_TYPE ( jb_point_t ),
-  .DAT_BITS   ( DAT_BITS   ),
-  .RESOURCE_SHARE ("NO")
+ec_point_mult #(
+  .P       ( P ),
+  .FP_TYPE ( FP_TYPE )
 )
-ec_fp_point_mult (
+ec_point_mult (
  .i_clk ( clk ),
  .i_rst ( rst ),
-  .i_p   ( in_if.dat  ),
-  .i_k   ( k_in       ),
-  .i_val ( in_if.val  ),
-  .o_rdy ( in_if.rdy  ),
-  .o_p   ( out_if.dat ),
-  .i_rdy ( out_if.rdy ),
-  .o_val ( out_if.val ),
-  .o_err ( out_if.err ),
-  .o_mult_if ( mult_in_if ),
-  .i_mult_if ( mult_out_if ),
-  .o_add_if ( add_in_if ),
-  .i_add_if ( add_out_if ),
-  .o_sub_if ( sub_in_if ),
-  .i_sub_if ( sub_out_if ),  
-  .i_p2_val ( 0),
-  .i_p2 ( 0 )
+  .o_pt_mult ( out_if ),
+  .i_pt_mult ( in_if  ),
+  // Interface to point adder / doubler
+  .o_dbl ( dbl_i_if ),
+  .i_dbl ( dbl_o_if ),
+  .o_add ( add_i_if ),
+  .i_add ( add_o_if )
 );

+ec_point_add #(
+  .FP_TYPE ( FP_TYPE ),
+  .FE_TYPE ( FE_TYPE )
+)
+ec_point_add (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+    // Input points
+  .i_p1  ( add_i_if.dat[0 +: $bits(FP_TYPE)]              ),
+  .i_p2  ( add_i_if.dat[$bits(FP_TYPE) +: $bits(FP_TYPE)] ),
+  .i_val ( add_i_if.val ),
+  .o_rdy ( add_i_if.rdy ),
+  .o_p   ( add_o_if.dat ),
+  .o_err ( add_o_if.err ),
+  .i_rdy ( add_o_if.rdy ),
+  .o_val ( add_o_if.val ) ,
+  .o_mul_if ( mult_in_if[0] ),
+  .i_mul_if ( mult_out_if[0] ),
+  .o_add_if ( add_in_if[0] ),
+  .i_add_if ( add_out_if[0] ),
+  .o_sub_if ( sub_in_if[0] ),
+  .i_sub_if ( sub_out_if[0] )
+);
+
+ec_point_dbl #(
+  .FP_TYPE ( FP_TYPE ),
+  .FE_TYPE ( FE_TYPE )
+)
+ec_point_dbl (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_p  ( dbl_i_if.dat),
+  .i_val ( dbl_i_if.val ),
+  .o_rdy ( dbl_i_if.rdy ),
+  .o_p   ( dbl_o_if.dat ),
+  .o_err ( dbl_o_if.err ),
+  .i_rdy ( dbl_o_if.rdy ),
+  .o_val ( dbl_o_if.val ) ,
+  .o_mul_if ( mult_in_if[1] ),
+  .i_mul_if ( mult_out_if[1] ),
+  .o_add_if ( add_in_if[1] ),
+  .i_add_if ( add_out_if[1] ),
+  .o_sub_if ( sub_in_if[1] ),
+  .i_sub_if ( sub_out_if[1] )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 12 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_mul (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( mult_in_if[1:0]  ),
+  .o_res ( mult_in_if[2]    ),
+  .i_res ( mult_out_if[2]   ),
+  .o_axi ( mult_out_if[1:0] )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 12 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_sub (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( sub_in_if[1:0] ),
+  .o_res ( sub_in_if[2] ),
+  .i_res ( sub_out_if[2] ),
+  .o_axi ( sub_out_if[1:0] )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 12 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_add (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( add_in_if[1:0] ),
+  .o_res ( add_in_if[2] ),
+  .i_res ( add_out_if[2] ),
+  .o_axi ( add_out_if[1:0] )
+);
+
+ec_fp_mult_mod #(
+  .P             ( P   ),
+  .KARATSUBA_LVL ( 3   ),
+  .CTL_BITS      ( 16  )
+)
+ec_fp_mult_mod (
+  .i_clk( clk         ),
+  .i_rst( rst         ),
+  .i_mul ( mult_in_if[2] ),
+  .o_mul ( mult_out_if[2] )
+);
+
+adder_pipe # (
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+adder_pipe (
+  .i_clk ( clk           ),
+  .i_rst ( rst           ),
+  .i_add ( add_in_if[2]  ),
+  .o_add ( add_out_if[2] )
+);
+
+subtractor_pipe # (
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+subtractor_pipe (
+  .i_clk ( clk           ),
+  .i_rst ( rst           ),
+  .i_sub ( sub_in_if[2]  ),
+  .o_sub ( sub_out_if[2] )
+);

 // Test a point
-task test(input logic [DAT_BITS-1:0] k);
+task test(input logic [KEY_BITS-1:0] k);
 begin
  integer signed get_len;
  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
  integer start_time, finish_time;
-  jb_point_t  p_out, p_exp;
-  $display("Running test with k= %d ...", k);
-  p_exp = point_mult(k, g_point);
-  k_in = k;
+  FP_TYPE  p_out, p_exp;
+  $display("Running test with k= %d", k);
+  p_exp = `MULT_FUNC(k, `G_POINT);
  start_time = $time;
  fork
-    in_if.put_stream(g_point, 384*3/8);
+    in_if.put_stream(`G_POINT, ($bits(FP_TYPE)+7)/8, k);
    out_if.get_stream(get_dat, get_len);
  join
  finish_time = $time;

  p_out = get_dat;

-
  $display("Expected:");
-  print_jb_point(p_exp);
+  `PRINT_FUNC(p_exp);
  $display("Was:");
-  print_jb_point(p_out);
-  
+  `PRINT_FUNC(p_out);
+
  if (p_exp != p_out) begin
    $fatal(1, "%m %t ERROR: output was wrong", $time);
  end
@ -141,19 +248,17 @@ begin
 end
 endtask;

-jb_point_t point;
+logic [380:0] in_k;

 initial begin
  out_if.rdy = 0;
  in_if.val = 0;
  #(40*CLK_PERIOD);

-  /*test(381'h1);
-  test(381'h5);
-  test(381'h10);*/
-  //test(1 << 380);
-  test(381'h9f5193de96ab6e65e7c7df8adcec4e82b971dd5f54d1c62103776d3eef0255ae346eba9e29eb08c3a957e9a53afc3ce);
-    
+   in_k = P-1;
+   test(381'haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);
+   test(in_k);
+   
  #1us $finish();
 end
 endmodule
--- a/ip_cores/ec/src/tb/ec_point_add_tb.sv
+++ b/ip_cores/ec/src/tb/ec_point_add_tb.sv
@ -90,22 +90,6 @@ ec_point_add (
  .i_sub_if ( sub_out_if )
 );

-always_comb begin
-  mult_out_if.sop = 1;
-  mult_out_if.eop = 1;
-  mult_out_if.err = 0;
-  mult_out_if.mod = 1;
-
-  add_out_if.sop = 1;
-  add_out_if.eop = 1;
-  add_out_if.err = 0;
-  add_out_if.mod = 1;
-
-  sub_out_if.sop = 1;
-  sub_out_if.eop = 1;
-  sub_out_if.err = 0;
-  sub_out_if.mod = 1;
-end


 // Attach a mod reduction unit and multiply - mod unit
@ -117,15 +101,8 @@ ec_fp_mult_mod #(
 ec_fp_mult_mod (
  .i_clk( clk         ),
  .i_rst( rst         ),
-  .i_dat_a ( mult_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( mult_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_val ( mult_in_if.val ),
-  .i_ctl ( mult_in_if.ctl ),
-  .o_rdy ( mult_in_if.rdy ),
-  .o_dat ( mult_out_if.dat ),
-  .i_rdy ( mult_out_if.rdy ),
-  .o_val ( mult_out_if.val ),
-  .o_ctl ( mult_out_if.ctl )
+  .i_mul ( mult_in_if ),
+  .o_mul ( mult_out_if )
 );

 adder_pipe # (
@ -137,15 +114,8 @@ adder_pipe # (
 adder_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( add_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( add_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( add_in_if.ctl ),
-  .i_val ( add_in_if.val  ),
-  .o_rdy ( add_in_if.rdy  ),
-  .o_dat ( add_out_if.dat ),
-  .o_val ( add_out_if.val ),
-  .o_ctl ( add_out_if.ctl ),
-  .i_rdy ( add_out_if.rdy )
+  .i_add ( add_in_if  ),
+  .o_add ( add_out_if )
 );

 subtractor_pipe # (
@ -157,15 +127,8 @@ subtractor_pipe # (
 subtractor_pipe (
  .i_clk ( clk        ),
  .i_rst ( rst        ),
-  .i_dat_a ( sub_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
-  .i_dat_b ( sub_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
-  .i_ctl ( sub_in_if.ctl ),
-  .i_val ( sub_in_if.val  ),
-  .o_rdy ( sub_in_if.rdy  ),
-  .o_dat ( sub_out_if.dat ),
-  .o_val ( sub_out_if.val ),
-  .o_ctl ( sub_out_if.ctl ),
-  .i_rdy ( sub_out_if.rdy )
+  .i_sub ( sub_in_if  ),
+  .o_sub ( sub_out_if )
 );

 task test();
@ -191,7 +154,7 @@ begin
  $display("Was:");
  print_jb_point(p_out);

-  if (p_exp != p_out) begin
+  if (~(p_exp == p_out)) begin
    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
  end

--- a/ip_cores/util/src/rtl/adder_pipe.sv
+++ b/ip_cores/util/src/rtl/adder_pipe.sv
@ -23,17 +23,10 @@ module adder_pipe # (
  parameter  CTL_BITS = 8,
  parameter  LEVEL = 1     // If LEVEL == 1 this is just an add with registered output
 ) (
-  input                       i_clk,
-  input                       i_rst,
-  input [BITS-1:0]            i_dat_a,
-  input [BITS-1:0]            i_dat_b,
-  input                       i_val,
-  input [CTL_BITS-1:0]        i_ctl,
-  input                       i_rdy,
-  output logic                o_rdy,
-  output logic                o_val,
-  output logic [CTL_BITS-1:0] o_ctl,
-  output logic [BITS-1:0]     o_dat
+  input                i_clk,
+  input                i_rst,
+  if_axi_stream.sink   i_add,
+  if_axi_stream.source o_add
 );

 // Internally we want to use a even divisor for BITS of BITS/LEVEL
@ -53,47 +46,43 @@ always_comb begin
  P_ = 0;
  P_ = P;
  carry_neg[0] = 0;
-  val[0] = i_val;
-  ctl[0] = i_ctl;
-  a[0] = i_dat_a;
-  b[0] = i_dat_b;
+  val[0] = i_add.val;
+  ctl[0] = i_add.ctl;
+  a[0] = 0;
+  a[0] = i_add.dat[0 +: BITS];
+  b[0] = 0;
+  b[0] = i_add.dat[BITS +: BITS];
  result0[0] = 0;
  result1[0] = 0;
-  o_val = val[LEVEL];
-  rdy[LEVEL] = i_rdy;
-  o_dat = carry_neg[LEVEL] ? result0[LEVEL] : result1[LEVEL];
-  o_ctl = ctl[LEVEL];
-  o_rdy = rdy[0];
-end
-
-always_comb begin
-
+  rdy[LEVEL] = o_add.rdy;
+  i_add.rdy = rdy[0];
+  o_add.copy_if_comb(carry_neg[LEVEL] ? result0[LEVEL] : result1[LEVEL], val[LEVEL], 1, 1, 1, 0, ctl[LEVEL]);
 end

 generate
 genvar g;
  for (g = 0; g < LEVEL; g++) begin: ADDER_GEN
-  
+
    logic [BITS_LEVEL:0] add_res0, add_res0_, add_res1;
    logic cn;

    always_comb begin
      rdy[g] = ~val[g+1] || (val[g+1] && rdy[g+1]);
-      add_res0 = a[g][g*BITS_LEVEL +: BITS_LEVEL] + 
-                 b[g][g*BITS_LEVEL +: BITS_LEVEL] + 
+      add_res0 = a[g][g*BITS_LEVEL +: BITS_LEVEL] +
+                 b[g][g*BITS_LEVEL +: BITS_LEVEL] +
                 result0[g][g*BITS_LEVEL];
-                 
-      add_res0_ = a[g][g*BITS_LEVEL +: BITS_LEVEL] + 
-                  b[g][g*BITS_LEVEL +: BITS_LEVEL] + 
+
+      add_res0_ = a[g][g*BITS_LEVEL +: BITS_LEVEL] +
+                  b[g][g*BITS_LEVEL +: BITS_LEVEL] +
                  result1[g][g*BITS_LEVEL];
-                 
+
      if (add_res0_ < (P_[g*BITS_LEVEL +: BITS_LEVEL] + carry_neg[g])) begin
        cn = 1;
        add_res1 = add_res0_ - P_[g*BITS_LEVEL +: BITS_LEVEL] + (1 << BITS_LEVEL) - carry_neg[g];
      end else begin
        cn = 0;
        add_res1 = add_res0_ - P_[g*BITS_LEVEL +: BITS_LEVEL] - carry_neg[g];
-      end             
+      end
    end

    always_ff @ (posedge i_clk) begin
@ -111,13 +100,13 @@ genvar g;
          ctl[g+1] <= ctl[g];
          a[g+1] <= a[g];
          b[g+1] <= b[g];
-          
+
          result0[g+1] <= result0[g];
          result0[g+1][g*BITS_LEVEL +: BITS_LEVEL + 1] <= add_res0;

          result1[g+1] <= result1[g];
          result1[g+1][g*BITS_LEVEL +: BITS_LEVEL + 1] <= add_res1;
-                    
+
          carry_neg[g+1] <= cn;
        end
      end
--- a/ip_cores/util/src/rtl/subtracter_pipe.sv
+++ b/ip_cores/util/src/rtl/subtracter_pipe.sv
@ -23,17 +23,10 @@ module subtractor_pipe # (
  parameter  CTL_BITS = 8,
  parameter  LEVEL = 1     // If LEVEL == 1 this is just an add with registered output
 ) (
-  input                       i_clk,
-  input                       i_rst,
-  input [BITS-1:0]            i_dat_a,
-  input [BITS-1:0]            i_dat_b,
-  input                       i_val,
-  input [CTL_BITS-1:0]        i_ctl,
-  input                       i_rdy,
-  output logic                o_rdy,
-  output logic                o_val,
-  output logic [CTL_BITS-1:0] o_ctl,
-  output logic [BITS-1:0]     o_dat
+  input                i_clk,
+  input                i_rst,
+  if_axi_stream.sink   i_sub,
+  if_axi_stream.source o_sub
 );

 // Internally we want to use a even divisor for BITS of BITS/LEVEL
@ -54,21 +47,18 @@ always_comb begin
  P_ = P;
  carry_neg0[0] = 0;
  carry_neg1[0] = 0;
-  val[0] = i_val;
-  ctl[0] = i_ctl;
-  a[0] = i_dat_a;
-  b[0] = i_dat_b;
+  val[0] = i_sub.val;
+  ctl[0] = i_sub.ctl;
+  a[0] = 0;
+  a[0] = i_sub.dat[0 +: BITS];
+  b[0] = 0;
+  b[0] = i_sub.dat[BITS +: BITS];
  result0[0] = 0;
  result1[0] = 0;
-  o_val = val[LEVEL];
-  rdy[LEVEL] = i_rdy;
-  o_dat = carry_neg1[LEVEL] ? result0[LEVEL] : result1[LEVEL];
-  o_ctl = ctl[LEVEL];
-  o_rdy = rdy[0];
-end
-
-always_comb begin
-
+  rdy[LEVEL] = o_sub.rdy;
+  i_sub.rdy = rdy[0];
+  o_sub.dat = carry_neg1[LEVEL] ? result0[LEVEL] : result1[LEVEL];
+  o_sub.copy_if_comb(carry_neg1[LEVEL] ? result0[LEVEL] : result1[LEVEL], val[LEVEL], 1, 1, 1, 0, ctl[LEVEL]);
 end

 generate
@ -80,10 +70,10 @@ genvar g;

    always_comb begin
      rdy[g] = ~val[g+1] || (val[g+1] && rdy[g+1]);
-      
+
      sub_res0_ = a[g][g*BITS_LEVEL +: BITS_LEVEL] + P_[g*BITS_LEVEL +: BITS_LEVEL] + result0[g][g*BITS_LEVEL];
      sub_res0__ = b[g][g*BITS_LEVEL +: BITS_LEVEL] + carry_neg0[g];
-      
+
      if (sub_res0_ < sub_res0__) begin
         cn0 = 1;
         sub_res0 = sub_res0_ - sub_res0__ + (1 << BITS_LEVEL);
@ -94,7 +84,7 @@ genvar g;

      sub_res1_ = a[g][g*BITS_LEVEL +: BITS_LEVEL] + result1[g][g*BITS_LEVEL];
      sub_res1__ = b[g][g*BITS_LEVEL +: BITS_LEVEL] + carry_neg1[g];
-      
+
      if (sub_res1_ < sub_res1__) begin
        cn1 = 1;
        sub_res1 = sub_res1_ - sub_res1__ + (1 << BITS_LEVEL);
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@ -44,7 +44,7 @@ package bls12_381_pkg;
  } jb_point_t;

  typedef struct packed {
-    fe_t c1, c0;   
+    fe_t c1, c0;
  } fe2_t;

  fe2_t G2x = '{c0:381'd352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160,
@ -136,40 +136,40 @@ package bls12_381_pkg;
     add_jb_point.y = fe_sub(A, add_jb_point.y);

   endfunction
-   
+
   function fe_t fe_add(fe_t a, b);
     logic [$bits(fe_t):0] a_, b_;
     a_ = a;
     b_ = b;
     fe_add = a_ + b_ >= P ? a_ + b_ - P : a_ + b_;
-   endfunction   
-   
+   endfunction
+
   // Fp2 addition: the c0 and c1 coefficients are summed independently with fe_add.
   function fe2_t fe2_add(fe2_t a, b);
     fe2_t sum;
     sum.c1 = fe_add(a.c1, b.c1);
     sum.c0 = fe_add(a.c0, b.c0);
     return sum;
   endfunction
-   
+
   function fe_t fe_sub(fe_t a, b);
     logic [$bits(fe_t):0] a_, b_;
     a_ = a;
     b_ = b;
     fe_sub = b_ > a_ ? a_- b_ + P : a_ - b_;
-   endfunction  
-   
+   endfunction
+
   function fe2_t fe2_sub(fe2_t a, b);
     fe2_sub.c0 = fe_sub(a.c0, b.c0);
     fe2_sub.c1 = fe_sub(a.c1, b.c1);
-   endfunction  
-  
+   endfunction
+
   function fe_t fe_mul(fe2_t a, b);
     fe_mul = (a * b) % P;
-   endfunction  
-    
+   endfunction
+
   function fe2_t fe2_mul(fe2_t a, b);
     fe2_mul.c0 = fe_sub(fe_mul(a.c0, b.c0), fe_mul(a.c1, b.c1));
     fe2_mul.c1 = fe_add(fe_mul(a.c0, b.c1), fe_mul(a.c1, b.c0));
-   endfunction  
-   
+   endfunction
+
      // Function to double point in Jacobian coordinates (for comparison in testbench)
   // Here a is 0, and we also mod the result
   function jb_point_t dbl_jb_point(input jb_point_t p);
@ -196,12 +196,12 @@ package bls12_381_pkg;
     dbl_jb_point.z = Z;
     return dbl_jb_point;
   endfunction
-   
+
   function fp2_jb_point_t dbl_fp2_jb_point(input fp2_jb_point_t p);
     fe2_t I_X, I_Y, I_Z, A, B, C, D, X, Y, Z;

     if (p.z == 0) return p;
-  
+
     I_X = p.x;
     I_Y = p.y;
     I_Z = p.z;
@ -215,52 +215,52 @@ package bls12_381_pkg;
     Y = fe2_mul(D, fe2_sub(B, X));
     Y = fe2_sub(Y, C);
     Z = fe2_mul(fe2_mul(2, I_Y), I_Z);
-  
+
     dbl_fp2_jb_point.x = X;
     dbl_fp2_jb_point.y = Y;
     dbl_fp2_jb_point.z = Z;
     return dbl_fp2_jb_point;
-   endfunction 
-   
+   endfunction
+
  function fp2_jb_point_t add_fp2_jb_point(fp2_jb_point_t p1, p2);
    fe2_t A, U1, U2, S1, S2, H, H3, R;

    if (p1.z == 0) return p2;
    if (p2.z == 0) return p1;
-  
+
    if (p1.y == p2.y && p1.x == p2.x)
      return (dbl_fp2_jb_point(p1));
-  
+
    U1 = fe2_mul(p1.x, p2.z);
    U1 = fe2_mul(U1, p2.z);
-  
+
    U2 = fe2_mul(p2.x, p1.z);
    U2 = fe2_mul(U2, p1.z);
    S1 = fe2_mul(p1.y, p2.z);
    S1 = fe2_mul(fe2_mul(S1, p2.z), p2.z);
    S2 = fe2_mul(p2.y, p1.z);
    S2 = fe2_mul(fe2_mul(S2, p1.z), p1.z);
-  
+
    H = fe2_sub(U2, U1);
    R = fe2_sub(S2, S1);
    H3 = fe2_mul(fe2_mul(H, H), H);
    A = fe2_mul(fe2_mul(fe2_mul(2, U1), H), H);
-  
+
    add_fp2_jb_point.z = fe2_mul(fe2_mul(H, p1.z), p2.z);
    add_fp2_jb_point.x = fe2_mul(R, R);
-  
+
    add_fp2_jb_point.x = fe2_sub(add_fp2_jb_point.x, H3);
    add_fp2_jb_point.x = fe2_sub(add_fp2_jb_point.x, A);
-  
+
    A = fe2_mul(fe2_mul(U1, H), H);
    A = fe2_sub(A, add_fp2_jb_point.x);
    A = fe2_mul(A, R);
    add_fp2_jb_point.y = fe2_mul(S1, H3);
-  
+
    add_fp2_jb_point.y = fe2_sub(A, add_fp2_jb_point.y);
- 
+
  endfunction
-   
+
   function jb_point_t point_mult(logic [DAT_BITS-1:0] c, jb_point_t p);
     jb_point_t result, addend;
     result = 0;
@ -275,15 +275,29 @@ package bls12_381_pkg;
     return result;
   endfunction

+   // Reference model of scalar multiplication c * p on Fp2 Jacobian points.
+   // Scans c from its least significant bit: the running point is doubled on every
+   // step and added into the accumulator whenever the current bit of c is set.
+   // The accumulator starts as all zeros; add_fp2_jb_point returns its other operand
+   // when z == 0, so the first addition simply loads the running point.
+   function fp2_jb_point_t fp2_point_mult(logic [DAT_BITS-1:0] c, fp2_jb_point_t p);
+     fp2_jb_point_t acc, running;
+     running = p;
+     acc = 0;
+     while (c > 0) begin
+       if (c[0])
+         acc = add_fp2_jb_point(acc, running);
+       running = dbl_fp2_jb_point(running);
+       c >>= 1;
+     end
+     return acc;
+   endfunction
+
   // Evaluates y^2 - x^3 - a*x*z^4 - b*z^6 for a Jacobian point p.
   // NOTE(review): although this sits in bls12_381_pkg, the constants a and b are read
   // from secp256k1_pkg - this looks like a copy/paste leftover; confirm and switch to the
   // BLS12-381 curve constants.
   // NOTE(review): no return type is declared and nothing is reduced mod P, so the value
   // callers see is presumably truncated - confirm before relying on the result.
   function on_curve(jb_point_t p);
     return (p.y*p.y - p.x*p.x*p.x - secp256k1_pkg::a*p.x*p.z*p.z*p.z*p.z - secp256k1_pkg::b*p.z*p.z*p.z*p.z*p.z*p.z);
   endfunction
-   
+
   // Inversion using extended euclidean algorithm
   function fe_t fe_inv(fe_t a, b = 1);
      fe_t u, v;
      logic [$bits(fe_t):0] x1, x2;
-      
+
      u = a; v = P;
      x1 = b; x2 = 0;
      while (u != 1 && v != 1) begin
@ -299,7 +313,7 @@ package bls12_381_pkg;
          if (x2 % 2 == 0)
            x2 = x2 / 2;
         else
-           x2 = (x2 + P) / 2;      
+           x2 = (x2 + P) / 2;
        end
        if (u >= v) begin
          u = u - v;
@ -311,15 +325,15 @@ package bls12_381_pkg;
      end
      if (u == 1)
        return x1;
-      else 
+      else
        return x2;
-   endfunction 
-   
+   endfunction
+
   // This algorithm can also be used for division.
   // NOTE(review): both arguments are forwarded to fe_inv unchanged, and fe_inv seeds u
   // with its first argument and x1 with its second, so this presumably yields
   // b * a^-1 mod P (b divided by a) rather than a / b - confirm the operand order callers expect.
   function fe_t fe_div(fe_t a, b);
     return fe_inv(a, b);
   endfunction
-   
+
   function fe2_t fe2_inv(fe2_t a);
     fe_t factor, t0, t1;
     t0 = fe_mul(a.c0, a.c0);
@ -328,7 +342,7 @@ package bls12_381_pkg;
     fe2_inv.c0 = fe_mul(a.c0, factor);
     fe2_inv.c1 = fe_mul(fe_sub(P, a.c1), factor);
   endfunction
-   
+
   function jb_point_t to_affine(jb_point_t p);
     fe_t z_;
     z_ = fe_mul(p.z, p.z);
@ -337,7 +351,7 @@ package bls12_381_pkg;
     z_ = fe_mul(z_, p.z);
     to_affine.y = fe_mul(p.y, fe_inv(z_));
   endfunction
-   
+
   function fp2_jb_point_t fp2_to_affine(fp2_jb_point_t p);
     fe2_t z_;
     z_ = fe2_mul(p.z, p.z);
@ -346,19 +360,19 @@ package bls12_381_pkg;
     z_ = fe2_mul(z_, p.z);
     fp2_to_affine.y = fe2_mul(p.y, fe2_inv(z_));
   endfunction
-   
+
   // Simulation helper: prints the x, y and z coordinates of a Jacobian point in hex,
   // one per line. No return type is declared and the bare return carries no value.
   function print_jb_point(jb_point_t p);
     $display("x:%h", p.x);
     $display("y:%h", p.y);
     $display("z:%h", p.z);
     return;
   endfunction
-   
+
   function print_fp2_jb_point(fp2_jb_point_t p);
     $display("x:(c1:%h, c0:%h)", p.x.c1, p.x.c0);
     $display("y:(c1:%h, c0:%h)", p.y.c1, p.y.c0);
     $display("z:(c1:%h, c0:%h)", p.z.c1, p.z.c0);
     return;
-   endfunction   
+   endfunction

 endpackage