updates for Fp^2 point logic

2019-06-11 22:55:11 +08:00 · 2019-06-11 22:55:11 +08:00 · 4cef72abda
parent dcdbc97957
commit 4cef72abda
14 changed files with 1655 additions and 250 deletions
--- a/ip_cores/ec/src/rtl/ec_fp2_point_add.sv
+++ b/ip_cores/ec/src/rtl/ec_fp2_point_add.sv
@ -0,0 +1,337 @@
+/*
+  This performs Fp^2 point addition.
+  Is a wrapper around the Fp point addition module, but with logic
+  to handle the multiplications / subtractions / additions
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fp2_point_add
+#(
+  parameter type FP2_TYPE,   // Should have FE2_TYPE elements
+  parameter type FE_TYPE,
+  parameter type FE2_TYPE
+)(
+  input i_clk, i_rst,
+  // Input points
+  input FP2_TYPE i_p1,
+  input FP2_TYPE i_p2,
+  input logic    i_val,
+  output logic   o_rdy,
+  // Output point
+  output FP2_TYPE o_p,
+  input logic     i_rdy,
+  output logic    o_val,
+  output logic    o_err,
+  // Interface to FE_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_if,
+  if_axi_stream.sink   i_mul_if,
+  // Interface to FE_TYPE adder (mod P)
+  if_axi_stream.source o_add_if,
+  if_axi_stream.sink   i_add_if,
+  // Interface to FE_TYPE subtractor (mod P)
+  if_axi_stream.source o_sub_if,
+  if_axi_stream.sink   i_sub_if
+);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(8)) mul_if_fe2_i(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(8))   mul_if_fe2_o(i_clk);
+
+localparam ADD_CTL_BIT = 8;
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(8)) add_if_fe2_i(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(8))   add_if_fe2_o(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16))   add_if_fe_i [2] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) add_if_fe_o [2] (i_clk);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(8)) sub_if_fe2_i(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(8))   sub_if_fe2_o(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16))   sub_if_fe_i [2] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) sub_if_fe_o [2] (i_clk);
+
+
+
+
+// Fp^2 element additions are simple additions on each of the Fp elements
+enum {ADD0, ADD1} add_state;
+always_comb begin
+  add_if_fe2_i.rdy = add_state == ADD1 && (~add_if_fe_o[0].val || (add_if_fe_o[0].val && add_if_fe_o[0].rdy)); // Release the Fp^2 operand pair only in ADD1 (second Fp slice), and only when the outgoing adder request register is free or being consumed
+  add_if_fe_i[0].rdy = ~add_if_fe2_o.val || (add_if_fe2_o.val && add_if_fe2_o.rdy); // Accept an adder result whenever the Fp^2 result register is empty or being consumed this cycle
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    add_if_fe2_o.reset_source();
+    add_state <= ADD0;
+    add_if_fe_o[0].reset_source();
+  end else begin
+
+    if (add_if_fe_o[0].val && add_if_fe_o[0].rdy) add_if_fe_o[0].val <= 0;
+    if (add_if_fe2_o.val && add_if_fe2_o.rdy) add_if_fe2_o.val <= 0;
+
+    // One process to parse inputs and send them to the adder
+    case(add_state)
+      ADD0: begin
+        if (~add_if_fe_o[0].val || (add_if_fe_o[0].val && add_if_fe_o[0].rdy)) begin
+          add_if_fe_o[0].copy_if({add_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                                  add_if_fe2_i.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]}, 
+                                  add_if_fe2_i.val, 1, 1, add_if_fe2_i.err, add_if_fe2_i.mod, add_if_fe2_i.ctl);
+          add_if_fe_o[0].ctl[ADD_CTL_BIT] <= 0;
+          if (add_if_fe2_i.val) add_state <= ADD1;
+        end
+      end
+      ADD1: begin
+        if (~add_if_fe_o[0].val || (add_if_fe_o[0].val && add_if_fe_o[0].rdy)) begin
+          add_if_fe_o[0].copy_if({add_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                                add_if_fe2_i.dat[$bits(FE2_TYPE)+$bits(FE_TYPE) +: $bits(FE_TYPE)]}, 
+                                add_if_fe2_i.val, 1, 1, add_if_fe2_i.err, add_if_fe2_i.mod, add_if_fe2_i.ctl);
+          add_if_fe_o[0].ctl[ADD_CTL_BIT] <= 1;
+          if (add_if_fe2_i.val) add_state <= ADD0;
+        end
+      end
+    endcase
+
+    // One process to assign outputs
+    if (~add_if_fe2_o.val || (add_if_fe2_o.val && add_if_fe2_o.rdy)) begin
+      add_if_fe2_o.ctl <= add_if_fe_i[0].ctl;
+      if (add_if_fe_i[0].ctl[ADD_CTL_BIT] == 0) begin
+        if (add_if_fe_i[0].val)
+          add_if_fe2_o.dat[0 +: $bits(FE_TYPE)] <= add_if_fe_i[0].dat;
+      end else begin
+        add_if_fe2_o.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= add_if_fe_i[0].dat;
+        add_if_fe2_o.val <= add_if_fe_i[0].val;
+      end
+    end
+  end
+end
+
+// Point subtractions are simple subtractions on each of the Fp elements
+enum {SUB0, SUB1} sub_state;
+always_comb begin
+  sub_if_fe2_i.rdy = sub_state == SUB1 && (~sub_if_fe_o[0].val || (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy)); // Release the Fp^2 operand pair only in SUB1 (second Fp slice); compare against this FSM's own enum literal rather than the adder's ADD1
+  sub_if_fe_i[0].rdy = ~sub_if_fe2_o.val || (sub_if_fe2_o.val && sub_if_fe2_o.rdy); // Accept a subtractor result whenever the Fp^2 result register is empty or being consumed this cycle
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    sub_if_fe2_o.reset_source();
+    sub_state <= SUB0;
+    sub_if_fe_o[0].reset_source();
+  end else begin
+
+    if (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy) sub_if_fe_o[0].val <= 0;
+    if (sub_if_fe2_o.val && sub_if_fe2_o.rdy) sub_if_fe2_o.val <= 0;
+
+    // One process to parse inputs and send them to the subtractor
+    case(sub_state)
+      SUB0: begin
+        if (~sub_if_fe_o[0].val || (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy)) begin
+          sub_if_fe_o[0].copy_if({sub_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                                  sub_if_fe2_i.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]},
+                                  sub_if_fe2_i.val, 1, 1, sub_if_fe2_i.err, sub_if_fe2_i.mod, sub_if_fe2_i.ctl);
+          sub_if_fe_o[0].ctl[ADD_CTL_BIT] <= 0;
+          if (sub_if_fe2_i.val) sub_state <= SUB1;
+        end
+      end
+      SUB1: begin
+        if (~sub_if_fe_o[0].val || (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy)) begin
+          sub_if_fe_o[0].copy_if({sub_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                                sub_if_fe2_i.dat[$bits(FE_TYPE) + $bits(FE2_TYPE) +: $bits(FE_TYPE)]},
+                                sub_if_fe2_i.val, 1, 1, sub_if_fe2_i.err, sub_if_fe2_i.mod, sub_if_fe2_i.ctl);
+          sub_if_fe_o[0].ctl[ADD_CTL_BIT] <= 1;
+          if (sub_if_fe2_i.val) sub_state <= SUB0;
+        end
+      end
+    endcase
+
+    // One process to assign outputs
+    if (~sub_if_fe2_o.val || (sub_if_fe2_o.val && sub_if_fe2_o.rdy)) begin
+      sub_if_fe2_o.ctl <= sub_if_fe_i[0].ctl;
+      if (sub_if_fe_i[0].ctl[ADD_CTL_BIT] == 0) begin
+        if (sub_if_fe_i[0].val)
+          sub_if_fe2_o.dat[0 +: $bits(FE_TYPE)] <= sub_if_fe_i[0].dat;
+      end else begin
+        sub_if_fe2_o.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= sub_if_fe_i[0].dat;
+        sub_if_fe2_o.val <= sub_if_fe_i[0].val;
+      end
+    end
+  end
+end
+
+// Multiplications are calculated as (a + bi)x(a' +b'i) = (aa' - bb') + (ab' + a'b)i
+// First 4 multiplications are issued, then 1 add and 1 subtraction (so we need arbitrator)
+enum {MUL0, MUL1, MUL2, MUL3} mul_state;
+logic [1:0] add_sub_val;
+always_comb begin
+  mul_if_fe2_i.rdy = mul_state == MUL3 && (~o_mul_if.val || (o_mul_if.val && o_mul_if.rdy));
+  // Back-pressure follows the tag on the returning product (0/1 -> subtractor, 2/3 -> adder), not the issue-side mul_state
+  i_mul_if.rdy = (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 0 || i_mul_if.ctl[ADD_CTL_BIT +: 2] == 1) ?
+                  (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) :
+                  (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy));
+  // The output process only latches a half-result while its add_sub_val bit is clear, so only be ready then
+  mul_if_fe2_o.val = &add_sub_val;
+  sub_if_fe_i[1].rdy = ~add_sub_val[1];
+  add_if_fe_i[1].rdy = ~add_sub_val[0];
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    add_sub_val <= 0;
+    mul_if_fe2_o.sop <= 0;
+    mul_if_fe2_o.eop <= 0;
+    mul_if_fe2_o.ctl <= 0;
+    mul_if_fe2_o.dat <= 0;
+    mul_if_fe2_o.mod <= 0;
+    mul_state <= MUL0;
+    o_mul_if.reset_source();
+    sub_if_fe_o[1].copy_if(0, 0, 1, 1, 0, 0, 0);
+    add_if_fe_o[1].copy_if(0, 0, 1, 1, 0, 0, 0);
+  end else begin
+
+    if (mul_if_fe2_o.val && mul_if_fe2_o.rdy) begin
+      add_sub_val <= 0;
+    end
+    if (o_mul_if.val && o_mul_if.rdy) o_mul_if.val <= 0;
+    if (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy) sub_if_fe_o[1].val <= 0;
+    if (add_if_fe_o[1].val && add_if_fe_o[1].rdy) add_if_fe_o[1].val <= 0;
+    
+    // One process to parse inputs and send them to the multiplier
+    if (~o_mul_if.val || (o_mul_if.val && o_mul_if.rdy)) begin
+      case (mul_state)
+        MUL0: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE)  +: $bits(FE_TYPE)]}, 
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 0;
+          if (mul_if_fe2_i.val) mul_state <= MUL1;
+        end
+        MUL1: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]}, 
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 1;
+          if (mul_if_fe2_i.val) mul_state <= MUL2;
+        end
+        MUL2: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]}, 
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 2;
+          if (mul_if_fe2_i.val) mul_state <= MUL3;
+        end 
+        MUL3: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE)  +: $bits(FE_TYPE)]}, 
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 3;
+          if (mul_if_fe2_i.val) mul_state <= MUL0;
+        end 
+      endcase
+    end
+    
+    // Process multiplications and do subtraction
+    if (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) begin
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 0) begin
+        if (i_mul_if.val) sub_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 1) begin
+        sub_if_fe_o[1].val <= i_mul_if.val;
+        sub_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      sub_if_fe_o[1].ctl <= i_mul_if.ctl;
+    end
+    
+    // Process multiplications and do addition
+    if (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy)) begin
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 2) begin
+        if (i_mul_if.val) add_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 3) begin
+        add_if_fe_o[1].val <= i_mul_if.val;
+        add_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      add_if_fe_o[1].ctl <= i_mul_if.ctl;
+    end
+
+    // One process to assign output
+    if (~mul_if_fe2_o.val || (mul_if_fe2_o.val && mul_if_fe2_o.rdy)) begin
+      mul_if_fe2_o.ctl <= add_if_fe_i[1].ctl;
+      if (~add_sub_val[0]) begin
+        mul_if_fe2_o.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= add_if_fe_i[1].dat;
+        add_sub_val[0] <= add_if_fe_i[1].val;
+      end
+      if (~add_sub_val[1]) begin
+        mul_if_fe2_o.dat[0 +: $bits(FE_TYPE)] <= sub_if_fe_i[1].dat;
+        add_sub_val[1] <= sub_if_fe_i[1].val;
+      end
+    end
+  end
+end
+
+ec_point_add #(
+  .FP_TYPE ( FP2_TYPE ),
+  .FE_TYPE ( FE2_TYPE )
+)
+ec_point_add (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+    // Input points
+  .i_p1  ( i_p1  ),
+  .i_p2  ( i_p2  ),
+  .i_val ( i_val ),
+  .o_rdy ( o_rdy ),
+  .o_p   ( o_p   ),
+  .o_err ( o_err ),
+  .i_rdy ( i_rdy ),
+  .o_val ( o_val ) ,
+  .o_mul_if ( mul_if_fe2_i ),
+  .i_mul_if ( mul_if_fe2_o ),
+  .o_add_if ( add_if_fe2_i  ),
+  .i_add_if ( add_if_fe2_o  ),
+  .o_sub_if ( sub_if_fe2_i  ),
+  .i_sub_if ( sub_if_fe2_o  )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 10 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_sub (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( sub_if_fe_o ),
+  .o_res ( o_sub_if    ),
+  .i_res ( i_sub_if    ),
+  .o_axi ( sub_if_fe_i )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 10 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_add (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( add_if_fe_o ),
+  .o_res ( o_add_if    ),
+  .i_res ( i_add_if    ),
+  .o_axi ( add_if_fe_i )
+);
+
+endmodule
--- a/ip_cores/ec/src/rtl/ec_fp2_point_dbl.sv
+++ b/ip_cores/ec/src/rtl/ec_fp2_point_dbl.sv
@ -0,0 +1,338 @@
+/*
+  This performs Fp^2 point doubling.
+  Is a wrapper around the Fp point doubling module, but with logic
+  to handle the multiplications / subtractions / additions
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fp2_point_dbl
+#(
+  parameter type FP2_TYPE,   // Should have FE2_TYPE elements
+  parameter type FE_TYPE,
+  parameter type FE2_TYPE
+)(
+  input i_clk, i_rst,
+  // Input points
+  input FP2_TYPE i_p,
+  input logic    i_val,
+  output logic   o_rdy,
+  // Output point
+  output FP2_TYPE o_p,
+  input logic     i_rdy,
+  output logic    o_val,
+  output logic    o_err,
+  // Interface to FE_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_if,
+  if_axi_stream.sink   i_mul_if,
+  // Interface to FE_TYPE adder (mod P)
+  if_axi_stream.source o_add_if,
+  if_axi_stream.sink   i_add_if,
+  // Interface to FE_TYPE subtractor (mod P)
+  if_axi_stream.source o_sub_if,
+  if_axi_stream.sink   i_sub_if
+);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(8)) mul_if_fe2_i(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(8))   mul_if_fe2_o(i_clk);
+
+localparam ADD_CTL_BIT = 8;
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(8)) add_if_fe2_i(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(8))   add_if_fe2_o(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16))   add_if_fe_i [2] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) add_if_fe_o [2] (i_clk);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(8)) sub_if_fe2_i(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(8))   sub_if_fe2_o(i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16))   sub_if_fe_i [2] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) sub_if_fe_o [2] (i_clk);
+
+
+
+
+// Fp^2 element additions are simple additions on each of the Fp elements
+enum {ADD0, ADD1} add_state;
+always_comb begin
+  add_if_fe2_i.rdy = add_state == ADD1 && (~add_if_fe_o[0].val || (add_if_fe_o[0].val && add_if_fe_o[0].rdy)); // Release the Fp^2 operand pair only in ADD1 (second Fp slice), and only when the outgoing adder request register is free or being consumed
+  add_if_fe_i[0].rdy = ~add_if_fe2_o.val || (add_if_fe2_o.val && add_if_fe2_o.rdy); // Accept an adder result whenever the Fp^2 result register is empty or being consumed this cycle
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    add_if_fe2_o.reset_source();
+    add_state <= ADD0;
+    add_if_fe_o[0].reset_source();
+  end else begin
+
+    if (add_if_fe_o[0].val && add_if_fe_o[0].rdy) add_if_fe_o[0].val <= 0;
+    if (add_if_fe2_o.val && add_if_fe2_o.rdy) add_if_fe2_o.val <= 0;
+
+    // One process to parse inputs and send them to the adder
+    case(add_state)
+      ADD0: begin
+        if (~add_if_fe_o[0].val || (add_if_fe_o[0].val && add_if_fe_o[0].rdy)) begin
+          add_if_fe_o[0].copy_if({add_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                                  add_if_fe2_i.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]},
+                                  add_if_fe2_i.val, 1, 1, add_if_fe2_i.err, add_if_fe2_i.mod, add_if_fe2_i.ctl);
+          add_if_fe_o[0].ctl[ADD_CTL_BIT] <= 0;
+          if (add_if_fe2_i.val) add_state <= ADD1;
+        end
+      end
+      ADD1: begin
+        if (~add_if_fe_o[0].val || (add_if_fe_o[0].val && add_if_fe_o[0].rdy)) begin
+          add_if_fe_o[0].copy_if({add_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                                add_if_fe2_i.dat[$bits(FE2_TYPE)+$bits(FE_TYPE) +: $bits(FE_TYPE)]},
+                                add_if_fe2_i.val, 1, 1, add_if_fe2_i.err, add_if_fe2_i.mod, add_if_fe2_i.ctl);
+          add_if_fe_o[0].ctl[ADD_CTL_BIT] <= 1;
+          if (add_if_fe2_i.val) add_state <= ADD0;
+        end
+      end
+    endcase
+
+    // One process to assign outputs
+    if (~add_if_fe2_o.val || (add_if_fe2_o.val && add_if_fe2_o.rdy)) begin
+      add_if_fe2_o.ctl <= add_if_fe_i[0].ctl;
+      if (add_if_fe_i[0].ctl[ADD_CTL_BIT] == 0) begin
+        if (add_if_fe_i[0].val)
+          add_if_fe2_o.dat[0 +: $bits(FE_TYPE)] <= add_if_fe_i[0].dat;
+      end else begin
+        add_if_fe2_o.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= add_if_fe_i[0].dat;
+        add_if_fe2_o.val <= add_if_fe_i[0].val;
+      end
+    end
+  end
+end
+
+// Point subtractions are simple subtractions on each of the Fp elements
+enum {SUB0, SUB1} sub_state;
+always_comb begin
+  sub_if_fe2_i.rdy = sub_state == SUB1 && (~sub_if_fe_o[0].val || (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy)); // Release the Fp^2 operand pair only in SUB1 (second Fp slice); compare against this FSM's own enum literal rather than the adder's ADD1
+  sub_if_fe_i[0].rdy = ~sub_if_fe2_o.val || (sub_if_fe2_o.val && sub_if_fe2_o.rdy); // Accept a subtractor result whenever the Fp^2 result register is empty or being consumed this cycle
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    sub_if_fe2_o.reset_source();
+    sub_state <= SUB0;
+    sub_if_fe_o[0].reset_source();
+  end else begin
+
+    if (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy) sub_if_fe_o[0].val <= 0;
+    if (sub_if_fe2_o.val && sub_if_fe2_o.rdy) sub_if_fe2_o.val <= 0;
+
+    // One process to parse inputs and send them to the subtractor
+    case(sub_state)
+      SUB0: begin
+        if (~sub_if_fe_o[0].val || (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy)) begin
+          sub_if_fe_o[0].copy_if({sub_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                                  sub_if_fe2_i.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]},
+                                  sub_if_fe2_i.val, 1, 1, sub_if_fe2_i.err, sub_if_fe2_i.mod, sub_if_fe2_i.ctl);
+          sub_if_fe_o[0].ctl[ADD_CTL_BIT] <= 0;
+          if (sub_if_fe2_i.val) sub_state <= SUB1;
+        end
+      end
+      SUB1: begin
+        if (~sub_if_fe_o[0].val || (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy)) begin
+          sub_if_fe_o[0].copy_if({sub_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                                sub_if_fe2_i.dat[$bits(FE_TYPE) + $bits(FE2_TYPE) +: $bits(FE_TYPE)]},
+                                sub_if_fe2_i.val, 1, 1, sub_if_fe2_i.err, sub_if_fe2_i.mod, sub_if_fe2_i.ctl);
+          sub_if_fe_o[0].ctl[ADD_CTL_BIT] <= 1;
+          if (sub_if_fe2_i.val) sub_state <= SUB0;
+        end
+      end
+    endcase
+
+    // One process to assign outputs
+    if (~sub_if_fe2_o.val || (sub_if_fe2_o.val && sub_if_fe2_o.rdy)) begin
+      sub_if_fe2_o.ctl <= sub_if_fe_i[0].ctl;
+      if (sub_if_fe_i[0].ctl[ADD_CTL_BIT] == 0) begin
+        if (sub_if_fe_i[0].val)
+          sub_if_fe2_o.dat[0 +: $bits(FE_TYPE)] <= sub_if_fe_i[0].dat;
+      end else begin
+        sub_if_fe2_o.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= sub_if_fe_i[0].dat;
+        sub_if_fe2_o.val <= sub_if_fe_i[0].val;
+      end
+    end
+  end
+end
+
+// Multiplications are calculated as (a + bi)x(a' +b'i) = (aa' - bb') + (ab' + a'b)i
+// First 4 multiplications are issued, then 1 add and 1 subtraction (so we need arbitrator)
+enum {MUL0, MUL1, MUL2, MUL3} mul_state;
+logic [1:0] add_sub_val;
+always_comb begin
+  mul_if_fe2_i.rdy = mul_state == MUL3 && (~o_mul_if.val || (o_mul_if.val && o_mul_if.rdy));
+  
+  i_mul_if.rdy = (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 0 || i_mul_if.ctl[ADD_CTL_BIT +: 2] == 1) ?
+                  (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) :
+                  (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy));
+  
+  mul_if_fe2_o.val = &add_sub_val;
+  sub_if_fe_i[1].rdy = ~add_sub_val[1] || (mul_if_fe2_o.val && mul_if_fe2_o.rdy);
+  add_if_fe_i[1].rdy = ~add_sub_val[0] || (mul_if_fe2_o.val && mul_if_fe2_o.rdy);
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    add_sub_val <= 0;
+    mul_if_fe2_o.sop <= 0;
+    mul_if_fe2_o.eop <= 0;
+    mul_if_fe2_o.ctl <= 0;
+    mul_if_fe2_o.dat <= 0;
+    mul_if_fe2_o.mod <= 0;
+    mul_state <= MUL0;
+    o_mul_if.reset_source();
+    sub_if_fe_o[1].copy_if(0, 0, 1, 1, 0, 0, 0);
+    add_if_fe_o[1].copy_if(0, 0, 1, 1, 0, 0, 0);
+  end else begin
+
+    if (mul_if_fe2_o.val && mul_if_fe2_o.rdy) begin
+      add_sub_val <= 0;
+    end
+    if (o_mul_if.val && o_mul_if.rdy) o_mul_if.val <= 0;
+    if (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy) sub_if_fe_o[1].val <= 0;
+    if (add_if_fe_o[1].val && add_if_fe_o[1].rdy) add_if_fe_o[1].val <= 0;
+
+    // One process to parse inputs and send them to the multiplier
+    if (~o_mul_if.val || (o_mul_if.val && o_mul_if.rdy)) begin
+      case (mul_state)
+        MUL0: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE)  +: $bits(FE_TYPE)]},
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 0;
+          if (mul_if_fe2_i.val) mul_state <= MUL1;
+        end
+        MUL1: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]},
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 1;
+          if (mul_if_fe2_i.val) mul_state <= MUL2;
+        end
+        MUL2: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[0 +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]},
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 2;
+          if (mul_if_fe2_i.val) mul_state <= MUL3;
+        end
+        MUL3: begin
+          o_mul_if.copy_if({mul_if_fe2_i.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)],
+                            mul_if_fe2_i.dat[$bits(FE2_TYPE)  +: $bits(FE_TYPE)]},
+                            mul_if_fe2_i.val, 1, 1, mul_if_fe2_i.err, mul_if_fe2_i.mod, mul_if_fe2_i.ctl);
+          o_mul_if.ctl[ADD_CTL_BIT +: 2] <= 3;
+          if (mul_if_fe2_i.val) mul_state <= MUL0;
+        end
+      endcase
+    end
+
+    // Process multiplications and do subtraction
+    if (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) begin
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 0) begin
+        if (i_mul_if.val) sub_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 1) begin
+        sub_if_fe_o[1].val <= i_mul_if.val;
+        sub_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      sub_if_fe_o[1].ctl <= i_mul_if.ctl;
+    end
+
+    // Process multiplications and do addition
+    if (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy)) begin
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 2) begin
+        if (i_mul_if.val) add_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      if (i_mul_if.ctl[ADD_CTL_BIT +: 2] == 3) begin
+        add_if_fe_o[1].val <= i_mul_if.val;
+        add_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_if.dat;
+      end
+      add_if_fe_o[1].ctl <= i_mul_if.ctl;
+    end
+
+    // One process to assign output
+    if (~add_sub_val[0] || (mul_if_fe2_o.val && mul_if_fe2_o.rdy)) begin
+      mul_if_fe2_o.ctl <= add_if_fe_i[1].ctl;
+      //if (~add_sub_val[0]) begin
+        mul_if_fe2_o.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= add_if_fe_i[1].dat;
+        add_sub_val[0] <= add_if_fe_i[1].val;
+    end
+    
+    if (~add_sub_val[1] || (mul_if_fe2_o.val && mul_if_fe2_o.rdy)) begin
+      //end
+      //if (~add_sub_val[1]) begin
+        mul_if_fe2_o.dat[0 +: $bits(FE_TYPE)] <= sub_if_fe_i[1].dat;
+        add_sub_val[1] <= sub_if_fe_i[1].val;
+      //end
+    end
+  end
+end
+
+ec_point_dbl #(
+  .FP_TYPE ( FP2_TYPE ),
+  .FE_TYPE ( FE2_TYPE )
+)
+ec_point_dbl (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+    // Input points
+  .i_p   ( i_p  ),
+  .i_val ( i_val ),
+  .o_rdy ( o_rdy ),
+  .o_p   ( o_p   ),
+  .o_err ( o_err ),
+  .i_rdy ( i_rdy ),
+  .o_val ( o_val ) ,
+  .o_mul_if ( mul_if_fe2_i ),
+  .i_mul_if ( mul_if_fe2_o ),
+  .o_add_if ( add_if_fe2_i  ),
+  .i_add_if ( add_if_fe2_o  ),
+  .o_sub_if ( sub_if_fe2_i  ),
+  .i_sub_if ( sub_if_fe2_o  )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 10 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_sub (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( sub_if_fe_o ),
+  .o_res ( o_sub_if    ),
+  .i_res ( i_sub_if    ),
+  .o_axi ( sub_if_fe_i )
+);
+
+resource_share # (
+  .NUM_IN ( 2 ),
+  .OVR_WRT_BIT ( 10 ),
+  .PIPELINE_IN ( 0  ),
+  .PIPELINE_OUT ( 0 )
+)
+resource_share_add (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( add_if_fe_o ),
+  .o_res ( o_add_if    ),
+  .i_res ( i_add_if    ),
+  .o_axi ( add_if_fe_i )
+);
+
+endmodule
--- a/ip_cores/ec/src/rtl/ec_fp_point_mult.sv
+++ b/ip_cores/ec/src/rtl/ec_fp_point_mult.sv
@ -333,7 +333,7 @@ generate

    ec_fp_mult_mod #(
      .P             ( P  ),
-      .KARATSUBA_LVL ( 3  ),
+      .KARATSUBA_LVL ( 2  ),
      .CTL_BITS      ( 16 )
    )
    ec_fp_mult_mod (
--- a/ip_cores/ec/src/rtl/ec_point_add.sv
+++ b/ip_cores/ec/src/rtl/ec_point_add.sv
@ -19,7 +19,6 @@

 module ec_point_add
 #(
-  parameter      P,
  parameter type FP_TYPE,
  parameter type FE_TYPE
 )(
@ -35,8 +34,8 @@ module ec_point_add
  output logic   o_val,
  output logic   o_err,
  // Interface to multiplier (mod P)
-  if_axi_stream.source o_mult_if,
-  if_axi_stream.sink   i_mult_if,
+  if_axi_stream.source o_mul_if,
+  if_axi_stream.sink   i_mul_if,
  // Interface to adder (mod P)
  if_axi_stream.source o_add_if,
  if_axi_stream.sink   i_add_if,
@ -103,11 +102,11 @@ always_ff @ (posedge i_clk) begin
    o_val <= 0;
    o_rdy <= 0;
    o_p <= 0;
-    o_mult_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_mul_if.copy_if(0, 0, 1, 1, 0, 0, 0);
    o_add_if.copy_if(0, 0, 1, 1, 0, 0, 0);
    o_sub_if.copy_if(0, 0, 1, 1, 0, 0, 0);
    i_add_if.rdy <= 0;
-    i_mult_if.rdy <= 0;
+    i_mul_if.rdy <= 0;
    i_sub_if.rdy <= 0;
    eq_val <= 0;
    state <= IDLE;
@ -121,7 +120,7 @@ always_ff @ (posedge i_clk) begin
    D <= 0;
  end else begin

-    if (o_mult_if.rdy) o_mult_if.val <= 0;
+    if (o_mul_if.rdy) o_mul_if.val <= 0;
    if (o_add_if.rdy) o_add_if.val <= 0;
    if (o_sub_if.rdy) o_sub_if.val <= 0;

@ -131,7 +130,7 @@ always_ff @ (posedge i_clk) begin
        eq_val <= 0;
        eq_wait <= 0;
        o_err <= 0;
-        i_mult_if.rdy <= 1;
+        i_mul_if.rdy <= 1;
        i_add_if.rdy <= 1;
        i_sub_if.rdy <= 1;
        i_p1_l <= i_p1;
@ -168,25 +167,25 @@ always_ff @ (posedge i_clk) begin
      {START}: begin

        // Check any results from multiplier
-        if (i_mult_if.val && i_mult_if.rdy) begin
-          eq_val[i_mult_if.ctl[5:0]] <= 1;
-          case(i_mult_if.ctl[5:0]) inside
-            0: A <= i_mult_if.dat;
-            1: i_p1_l.x <= i_mult_if.dat;
-            2: C <= i_mult_if.dat;
-            3: i_p2_l.x <= i_mult_if.dat;
-            4: A <= i_mult_if.dat;
-            5: A <= i_mult_if.dat;
-            6: C <= i_mult_if.dat;
-            7: C <= i_mult_if.dat;
-            10: o_p.x <= i_mult_if.dat;
-            11: D <= i_mult_if.dat;
-            12: i_p2_l.x <= i_mult_if.dat;
-            14: i_p1_l.x <= i_mult_if.dat;
-            19: o_p.y <= i_mult_if.dat;
-            20: i_p2_l.x <= i_mult_if.dat;
-            22: o_p.z <= i_mult_if.dat;
-            23: o_p.z <= i_mult_if.dat;
+        if (i_mul_if.val && i_mul_if.rdy) begin
+          eq_val[i_mul_if.ctl[5:0]] <= 1;
+          case(i_mul_if.ctl[5:0]) inside
+            0: A <= FE_TYPE'(i_mul_if.dat);
+            1: i_p1_l.x <= i_mul_if.dat;
+            2: C <= i_mul_if.dat;
+            3: i_p2_l.x <= i_mul_if.dat;
+            4: A <= i_mul_if.dat;
+            5: A <= i_mul_if.dat;
+            6: C <= i_mul_if.dat;
+            7: C <= i_mul_if.dat;
+            10: o_p.x <= i_mul_if.dat;
+            11: D <= i_mul_if.dat;
+            12: i_p2_l.x <= i_mul_if.dat;
+            14: i_p1_l.x <= i_mul_if.dat;
+            19: o_p.y <= i_mul_if.dat;
+            20: i_p2_l.x <= i_mul_if.dat;
+            22: o_p.z <= i_mul_if.dat;
+            23: o_p.z <= i_mul_if.dat;
            default: o_err <= 1;
          endcase
        end
@ -345,11 +344,11 @@ endtask

 // Task for using multiplies
 task multiply(input int unsigned ctl, input FE_TYPE a, b);
-  if (~o_mult_if.val || (o_mult_if.val && o_mult_if.rdy)) begin
-    o_mult_if.val <= 1;
-    o_mult_if.dat[0 +: $bits(FE_TYPE)] <= a;
-    o_mult_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b;
-    o_mult_if.ctl[5:0] <= ctl;
+  if (~o_mul_if.val || (o_mul_if.val && o_mul_if.rdy)) begin
+    o_mul_if.val <= 1;
+    o_mul_if.dat[0 +: $bits(FE_TYPE)] <= a;
+    o_mul_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b;
+    o_mul_if.ctl[5:0] <= ctl;
    eq_wait[ctl] <= 1;
  end
 endtask
--- a/ip_cores/ec/src/rtl/ec_fp_point_dbl.sv
+++ b/ip_cores/ec/src/rtl/ec_fp_point_dbl.sv
@ -17,31 +17,30 @@
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

-module ec_fp_point_dbl
+module ec_point_dbl
 #(
-  parameter      P,
-  parameter type POINT_TYPE
+  parameter type FP_TYPE,
+  parameter type FE_TYPE
 )(
  input i_clk, i_rst,
  // Input point
-  input POINT_TYPE i_p,
-  input logic      i_val,
-  output logic     o_rdy,
+  input FP_TYPE i_p,
+  input logic   i_val,
+  output logic  o_rdy,
  // Output point
-  output POINT_TYPE o_p,
-  input logic       i_rdy,
-  output logic      o_val,
-  output logic      o_err,
+  output FP_TYPE o_p,
+  input logic    i_rdy,
+  output logic   o_val,
+  output logic   o_err,
  // Interface to multiplier (mod p)
-  if_axi_stream.source o_mult_if,
-  if_axi_stream.sink   i_mult_if,
+  if_axi_stream.source o_mul_if,
+  if_axi_stream.sink   i_mul_if,
  if_axi_stream.source o_add_if,
  if_axi_stream.sink   i_add_if,
  if_axi_stream.source o_sub_if,
  if_axi_stream.sink   i_sub_if
 );

-localparam DAT_BITS = $clog2(P);
 /*
 * These are the equations that need to be computed, they are issued as variables
 * become valid. We have a bitmask to track what equation results are valid which
@ -66,8 +65,8 @@ localparam DAT_BITS = $clog2(P);
 logic [14:0] eq_val, eq_wait;

 // Temporary variables
-logic [DAT_BITS-1:0] A, B, C, D, E;
-POINT_TYPE i_p_l;
+FE_TYPE A, B, C, D, E;
+FP_TYPE i_p_l;


 enum {IDLE, START, FINISHED} state;
@ -76,12 +75,12 @@ always_ff @ (posedge i_clk) begin
    o_val <= 0;
    o_rdy <= 0;
    o_p <= 0;
-    o_mult_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_mul_if.copy_if(0, 0, 1, 1, 0, 0, 0);
    o_add_if.copy_if(0, 0, 1, 1, 0, 0, 0);
    o_sub_if.copy_if(0, 0, 1, 1, 0, 0, 0);
-    i_mult_if.rdy <= 0;
-    i_add_if.rdy <= 0; 
-    i_sub_if.rdy <= 0;  
+    i_mul_if.rdy <= 0;
+    i_add_if.rdy <= 0;
+    i_sub_if.rdy <= 0;
    eq_val <= 0;
    state <= IDLE;
    eq_wait <= 0;
@ -94,7 +93,7 @@ always_ff @ (posedge i_clk) begin
    E <= 0;
  end else begin

-    if (o_mult_if.rdy) o_mult_if.val <= 0;
+    if (o_mul_if.rdy) o_mul_if.val <= 0;
    if (o_add_if.rdy) o_add_if.val <= 0;
    if (o_sub_if.rdy) o_sub_if.val <= 0;

@ -104,7 +103,7 @@ always_ff @ (posedge i_clk) begin
        eq_val <= 0;
        eq_wait <= 0;
        o_err <= 0;
-        i_mult_if.rdy <= 1;
+        i_mul_if.rdy <= 1;
        i_add_if.rdy <= 1;
        i_sub_if.rdy <= 1;
        i_p_l <= i_p;
@ -127,26 +126,26 @@ always_ff @ (posedge i_clk) begin
      // Just a big if tree where we issue equations if the required inputs
      // are valid
      {START}: begin
-        i_mult_if.rdy <= 1;
+        i_mul_if.rdy <= 1;

        // Check any results from multiplier
-        if (i_mult_if.val && i_mult_if.rdy) begin
-          eq_val[i_mult_if.ctl[5:0]] <= 1;
-          case(i_mult_if.ctl[5:0]) inside
-            0: A <= i_mult_if.dat;
-            1: B <= i_mult_if.dat;
-            2: B <= i_mult_if.dat;
-            3: C <= i_mult_if.dat;
-            4: C <= i_mult_if.dat;
-            5: D <= i_mult_if.dat;
-            6: D <= i_mult_if.dat;
-            7: o_p.x <= i_mult_if.dat;
-            11: o_p.y <= i_mult_if.dat;
-            14: o_p.z <= i_mult_if.dat;
+        if (i_mul_if.val && i_mul_if.rdy) begin
+          eq_val[i_mul_if.ctl[5:0]] <= 1;
+          case(i_mul_if.ctl[5:0]) inside
+            0: A <= i_mul_if.dat;
+            1: B <= i_mul_if.dat;
+            2: B <= i_mul_if.dat;
+            3: C <= i_mul_if.dat;
+            4: C <= i_mul_if.dat;
+            5: D <= i_mul_if.dat;
+            6: D <= i_mul_if.dat;
+            7: o_p.x <= i_mul_if.dat;
+            11: o_p.y <= i_mul_if.dat;
+            14: o_p.z <= i_mul_if.dat;
            default: o_err <= 1;
          endcase
        end
-        
+
        // Check any results from adder
        if (i_add_if.val && i_add_if.rdy) begin
          eq_val[i_add_if.ctl[5:0]] <= 1;
@ -156,7 +155,7 @@ always_ff @ (posedge i_clk) begin
            default: o_err <= 1;
          endcase
        end
-        
+
        // Check any results from subtractor
        if (i_sub_if.val && i_sub_if.rdy) begin
          eq_val[i_sub_if.ctl[5:0]] <= 1;
@ -219,9 +218,6 @@ always_ff @ (posedge i_clk) begin
          addition(13, i_p_l.y, i_p_l.y);
        end

-
-
-
        if (&eq_val) begin
          state <= FINISHED;
          o_val <= 1;
@ -248,38 +244,36 @@ always_ff @ (posedge i_clk) begin
 end

 // Task for subtractions
-task subtraction(input int unsigned ctl, input logic [DAT_BITS-1:0] a, b);
+task subtraction(input int unsigned ctl, input FE_TYPE a, b);
  if (~o_sub_if.val || (o_sub_if.val && o_sub_if.rdy)) begin
    o_sub_if.val <= 1;
-    o_sub_if.dat[0 +: DAT_BITS] <= a;
-    o_sub_if.dat[DAT_BITS +: DAT_BITS] <= b;
+    o_sub_if.dat[0 +: $bits(FE_TYPE)] <= a;
+    o_sub_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b;
    o_sub_if.ctl[5:0] <= ctl;
    eq_wait[ctl] <= 1;
  end
 endtask

 // Task for addition
-task addition(input int unsigned ctl, input logic [DAT_BITS-1:0] a, b);
+task addition(input int unsigned ctl, input FE_TYPE a, b);
  if (~o_add_if.val || (o_add_if.val && o_add_if.rdy)) begin
    o_add_if.val <= 1;
-    o_add_if.dat[0 +: DAT_BITS] <= a;
-    o_add_if.dat[DAT_BITS +: DAT_BITS] <= b;
+    o_add_if.dat[0 +: $bits(FE_TYPE)] <= a;
+    o_add_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b;
    o_add_if.ctl[5:0] <= ctl;
    eq_wait[ctl] <= 1;
  end
 endtask

 // Task for using multiplies
-task multiply(input int unsigned ctl, input logic [DAT_BITS-1:0] a, b);
-  if (~o_mult_if.val || (o_mult_if.val && o_mult_if.rdy)) begin
-    o_mult_if.val <= 1;
-    o_mult_if.dat[0 +: DAT_BITS] <= a;
-    o_mult_if.dat[DAT_BITS +: DAT_BITS] <= b;
-    o_mult_if.ctl[5:0] <= ctl;
+task multiply(input int unsigned ctl, input FE_TYPE a, b);
+  if (~o_mul_if.val || (o_mul_if.val && o_mul_if.rdy)) begin
+    o_mul_if.val <= 1;
+    o_mul_if.dat[0 +: $bits(FE_TYPE)] <= a;
+    o_mul_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b;
+    o_mul_if.ctl[5:0] <= ctl;
    eq_wait[ctl] <= 1;
  end
 endtask

-
-
 endmodule
--- a/ip_cores/ec/src/tb/ec_fp2_point_add_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_add_tb.sv
@ -0,0 +1,226 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+module ec_fp2_point_add_tb ();
+import common_pkg::*;
+import bls12_381_pkg::*;
+
+localparam CLK_PERIOD = 1000;
+
+logic clk, rst;
+
+if_axi_stream #(.DAT_BYTS(($bits(fp2_jb_point_t)*2+7)/8)) in_if(clk); // Two points
+if_axi_stream #(.DAT_BYTS(($bits(fp2_jb_point_t)+7)/8)) out_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) mult_in_if(clk);
+if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) mult_out_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) add_in_if(clk);
+if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) add_out_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) sub_in_if(clk);
+if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) sub_out_if(clk);
+
+fp2_jb_point_t in_p1, in_p2, out_p;
+
+always_comb begin
+  in_p1 = in_if.dat[0 +: $bits(fp2_jb_point_t)];
+  in_p2 = in_if.dat[$bits(fp2_jb_point_t) +: $bits(fp2_jb_point_t)];
+  out_if.dat = out_p;
+end
+
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+always_comb begin
+  out_if.sop = 1;
+  out_if.eop = 1;
+  out_if.ctl = 0;
+  out_if.mod = 0;
+end
+
+// Report an error whenever the DUT presents a valid output with its error
+// flag set (out_if.val / out_if.err are driven by the DUT's o_val / o_err).
+// $error takes its format string as the first argument; unlike $fatal it has
+// no leading finish-number argument, so none is passed here.
+always_ff @ (posedge clk)
+  if (out_if.val && out_if.err)
+    $error("%m %t ERROR: output .err asserted", $time);
+
+ec_fp2_point_add #(
+  .FP2_TYPE ( fp2_jb_point_t ),
+  .FE_TYPE  ( fe_t           ),
+  .FE2_TYPE ( fe2_t          )
+)
+ec_fp2_point_add (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+    // Input points
+  .i_p1  ( in_p1      ),
+  .i_p2  ( in_p2      ),
+  .i_val ( in_if.val  ),
+  .o_rdy ( in_if.rdy  ),
+  .o_p   ( out_p      ),
+  .o_err ( out_if.err ),
+  .i_rdy ( out_if.rdy ),
+  .o_val ( out_if.val ) ,
+  .o_mul_if ( mult_in_if ),
+  .i_mul_if ( mult_out_if ),
+  .o_add_if ( add_in_if ),
+  .i_add_if ( add_out_if ),
+  .o_sub_if ( sub_in_if ),
+  .i_sub_if ( sub_out_if )
+);
+
+always_comb begin
+  mult_out_if.sop = 1;
+  mult_out_if.eop = 1;
+  mult_out_if.err = 0;
+  mult_out_if.mod = 1;
+
+  add_out_if.sop = 1;
+  add_out_if.eop = 1;
+  add_out_if.err = 0;
+  add_out_if.mod = 1;
+
+  sub_out_if.sop = 1;
+  sub_out_if.eop = 1;
+  sub_out_if.err = 0;
+  sub_out_if.mod = 1;
+end
+
+
+// Attach the modular multiplier, adder and subtractor units that service the DUT's arithmetic interfaces
+ec_fp_mult_mod #(
+  .P             ( P   ),
+  .KARATSUBA_LVL ( 3   ),
+  .CTL_BITS      ( 16  )
+)
+ec_fp_mult_mod (
+  .i_clk( clk         ),
+  .i_rst( rst         ),
+  .i_dat_a ( mult_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
+  .i_dat_b ( mult_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
+  .i_val ( mult_in_if.val ),
+  .i_ctl ( mult_in_if.ctl ),
+  .o_rdy ( mult_in_if.rdy ),
+  .o_dat ( mult_out_if.dat ),
+  .i_rdy ( mult_out_if.rdy ),
+  .o_val ( mult_out_if.val ),
+  .o_ctl ( mult_out_if.ctl )
+);
+
+adder_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+adder_pipe (
+  .i_clk ( clk        ),
+  .i_rst ( rst        ),
+  .i_dat_a ( add_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
+  .i_dat_b ( add_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
+  .i_ctl ( add_in_if.ctl ),
+  .i_val ( add_in_if.val  ),
+  .o_rdy ( add_in_if.rdy  ),
+  .o_dat ( add_out_if.dat ),
+  .o_val ( add_out_if.val ),
+  .o_ctl ( add_out_if.ctl ),
+  .i_rdy ( add_out_if.rdy )
+);
+
+subtractor_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+subtractor_pipe (
+  .i_clk ( clk        ),
+  .i_rst ( rst        ),
+  .i_dat_a ( sub_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
+  .i_dat_b ( sub_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
+  .i_ctl ( sub_in_if.ctl ),
+  .i_val ( sub_in_if.val  ),
+  .o_rdy ( sub_in_if.rdy  ),
+  .o_dat ( sub_out_if.dat ),
+  .o_val ( sub_out_if.val ),
+  .o_ctl ( sub_out_if.ctl ),
+  .i_rdy ( sub_out_if.rdy )
+);
+
+task test(input fp2_jb_point_t p1, p2, p_exp);  // Streams p1 and p2 into the DUT and compares the returned point against p_exp
+begin
+  integer signed get_len;  // Length reported by get_stream; not checked in this task
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected,  get_dat;  // NOTE(review): 'expected' is never referenced in this task
+  fp2_jb_point_t p_out;
+  $display("Running test ...");
+
+
+  fork  // Drive the input and collect the output in parallel; join waits for both to finish
+    in_if.put_stream({p2, p1}, ((2*$bits(fp2_jb_point_t)+7)/8));  // p1 occupies the low bits and p2 the bits above it, matching how in_p1/in_p2 are sliced from in_if.dat
+    out_if.get_stream(get_dat, get_len);
+  join
+
+  p_out = get_dat;  // Implicit width conversion from the capture buffer to fp2_jb_point_t
+
+  $display("Expected:");
+  print_fp2_jb_point(p_exp);
+  $display("Was:");
+  print_fp2_jb_point(p_out);
+
+  if (p_exp != p_out) begin  // Any mismatch ends the simulation via $fatal
+    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
+  end
+
+  $display("test PASSED");
+
+end
+endtask;
+
+fp2_jb_point_t one_point = '{x:FE2_one, y:FE2_one, z:FE2_one};
+fp2_jb_point_t two_point = '{x:'{c1:381'd2, c0:381'd2}, y:'{c1:381'd2, c0:381'd2}, z:FE2_one};
+
+fp2_jb_point_t g2_point_dbl = '{x:'{c0:381'd2004569552561385659566932407633616698939912674197491321901037400001042336021538860336682240104624979660689237563240,
+         c1:381'd3955604752108186662342584665293438104124851975447411601471797343177761394177049673802376047736772242152530202962941},
+         y:'{c0:381'd978142457653236052983988388396292566217089069272380812666116929298652861694202207333864830606577192738105844024927,
+         c1:381'd2248711152455689790114026331322133133284196260289964969465268080325775757898907753181154992709229860715480504777099},
+         z:'{c0:381'd3145673658656250241340817105688138628074744674635286712244193301767486380727788868972774468795689607869551989918920,
+         c1:381'd968254395890002185853925600926112283510369004782031018144050081533668188797348331621250985545304947843412000516197}};
+
+initial begin
+  out_if.rdy = 0;
+  in_if.val = 0;
+  #(40*CLK_PERIOD);
+
+ test(g2_point, g2_point_dbl, add_fp2_jb_point(g2_point, g2_point_dbl)
+     /*'{x:'{c0:381'd2260316515795278483227354417550273673937385151660885802822200676798473320332386191812885909324314180009401590033496,
+        c1:381'd3157705674295752746643045744187038651144673626385096899515739718638356953289853357506730468806346866010850469607484},
+        y:'{c0:381'd3116406908094559010983016654096953279342014296159903648784769141704444407188785914041577477129027384530629024324101,
+        c1:381'd624739198846365065958511422206549337298084868949577950118937104460230094422413163466712508875838914229203179007739},
+        z:'{c0:381'd1372365362697527824661960056804989242334959973433633343888520294361286317391588271032081626721722944066233963018813,
+        c1:381'd135340553306575460225879133388402231094623862625345515492709522456301372944095308361691014711792956665222682354141}}*/);
+
+  #1us $finish();
+end
+endmodule
--- a/ip_cores/ec/src/tb/ec_fp2_point_dbl_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_dbl_tb.sv
@ -0,0 +1,211 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+module ec_fp2_point_dbl_tb ();
+import common_pkg::*;
+import bls12_381_pkg::*;
+
+localparam CLK_PERIOD = 1000;
+
+logic clk, rst;
+
+if_axi_stream #(.DAT_BYTS(($bits(fp2_jb_point_t)+7)/8)) in_if(clk); // One point
+if_axi_stream #(.DAT_BYTS(($bits(fp2_jb_point_t)+7)/8)) out_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) mult_in_if(clk);
+if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) mult_out_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) add_in_if(clk);
+if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) add_out_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) sub_in_if(clk);
+if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(16)) sub_out_if(clk);
+
+fp2_jb_point_t in_p1, out_p;
+
+always_comb begin
+  in_p1 = in_if.dat;
+  out_if.dat = out_p;
+end
+
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+always_comb begin
+  out_if.sop = 1;
+  out_if.eop = 1;
+  out_if.ctl = 0;
+  out_if.mod = 0;
+end
+
+// Report an error whenever the DUT presents a valid output with its error
+// flag set (out_if.val / out_if.err are driven by the DUT's o_val / o_err).
+// $error takes its format string as the first argument; unlike $fatal it has
+// no leading finish-number argument, so none is passed here.
+always_ff @ (posedge clk)
+  if (out_if.val && out_if.err)
+    $error("%m %t ERROR: output .err asserted", $time);
+
+ec_fp2_point_dbl #(
+  .FP2_TYPE ( fp2_jb_point_t ),
+  .FE_TYPE  ( fe_t           ),
+  .FE2_TYPE ( fe2_t          )
+)
+ec_fp2_point_dbl (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+    // Input points
+  .i_p   ( in_p1      ),
+  .i_val ( in_if.val  ),
+  .o_rdy ( in_if.rdy  ),
+  .o_p   ( out_p      ),
+  .o_err ( out_if.err ),
+  .i_rdy ( out_if.rdy ),
+  .o_val ( out_if.val ) ,
+  .o_mul_if ( mult_in_if ),
+  .i_mul_if ( mult_out_if ),
+  .o_add_if ( add_in_if ),
+  .i_add_if ( add_out_if ),
+  .o_sub_if ( sub_in_if ),
+  .i_sub_if ( sub_out_if )
+);
+
+always_comb begin
+  mult_out_if.sop = 1;
+  mult_out_if.eop = 1;
+  mult_out_if.err = 0;
+  mult_out_if.mod = 1;
+
+  add_out_if.sop = 1;
+  add_out_if.eop = 1;
+  add_out_if.err = 0;
+  add_out_if.mod = 1;
+
+  sub_out_if.sop = 1;
+  sub_out_if.eop = 1;
+  sub_out_if.err = 0;
+  sub_out_if.mod = 1;
+end
+
+
+// Attach the modular multiplier, adder and subtractor units that service the DUT's arithmetic interfaces
+ec_fp_mult_mod #(
+  .P             ( P   ),
+  .KARATSUBA_LVL ( 3   ),
+  .CTL_BITS      ( 16  )
+)
+ec_fp_mult_mod (
+  .i_clk( clk         ),
+  .i_rst( rst         ),
+  .i_dat_a ( mult_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
+  .i_dat_b ( mult_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
+  .i_val ( mult_in_if.val ),
+  .i_ctl ( mult_in_if.ctl ),
+  .o_rdy ( mult_in_if.rdy ),
+  .o_dat ( mult_out_if.dat ),
+  .i_rdy ( mult_out_if.rdy ),
+  .o_val ( mult_out_if.val ),
+  .o_ctl ( mult_out_if.ctl )
+);
+
+adder_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+adder_pipe (
+  .i_clk ( clk        ),
+  .i_rst ( rst        ),
+  .i_dat_a ( add_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
+  .i_dat_b ( add_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
+  .i_ctl ( add_in_if.ctl ),
+  .i_val ( add_in_if.val  ),
+  .o_rdy ( add_in_if.rdy  ),
+  .o_dat ( add_out_if.dat ),
+  .o_val ( add_out_if.val ),
+  .o_ctl ( add_out_if.ctl ),
+  .i_rdy ( add_out_if.rdy )
+);
+
+subtractor_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( P   ),
+  .CTL_BITS ( 16  ),
+  .LEVEL    ( 2   )
+)
+subtractor_pipe (
+  .i_clk ( clk        ),
+  .i_rst ( rst        ),
+  .i_dat_a ( sub_in_if.dat[0 +: bls12_381_pkg::DAT_BITS] ),
+  .i_dat_b ( sub_in_if.dat[bls12_381_pkg::DAT_BITS +: bls12_381_pkg::DAT_BITS] ),
+  .i_ctl ( sub_in_if.ctl ),
+  .i_val ( sub_in_if.val  ),
+  .o_rdy ( sub_in_if.rdy  ),
+  .o_dat ( sub_out_if.dat ),
+  .o_val ( sub_out_if.val ),
+  .o_ctl ( sub_out_if.ctl ),
+  .i_rdy ( sub_out_if.rdy )
+);
+
+task test(input fp2_jb_point_t p1, p_exp);  // Streams p1 into the DUT and compares the returned point against p_exp
+begin
+  integer signed get_len;  // Length reported by get_stream; not checked in this task
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected,  get_dat;  // NOTE(review): 'expected' is never referenced in this task
+  fp2_jb_point_t p_out;
+  $display("Running test ...");
+
+
+  fork  // Drive the input and collect the output in parallel; join waits for both to finish
+    in_if.put_stream(p1, (($bits(fp2_jb_point_t)+7)/8));
+    out_if.get_stream(get_dat, get_len, 0);  // NOTE(review): meaning of the third argument is defined by if_axi_stream - confirm there
+  join
+
+  p_out = get_dat;  // Implicit width conversion from the capture buffer to fp2_jb_point_t
+  
+  $display("Input:");
+  print_fp2_jb_point(p1);
+  $display("Expected:");
+  print_fp2_jb_point(p_exp);
+  $display("Was:");
+  print_fp2_jb_point(p_out);
+
+  if (p_exp != p_out) begin  // Any mismatch ends the simulation via $fatal
+    $fatal(1, "%m %t ERROR: test_0 point was wrong", $time);
+  end
+
+  $display("test PASSED");
+
+end
+endtask;
+
+
+initial begin
+  out_if.rdy = 0;
+  in_if.val = 0;
+  #(40*CLK_PERIOD);
+
+ test(g2_point, dbl_fp2_jb_point(g2_point));
+
+  #1us $finish();
+end
+endmodule
--- a/ip_cores/ec/src/tb/ec_fp_point_dbl_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_dbl_tb.sv
@ -16,7 +16,7 @@
 */
 `timescale 1ps/1ps

-module ec_fp_point_dbl_tb ();
+module ec_point_dbl_tb ();

 import common_pkg::*;
 import bls12_381_pkg::*;
@ -68,11 +68,11 @@ always_ff @ (posedge clk)
  if (out_if.val && out_if.err)
    $error(1, "%m %t ERROR: output .err asserted", $time);

-ec_fp_point_dbl #(
-  .P          ( P ),
-  .POINT_TYPE ( jb_point_t )
+ec_point_dbl #(
+  .FP_TYPE ( jb_point_t ),
+  .FE_TYPE ( fe_t )
 )
-ec_fp_point_dbl (
+ec_point_dbl (
  .i_clk ( clk ),
  .i_rst ( rst ),
  .i_p   ( in_p ),
@ -82,8 +82,8 @@ ec_fp_point_dbl (
  .o_err ( out_if.err ),
  .i_rdy ( out_if.rdy ),
  .o_val  ( out_if.val ) ,
-  .o_mult_if ( mult_in_if ),
-  .i_mult_if ( mult_out_if ),
+  .o_mul_if ( mult_in_if ),
+  .i_mul_if ( mult_out_if ),
  .o_add_if ( add_in_if ),
  .i_add_if ( add_out_if ),
  .o_sub_if ( sub_in_if ),
@ -102,7 +102,6 @@ ec_fp_mult_mod (
  .i_dat_a ( mult_in_if.dat[0 +: 381] ),
  .i_dat_b ( mult_in_if.dat[381 +: 381] ),
  .i_val ( mult_in_if.val ),
-  .i_err ( mult_in_if.err ),
  .i_ctl ( mult_in_if.ctl ),
  .o_rdy ( mult_in_if.rdy ),
  .o_dat ( mult_out_if.dat ),
@ -156,16 +155,16 @@ always_comb begin
  mult_out_if.eop = 1;
  mult_out_if.err = 0;
  mult_out_if.mod = 1;
-  
+
  add_out_if.sop = 1;
  add_out_if.eop = 1;
  add_out_if.err = 0;
  add_out_if.mod = 1;
-  
+
  sub_out_if.sop = 1;
  sub_out_if.eop = 1;
  sub_out_if.err = 0;
-  sub_out_if.mod = 1;  
+  sub_out_if.mod = 1;
 end

 task test_0();
@ -200,14 +199,13 @@ begin
 end
 endtask;

-function compare_point();
-
-endfunction
-
 initial begin
  out_if.rdy = 0;
  in_if.val = 0;
  #(40*CLK_PERIOD);
+  
+  print_jb_point(to_affine(g_point));
+  print_jb_point(to_affine(dbl_jb_point(g_point)));

  test_0();

--- a/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
@ -148,9 +148,10 @@ initial begin
  in_if.val = 0;
  #(40*CLK_PERIOD);

-  test(381'h1);
+  /*test(381'h1);
  test(381'h5);
-  test(381'h10);
+  test(381'h10);*/
+  //test(1 << 380);
  test(381'h9f5193de96ab6e65e7c7df8adcec4e82b971dd5f54d1c62103776d3eef0255ae346eba9e29eb08c3a957e9a53afc3ce);
    
  #1us $finish();
--- a/ip_cores/ec/src/tb/ec_point_add_tb.sv
+++ b/ip_cores/ec/src/tb/ec_point_add_tb.sv
@ -39,8 +39,8 @@ if_axi_stream #(.DAT_BITS(bls12_381_pkg::DAT_BITS), .CTL_BITS(8)) sub_out_if(clk
 jb_point_t in_p1, in_p2, out_p;

 always_comb begin
-  in_p1 = in_if.dat[0 +: bls12_381_pkg::DAT_BITS*3];
-  in_p2 = in_if.dat[bls12_381_pkg::DAT_BITS*3 +: bls12_381_pkg::DAT_BITS*3];
+  in_p1 = in_if.dat[0 +: $bits(jb_point_t)];
+  in_p2 = in_if.dat[$bits(jb_point_t) +: $bits(jb_point_t)];
  out_if.dat = out_p;
 end

@ -67,9 +67,8 @@ always_ff @ (posedge clk)
    $error(1, "%m %t ERROR: output .err asserted", $time);

 ec_point_add #(
-  .P       ( P ),
  .FP_TYPE ( jb_point_t ),
-  .FE_TPYE ( fe_t )
+  .FE_TYPE ( fe_t )
 )
 ec_point_add (
  .i_clk ( clk ),
--- a/ip_cores/memory/src/rtl/uram.sv
+++ b/ip_cores/memory/src/rtl/uram.sv
@ -16,7 +16,7 @@ module uram_reset #(
  if_ram.sink b
 );

-if_ram #(.RAM_WIDTH(RAM_WIDTH), .RAM_DEPTH(RAM_DEPTH)) if_ram_a(.i_clk(a.i_clk), .i_rst(a.i_rst));
+if_ram #(.RAM_WIDTH(RAM_WIDTH), .RAM_DEPTH(RAM_DEPTH), .BYT_EN($bits(a.we))) if_ram_a(.i_clk(a.i_clk), .i_rst(a.i_rst));

 logic reset_done;
 logic [RAM_DEPTH-1:0] addr;
@ -34,7 +34,7 @@ end
 always_comb begin
  if_ram_a.a =  reset_done ? a.a : addr;
  if_ram_a.en = reset_done ? a.en : 1'd1;
-  if_ram_a.we = reset_done ? a.we : 1'd1;
+  if_ram_a.we = reset_done ? a.we : {$bits(a.we){1'd1}};
  if_ram_a.re = a.re;
  if_ram_a.d =  reset_done ? a.d : {RAM_WIDTH{1'd0}};
  a.q = if_ram_a.q;
@ -64,85 +64,96 @@ module uram #(
 // Check RAM sizes match the interface
 initial begin
  assert ($bits(a.d) == RAM_WIDTH) else $fatal(1, "%m %t ERROR: bram RAM_WIDTH (%d) does not match interface a (%d)", $time, RAM_WIDTH, $bits(a.d));
-  assert ($bits(a.a) == $clog2(RAM_DEPTH)) else $fatal(1, "%m %t ERROR: bram $clog2(RAM_DEPTH) (%d) does not match interface a (%d)", $time, $clog2(RAM_DEPTH), $bits(a.a));
+  // RAM_DEPTH is compared directly against the address bus width (it is also passed as AWIDTH below)
+  assert ($bits(a.a) == RAM_DEPTH) else $fatal(1, "%m %t ERROR: bram RAM_DEPTH (%d) does not match address width of interface a (%d)", $time, (RAM_DEPTH), $bits(a.a));
  assert ($bits(b.d) == RAM_WIDTH) else $fatal(1, "%m %t ERROR: bram RAM_WIDTH (%d) does not match interface b (%d)", $time, RAM_WIDTH, $bits(b.d));
-  assert ($bits(b.a) == $clog2(RAM_DEPTH)) else $fatal(1, "%m %t ERROR: bram $clog2(RAM_DEPTH) (%d) does not match interface b (%d)", $time, $clog2(RAM_DEPTH), $bits(b.a));
+  assert ($bits(b.a) == RAM_DEPTH) else $fatal(1, "%m %t ERROR: bram RAM_DEPTH (%d) does not match address width of interface b (%d)", $time, (RAM_DEPTH), $bits(b.a));
 end
-
-xilinx_ultraram_true_dual_port #(
-  .AWIDTH ( $clog2(RAM_DEPTH) ),
-  .DWIDTH ( RAM_WIDTH ),
-  .NBPIPE ( PIPELINES )
-)
-uram_instance (
-  .addra(a.a),
-  .addrb(b.a),
-  .dina(a.d),
-  .dinb(b.d),
-  .clk(a.i_clk),
-  .wea(a.we),
-  .web(b.we),
-  .mem_ena(a.en),
-  .mem_enb(b.en),
-  .rsta(a.i_rst),
-  .rstb(b.i_rst),
-  .regcea(a.re),
-  .regceb(b.re),
-  .douta(a.q),
-  .doutb(b.q)
- );
+ 
+ // xilinx_ultraram_true_dual_port
+ xilinx_ultraram_true_dual_port_bytewrite #(
+   .AWIDTH ( RAM_DEPTH ),
+   .DWIDTH ( RAM_WIDTH ),
+   .NBPIPE ( PIPELINES ),
+   .NUM_COL ( $bits(a.we) )
+ )
+ uram_instance (
+   .addra(a.a),
+   .addrb(b.a),
+   .dina(a.d),
+   .dinb(b.d),
+   .clk(a.i_clk),
+   .wea(a.we),
+   .web(b.we),
+   .mem_ena(a.en),
+   .mem_enb(b.en),
+   .rsta(a.i_rst),
+   .rstb(b.i_rst),
+   .regcea(a.re),
+   .regceb(b.re),
+   .douta(a.q),
+   .doutb(b.q)
+  );

 endmodule

-module xilinx_ultraram_true_dual_port #(
-  parameter AWIDTH = 12,  // Address Width
-  parameter DWIDTH = 72,  // Data Width
-  parameter NBPIPE = 3    // Number of pipeline Registers
- ) (
-  input clk,                    // Clock
-  // Port A
-  input rsta,                   // Reset
-  input wea,                    // Write Enable
-  input regcea,                 // Output Register Enable
-  input mem_ena,                // Memory Enable
-  input [DWIDTH-1:0] dina,      // Data Input
-  input [AWIDTH-1:0] addra,     // Address Input
-  output reg [DWIDTH-1:0] douta,// Data Output
+module xilinx_ultraram_true_dual_port_bytewrite #(
+  parameter AWIDTH  = 12,  // Address Width
+  parameter NUM_COL = 9,   // Number of columns
+  parameter DWIDTH  = 72,  // Data Width, (Byte * NUM_COL) 
+  parameter NBPIPE  = 3    // Number of pipeline Registers
+ ) ( 
+    input clk,                    // Clock
+    // Port A
+    input rsta,                   // Reset
+    input [NUM_COL-1:0] wea,      // Write Enable
+    input regcea,                 // Output Register Enable
+    input mem_ena,                // Memory Enable
+    input [DWIDTH-1:0] dina,      // Data Input  
+    input [AWIDTH-1:0] addra,     // Address Input
+    output reg [DWIDTH-1:0] douta,// Data Output

-  // Port B
-  input rstb,                   // Reset
-  input web,                    // Write Enable
-  input regceb,                 // Output Register Enable
-  input mem_enb,                // Memory Enable
-  input [DWIDTH-1:0] dinb,      // Data Input
-  input [AWIDTH-1:0] addrb,     // Address Input
-  output reg [DWIDTH-1:0] doutb // Data Output
- );
+    // Port B
+    input rstb,                   // Reset
+    input [NUM_COL-1:0] web,      // Write Enable
+    input regceb,                 // Output Register Enable
+    input mem_enb,                // Memory Enable
+    input [DWIDTH-1:0] dinb,      // Data Input  
+    input [AWIDTH-1:0] addrb,     // Address Input
+    output reg [DWIDTH-1:0] doutb // Data Output
+   );

 (* ram_style = "ultra" *)
 reg [DWIDTH-1:0] mem[(1<<AWIDTH)-1:0];        // Memory Declaration

-reg [DWIDTH-1:0] memrega;
+reg [DWIDTH-1:0] memrega;              
 reg [DWIDTH-1:0] mem_pipe_rega[NBPIPE-1:0];    // Pipelines for memory
-reg mem_en_pipe_rega[NBPIPE:0];                // Pipelines for memory enable
+reg mem_en_pipe_rega[NBPIPE:0];                // Pipelines for memory enable  

-reg [DWIDTH-1:0] memregb;
+reg [DWIDTH-1:0] memregb;              
 reg [DWIDTH-1:0] mem_pipe_regb[NBPIPE-1:0];    // Pipelines for memory
-reg mem_en_pipe_regb[NBPIPE:0];                // Pipelines for memory enable
+reg mem_en_pipe_regb[NBPIPE:0];                // Pipelines for memory enable  

 integer          i;
+localparam CWIDTH = DWIDTH/NUM_COL;

 // RAM : Read has one latency, Write has one latency as well.
 always @ (posedge clk)
 begin
- if(mem_ena)
+ if(mem_ena) 
  begin
-   if(wea)
-    mem[addra] <= dina;
-   else
-    memrega <= mem[addra];
-  end
+  for(i = 0;i<NUM_COL;i=i+1) 
+	 if(wea[i])
+    mem[addra][i*CWIDTH +: CWIDTH] <= dina[i*CWIDTH +: CWIDTH];
+  end     
 end
+
+always @ (posedge clk)
+begin
+ if(mem_ena)
+  if(~|wea)
+    memrega <= mem[addra];
+end
+
 // The enable of the RAM goes through a pipeline to produce a
 // series of pipelined enable signals required to control the data
 // pipeline.
@ -175,19 +186,26 @@ begin
  douta <= 0;
 else if (mem_en_pipe_rega[NBPIPE] && regcea)
  douta <= mem_pipe_rega[NBPIPE-1];
-end
+end 

 // RAM : Read has one latency, Write has one latency as well.
 always @ (posedge clk)
 begin
- if(mem_enb)
+ if(mem_enb) 
  begin
-   if(web)
-    mem[addrb] <= dinb;
-   else
-    memregb <= mem[addrb];
-  end
+  for(i=0;i<NUM_COL;i=i+1)
+	 if(web[i])
+    mem[addrb][i*CWIDTH +: CWIDTH] <= dinb[i*CWIDTH +: CWIDTH];
+  end     
 end
+
+always @ (posedge clk)
+begin
+ if(mem_enb)
+  if(~|web)
+    memregb <= mem[addrb];
+end
+
 // The enable of the RAM goes through a pipeline to produce a
 // series of pipelined enable signals required to control the data
 // pipeline.
@ -220,6 +238,7 @@ begin
  doutb <= 0;
 else if (mem_en_pipe_regb[NBPIPE] && regceb)
  doutb <= mem_pipe_regb[NBPIPE-1];
-end
+end 
+
 endmodule

--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@ -21,25 +21,55 @@ package bls12_381_pkg;
  localparam DAT_BITS = 381;
  localparam MUL_BITS = 384;
  localparam [DAT_BITS-1:0] P = 381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab;
-  localparam [DAT_BITS-1:0] Gx = 381'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
-  localparam [DAT_BITS-1:0] Gy = 381'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;

+  typedef logic [DAT_BITS-1:0] fe_t;

+  fe_t Gx = 381'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
+  fe_t Gy = 381'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;
+
+  typedef enum logic [2:0] {
+    SCALAR = 0,
+    FE = 1,
+    FE2 = 2,
+    FE12 = 3,
+    FP_AF = 4,
+    FP_JB = 5,
+    FP2_AF = 6,
+    FP2_JB = 7
+  } point_type_t;

  // Jacobian coordinates for Fp elements
  typedef struct packed {
-    logic [DAT_BITS-1:0] x, y, z;
+    fe_t x, y, z;
  } jb_point_t;

+  typedef struct packed {
+    fe_t c1, c0;   
+  } fe2_t;
+
+  fe2_t G2x = '{c0:381'd352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160,
+                c1:381'd3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758};
+
+  fe2_t G2y = '{c0:381'd1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905,
+                c1:381'd927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582};
+
+  fe2_t FE2_one =  '{c0:381'd1, c1:381'd0};
+
+  jb_point_t g_point = '{x:Gx, y:Gy, z:381'd1};
+
  // Jacobian coordinates for Fp^2 elements
  typedef struct packed {
-    jb_point_t fp1_a, fp1_b;
+    fe2_t x, y, z;
  } fp2_jb_point_t;

+
+  fp2_jb_point_t g2_point = '{x:G2x, y:G2y, z:FE2_one};
+
  // Instruction codes
  typedef enum logic [7:0] {
-    NOOP_WAIT = 8'h0,
-    FP_POINT_MULT = 8'h20
+    NOOP_WAIT       = 8'h0,
+    COPY_REG        = 8'h1,
+    FP_FPOINT_MULT  = 8'h26
  } code_t;

  // Instruction format
@ -48,45 +78,26 @@ package bls12_381_pkg;
    code_t code;
  } inst_t;

-  localparam DATA_RAM_WIDTH = 381;
-  localparam DATA_RAM_DEPTH = $clog2(64);
-  localparam INST_RAM_WIDTH = $bits(inst_t);
-  localparam INST_RAM_DEPTH = $clog2(1024);
+  localparam DATA_RAM_WIDTH = $bits(point_type_t) + DAT_BITS;
+  localparam DATA_RAM_ALIGN_BYTE = 64;
+  localparam DATA_RAM_DEPTH = 8;
+  localparam DATA_RAM_USR_WIDTH = 4;
+  localparam DATA_RAM_USR_DEPTH = DATA_RAM_DEPTH*DATA_RAM_ALIGN_BYTE/DATA_RAM_USR_WIDTH;
+
+  localparam INST_RAM_WIDTH = $bits(inst_t);
+  localparam INST_RAM_ALIGN_BYTE = 8;
+  localparam INST_RAM_DEPTH = 8;
+  localparam INST_RAM_USR_WIDTH = 4;
+  localparam INST_RAM_USR_DEPTH = INST_RAM_DEPTH*INST_RAM_ALIGN_BYTE/INST_RAM_USR_WIDTH;
+
+

-  jb_point_t g_point = '{x:Gx, y:Gy, z:1};

  function is_zero(jb_point_t p);
    is_zero = (p.x == 0 && p.y == 0 && p.z == 1);
    return is_zero;
  endfunction

-   // Function to double point in Jacobian coordinates (for comparison in testbench)
-   // Here a is 0, and we also mod the result
-   function jb_point_t dbl_jb_point(input jb_point_t p);
-     logic signed [1023:0] I_X, I_Y, I_Z, A, B, C, D, X, Y, Z;
-
-     if (p.z == 0) return p;
-
-     I_X = p.x;
-     I_Y = p.y;
-     I_Z = p.z;
-     A = (I_Y*I_Y) % P;
-     B = (((4*I_X) % P)*A) % P;
-     C = (((8*A) % P)*A) % P;
-     D = (((3*I_X)% P)*I_X) % P;
-     X = (D*D)% P;
-     X = X + ((2*B) % P > X ? P : 0) - (2*B) % P;
-
-     Y = (D*((B + (X > B ? P : 0)-X) % P)) % P;
-     Y = Y + (C > Y ? P : 0) - C;
-     Z = (((2*I_Y)% P)*I_Z) % P;
-
-     dbl_jb_point.x = X;
-     dbl_jb_point.y = Y;
-     dbl_jb_point.z = Z;
-     return dbl_jb_point;
-   endfunction
-
   function jb_point_t add_jb_point(jb_point_t p1, p2);
     logic signed [1023:0] A, U1, U2, S1, S2, H, H3, R;

@ -125,7 +136,127 @@ package bls12_381_pkg;
     add_jb_point.y = A + (add_jb_point.y > A ? P : 0) - add_jb_point.y;

   endfunction
+   
+   // Addition in Fp. Both operands are widened by one bit so the raw sum
+   // cannot overflow, then P is subtracted once if the sum reached P.
+   // The result is fully reduced when a and b are both already below P.
+   function fe_t fe_add(fe_t a, b);
+     logic [$bits(fe_t):0] wide_a, wide_b, sum;
+     wide_a = a;
+     wide_b = b;
+     sum = wide_a + wide_b;
+     if (sum >= P)
+       sum = sum - P;
+     fe_add = sum;
+   endfunction
+   
+   // Addition in Fp^2: the c0 and c1 coefficients are added independently in Fp.
+   function fe2_t fe2_add(fe2_t a, b);
+     fe_t sum_c0, sum_c1;
+     sum_c0 = fe_add(a.c0, b.c0);
+     sum_c1 = fe_add(a.c1, b.c1);
+     fe2_add.c0 = sum_c0;
+     fe2_add.c1 = sum_c1;
+   endfunction
+   
+   // Subtraction in Fp: a - b, with P added back once when b is larger than a
+   // so the result does not go negative. Operands are widened by one bit first.
+   function fe_t fe_sub(fe_t a, b);
+     logic [$bits(fe_t):0] wide_a, wide_b, diff;
+     wide_a = a;
+     wide_b = b;
+     diff = wide_a - wide_b;
+     if (wide_b > wide_a)
+       diff = diff + P;
+     fe_sub = diff;
+   endfunction
+   
+   // Subtraction in Fp^2: the c0 and c1 coefficients are subtracted independently in Fp.
+   function fe2_t fe2_sub(fe2_t a, b);
+     fe_t diff_c0, diff_c1;
+     diff_c0 = fe_sub(a.c0, b.c0);
+     diff_c1 = fe_sub(a.c1, b.c1);
+     fe2_sub.c0 = diff_c0;
+     fe2_sub.c1 = diff_c1;
+   endfunction
+  
+   // Multiplication in Fp: returns (a * b) mod P.
+   // The operands are single Fp elements, so they are declared as fe_t (they
+   // were previously declared fe2_t, which only produced a full-width product
+   // because that type happens to be twice as wide). The product is formed in
+   // an explicit double-width temporary so no high bits are lost before the
+   // reduction by P.
+   function fe_t fe_mul(fe_t a, b);
+     logic [2*$bits(fe_t)-1:0] prod;
+     prod = a * b;       // operands are extended to the width of prod
+     fe_mul = prod % P;
+   endfunction
+    
+   // Multiplication in Fp^2 using the four cross products of the coefficients:
+   //   c0 = a.c0*b.c0 - a.c1*b.c1
+   //   c1 = a.c0*b.c1 + a.c1*b.c0
+   function fe2_t fe2_mul(fe2_t a, b);
+     fe_t p00, p11, p01, p10;
+     p00 = fe_mul(a.c0, b.c0);
+     p11 = fe_mul(a.c1, b.c1);
+     p01 = fe_mul(a.c0, b.c1);
+     p10 = fe_mul(a.c1, b.c0);
+     fe2_mul.c0 = fe_sub(p00, p11);
+     fe2_mul.c1 = fe_add(p01, p10);
+   endfunction
+   
+   // Doubles a point given in Jacobian coordinates (used for comparison in
+   // the testbench). The curve coefficient a is taken to be 0 and every
+   // intermediate value is reduced through the fe_* helpers.
+   // A point with z == 0 is returned unchanged.
+   function jb_point_t dbl_jb_point(input jb_point_t p);
+     fe_t px, py, pz;
+     fe_t y_sq, s, c8, m, x3, y3, z3;
+
+     if (p.z == 0) return p;
+
+     px = p.x;
+     py = p.y;
+     pz = p.z;
+
+     y_sq = fe_mul(py, py);                    // Y^2
+     s    = fe_mul(fe_mul(4, px), y_sq);       // 4*X*Y^2
+     c8   = fe_mul(fe_mul(8, y_sq), y_sq);     // 8*Y^4
+     m    = fe_mul(fe_mul(3, px), px);         // 3*X^2
+
+     x3 = fe_sub(fe_mul(m, m), fe_mul(2, s));
+     y3 = fe_sub(fe_mul(m, fe_sub(s, x3)), c8);
+     z3 = fe_mul(fe_mul(2, py), pz);
+
+     dbl_jb_point.x = x3;
+     dbl_jb_point.y = y3;
+     dbl_jb_point.z = z3;
+     return dbl_jb_point;
+   endfunction
+   
+   // Doubles a point whose Jacobian coordinates are Fp^2 elements. Same
+   // sequence of operations as dbl_jb_point, using the fe2_* helpers.
+   // A point with z == 0 is returned unchanged.
+   function fp2_jb_point_t dbl_fp2_jb_point(input fp2_jb_point_t p);
+     fe2_t px, py, pz;
+     fe2_t y_sq, s, c8, m, x3, y3, z3;
+
+     if (p.z == 0) return p;
+
+     px = p.x;
+     py = p.y;
+     pz = p.z;
+
+     y_sq = fe2_mul(py, py);                     // Y^2
+     s    = fe2_mul(fe2_mul(4, px), y_sq);       // 4*X*Y^2
+     c8   = fe2_mul(fe2_mul(8, y_sq), y_sq);     // 8*Y^4
+     m    = fe2_mul(fe2_mul(3, px), px);         // 3*X^2
+
+     x3 = fe2_sub(fe2_mul(m, m), fe2_mul(2, s));
+     y3 = fe2_sub(fe2_mul(m, fe2_sub(s, x3)), c8);
+     z3 = fe2_mul(fe2_mul(2, py), pz);
+
+     dbl_fp2_jb_point.x = x3;
+     dbl_fp2_jb_point.y = y3;
+     dbl_fp2_jb_point.z = z3;
+     return dbl_fp2_jb_point;
+   endfunction
+   
+  // Adds two points whose Jacobian coordinates are Fp^2 elements.
+  //   U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3
+  //   H  = U2 - U1, R  = S2 - S1
+  //   X3 = R^2 - H^3 - 2*U1*H^2
+  //   Y3 = R*(U1*H^2 - X3) - S1*H^3
+  //   Z3 = H*Z1*Z2
+  // When x and y of both inputs match, the point is doubled instead.
+  // Fix: H^3 is now subtracted when forming X3 (it was previously added),
+  // which follows from X3 = R^2 - (U1 + U2)*H^2 with U2 = U1 + H.
+  function fp2_jb_point_t add_fp2_jb_point(fp2_jb_point_t p1, p2);
+      fe2_t A, U1, U2, S1, S2, H, H3, R;
+
+      if (p1.y == p2.y && p1.x == p2.x)
+        return (dbl_fp2_jb_point(p1));
+
+      U1 = fe2_mul(p1.x, p2.z);
+      U1 = fe2_mul(U1, p2.z);
+
+      U2 = fe2_mul(p2.x, p1.z);
+      U2 = fe2_mul(U2, p1.z);
+      S1 = fe2_mul(p1.y, p2.z);
+      S1 = fe2_mul(fe2_mul(S1, p2.z), p2.z);
+      S2 = fe2_mul(p2.y, p1.z);
+      S2 = fe2_mul(fe2_mul(S2, p1.z), p1.z);
+      H = fe2_sub(U2, U1);
+      R = fe2_sub(S2, S1);
+      H3 = fe2_mul(fe2_mul(H, H), H);
+      A = fe2_mul(fe2_mul(fe2_mul(2, U1), H), H);   // 2*U1*H^2
+
+      add_fp2_jb_point.z = fe2_mul(fe2_mul(H, p1.z), p2.z);
+      add_fp2_jb_point.x = fe2_mul(R, R);
+
+      // X3 = R^2 - H^3 - 2*U1*H^2
+      add_fp2_jb_point.x = fe2_sub(add_fp2_jb_point.x, H3);
+      add_fp2_jb_point.x = fe2_sub(add_fp2_jb_point.x, A);
+
+      A = fe2_mul(fe2_mul(U1, H), H);               // U1*H^2
+      A = fe2_sub(A, add_fp2_jb_point.x);
+      A = fe2_mul(A, R);
+      add_fp2_jb_point.y = fe2_mul(S1, H3);
+
+      add_fp2_jb_point.y = fe2_sub(A, add_fp2_jb_point.y);
+
+    endfunction
+   
   function jb_point_t point_mult(logic [DAT_BITS-1:0] c, jb_point_t p);
     jb_point_t result, addend;
     result = 0;
@ -143,12 +274,78 @@ package bls12_381_pkg;
   // Evaluates y^2 - x^3 - a*x*z^4 - b*z^6 for the given Jacobian point and
   // returns the result (zero would mean the curve equation is satisfied).
   // NOTE(review): the a and b constants are taken from secp256k1_pkg although
   // this function lives in bls12_381_pkg, the arithmetic is not reduced mod P,
   // and the function declares no return type, so the value is truncated to the
   // implicit return width - confirm this helper is still intended to be used.
   function on_curve(jb_point_t p);
     return (p.y*p.y - p.x*p.x*p.x - secp256k1_pkg::a*p.x*p.z*p.z*p.z*p.z - secp256k1_pkg::b*p.z*p.z*p.z*p.z*p.z*p.z);
   endfunction
-
+   
+   // Inversion using the binary extended euclidean algorithm.
+   // Returns b * a^-1 mod P; with the default b = 1 this is the inverse of a.
+   //   a : element to invert
+   //   b : optional numerator that seeds x1 (defaults to 1)
+   // An operand congruent to 0 mod P has no inverse. In that case 0 is
+   // returned, because otherwise u is (or becomes) 0 and the halving loop on
+   // u below would never terminate.
+   function fe_t fe_inv(fe_t a, b = 1);
+      fe_t u, v;
+      logic [$bits(fe_t):0] x1, x2;   // one extra bit so (x + P) cannot overflow
+
+      if ((a % P) == 0)
+        return 0;
+
+      u = a; v = P;
+      x1 = b; x2 = 0;
+      while (u != 1 && v != 1) begin
+        // Remove factors of two from u, halving x1 modulo P in step
+        while (u % 2 == 0) begin
+          u = u / 2;
+          if (x1 % 2 == 0)
+            x1 = x1 / 2;
+          else
+            x1 = (x1 + P) / 2;
+        end
+        // Remove factors of two from v, halving x2 modulo P in step
+        while (v % 2 == 0) begin
+          v = v / 2;
+          if (x2 % 2 == 0)
+            x2 = x2 / 2;
+          else
+            x2 = (x2 + P) / 2;
+        end
+        // Subtract the smaller value from the larger and mirror it on x1 / x2
+        if (u >= v) begin
+          u = u - v;
+          x1 = fe_sub(x1, x2);
+        end else begin
+          v = v - u;
+          x2 = fe_sub(x2, x1);
+        end
+      end
+      if (u == 1)
+        return x1;
+      else
+        return x2;
+   endfunction
+   
+   // This algorithm can also be used for division. NOTE(review): fe_inv(a, b) inverts a and seeds the accumulator with b, so this appears to evaluate b * a^-1 (b / a) rather than a / b - confirm the intended operand order.
+   function fe_t fe_div(fe_t a, b);
+     return fe_inv(a, b);
+   endfunction
+   
+   // Inversion in Fp^2: both coefficients are scaled by the Fp inverse of
+   // (c0^2 + c1^2), and the c1 coefficient is negated (computed as P - c1).
+   function fe2_t fe2_inv(fe2_t a);
+     fe_t norm_inv;
+     norm_inv = fe_inv(fe_add(fe_mul(a.c0, a.c0), fe_mul(a.c1, a.c1)));
+     fe2_inv.c0 = fe_mul(a.c0, norm_inv);
+     fe2_inv.c1 = fe_mul(fe_sub(P, a.c1), norm_inv);
+   endfunction
+   
+   // Converts a Jacobian point (X, Y, Z) to the equivalent point with z = 1:
+   //   x = X / Z^2,  y = Y / Z^3
+   // Fix: y is now derived from p.y (it was previously computed from p.x).
+   function jb_point_t to_affine(jb_point_t p);
+     fe_t z_;
+     z_ = fe_mul(p.z, p.z);                    // Z^2
+     to_affine.z = 1;
+     to_affine.x = fe_mul(p.x, fe_inv(z_));
+     z_ = fe_mul(z_, p.z);                     // Z^3
+     to_affine.y = fe_mul(p.y, fe_inv(z_));
+   endfunction
+   
   // Prints the x, y and z coordinates of a point in hex, one per line.
   function print_jb_point(jb_point_t p);
     $display("x:%h", p.x);
     $display("y:%h", p.y);
     $display("z:%h", p.z);
     return;
   endfunction
+   
+   function print_fp2_jb_point(fp2_jb_point_t p);   // Prints each coordinate of an Fp^2 point in hex, c1 coefficient first then c0
+     $display("x:(c1:%h, c0:%h)", p.x.c1, p.x.c0);
+     $display("y:(c1:%h, c0:%h)", p.y.c1, p.y.c0);
+     $display("z:(c1:%h, c0:%h)", p.z.c1, p.z.c0);
+     return;
+   endfunction   

 endpackage
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
@ -26,8 +26,8 @@ module bls12_381_top
  // Only tx interface is used to send messages to SW on a SEND-INTERRUPT instruction
  if_axi_stream.source tx_if,
  // User access to the instruction and register RAM
-  if_ram.sink          inst_ram_usr_if,
-  if_ram.sink          data_ram_usr_if,
+  if_axi_mm.sink       inst_usr_if,
+  if_axi_mm.sink       data_usr_if,
  // Configuration memory
  if_axi_mm.sink       cfg_usr_if
 );
@ -35,20 +35,23 @@ module bls12_381_top
 localparam DAT_BITS = bls12_381_pkg::DAT_BITS;

 // Instruction RAM
-localparam INST_READ_CYCLE = 3;
-logic [INST_READ_CYCLE:0] inst_ram_read;
-localparam DATA_READ_CYCLE = 3;
-logic [INST_READ_CYCLE:0] data_ram_read;
+localparam READ_CYCLE = 3;
+logic [READ_CYCLE:0] inst_ram_read, inst_usr_ram_read;
+logic [READ_CYCLE:0] data_ram_read, data_usr_ram_read;
+
 if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_sys_if(.i_clk(i_clk), .i_rst(i_rst));
+if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_usr_if(.i_clk(i_clk), .i_rst(i_rst));
 inst_t curr_inst;

 // Data RAM
-if_ram #(.RAM_WIDTH(bls12_381_pkg::DATA_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::DATA_RAM_DEPTH)) data_ram_sys_if(.i_clk(i_clk), .i_rst(i_rst));
+if_ram #(.RAM_WIDTH(bls12_381_pkg::DATA_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::DATA_RAM_DEPTH), .BYT_EN(48)) data_ram_sys_if(.i_clk(i_clk), .i_rst(i_rst));
+if_ram #(.RAM_WIDTH(bls12_381_pkg::DATA_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::DATA_RAM_DEPTH), .BYT_EN(48)) data_ram_usr_if(.i_clk(i_clk), .i_rst(i_rst));

 // Fp point multiplication
 if_axi_stream #(.DAT_BITS(DAT_BITS*3)) fp_pt_mult_in_if(i_clk);
 if_axi_stream #(.DAT_BITS(DAT_BITS*3)) fp_pt_mult_out_if(i_clk);

+
 logic [DAT_BITS-1:0] k_fp_in;
 logic [7:0] cnt;

@ -74,12 +77,12 @@ always_ff @ (posedge i_clk) begin
    inst_ram_sys_if.re <= 1;
    inst_ram_sys_if.en <= 1;
    inst_ram_read <= inst_ram_read << 1;
-    
+
    data_ram_sys_if.re <= 1;
    data_ram_sys_if.en <= 1;
    data_ram_sys_if.we <= 0;
    data_ram_read <= data_ram_read << 1;
-    
+
    if (fp_pt_mult_in_if.val && fp_pt_mult_in_if.rdy) fp_pt_mult_in_if.val <= 0;
    fp_pt_mult_out_if.rdy <= 1;

@ -87,8 +90,26 @@ always_ff @ (posedge i_clk) begin
      {NOOP_WAIT}: begin
        // Wait in this state
        inst_state <= curr_inst.code;
+        cnt <= 0;
      end
-      {FP_POINT_MULT}: begin
+      {COPY_REG}: begin
+        inst_ram_sys_if.a <= inst_ram_sys_if.a + 1;
+        inst_ram_read[0] <= 1;
+
+        data_ram_sys_if.a <= curr_inst.a;
+        data_ram_read[0] <= 1;
+
+        if (data_ram_read[READ_CYCLE]) begin
+          data_ram_sys_if.a <=  curr_inst.b;
+          data_ram_sys_if.d <= data_ram_sys_if.q;
+          data_ram_sys_if.we <= -1;
+        end
+
+        if (inst_ram_read[READ_CYCLE]) begin
+          inst_state <= curr_inst.code;
+        end
+      end
+      {FP_FPOINT_MULT}: begin
        case(cnt) inside
          0: begin
            data_ram_sys_if.a <= curr_inst.a;
@ -96,9 +117,9 @@ always_ff @ (posedge i_clk) begin
            cnt <= cnt + 1;
          end
          1: begin
-            if (data_ram_read[DATA_READ_CYCLE]) begin
+            if (data_ram_read[READ_CYCLE]) begin
              data_ram_sys_if.a <= curr_inst.b;
-              k_fp_in <= data_ram_sys_if.q;    
+              k_fp_in <= data_ram_sys_if.q;
              fp_pt_mult_in_if.dat <= bls12_381_pkg::g_point;
              fp_pt_mult_in_if.val <= 1;
              data_ram_read[0] <= 1;
@ -110,7 +131,7 @@ always_ff @ (posedge i_clk) begin
            fp_pt_mult_out_if.rdy <= 0;
            if (fp_pt_mult_out_if.val) begin
               data_ram_sys_if.d <= fp_pt_mult_out_if.dat;
-               data_ram_sys_if.we <= 1;
+               data_ram_sys_if.we <= -1;
               cnt <= cnt + 1;
            end
          end
@ -118,40 +139,99 @@ always_ff @ (posedge i_clk) begin
            fp_pt_mult_out_if.rdy <= 0;
            data_ram_sys_if.d <= fp_pt_mult_out_if.dat >> DAT_BITS;
            data_ram_sys_if.a <= data_ram_sys_if.a + 1;
-            data_ram_sys_if.we <= 1;
+            data_ram_sys_if.we <= -1;
            cnt <= cnt + 1;
          end
          4: begin
            data_ram_sys_if.d <= fp_pt_mult_out_if.dat >> (2*DAT_BITS);
-            data_ram_sys_if.we <= 1;
+            data_ram_sys_if.we <= -1;
            data_ram_sys_if.a <= data_ram_sys_if.a + 1;
            cnt <= cnt + 1;
            inst_ram_sys_if.a <= inst_ram_sys_if.a + 1;
            inst_ram_read[0] <= 1;
          end
          5: begin
-            if (inst_ram_read[INST_READ_CYCLE]) begin
+            if (inst_ram_read[READ_CYCLE]) begin
              inst_state <= curr_inst.code;
+              cnt <= 0;
            end
          end
-          
+
        endcase
      end
    endcase
  end
 end

-// Configuration registers
+// Configuration registers, instruction, data RAM
 always_ff @ (posedge i_clk) begin
  if (i_rst) begin
    cfg_usr_if.reset_sink();
-  end else begin
-    cfg_usr_if.rd_dat_val <= 0;
-    if (cfg_usr_if.wr) begin
+    inst_usr_if.reset_sink();
+    data_usr_if.reset_sink();

+    inst_ram_usr_if.reset_source();
+    data_ram_usr_if.reset_source();
+
+    inst_usr_ram_read <= 0;
+    data_usr_ram_read <= 0;
+
+  end else begin
+
+    data_usr_ram_read <= data_usr_ram_read << 1; // Shift register tracking an outstanding user read of the data RAM
+    inst_usr_ram_read <= inst_usr_ram_read << 1; // Shift register tracking an outstanding user read of the instruction RAM
+
+    cfg_usr_if.rd_dat_val <= 0;
+
+    data_usr_if.rd_dat <= data_ram_usr_if.q; // RAM output is registered onto the user bus every cycle; rd_dat_val marks when it is meaningful
+    inst_usr_if.rd_dat <= inst_ram_usr_if.q;
+
+    data_usr_if.rd_dat_val <= data_usr_ram_read[READ_CYCLE]; // Valid once the read flag set at bit 0 has shifted up to bit READ_CYCLE
+    inst_usr_if.rd_dat_val <= inst_usr_ram_read[READ_CYCLE];
+
+    inst_ram_usr_if.en <= 1;
+    inst_ram_usr_if.re <= 1;
+    inst_ram_usr_if.we <= 0; // Default: no write; overridden below on a user write
+
+    data_ram_usr_if.en <= 1;
+    data_ram_usr_if.re <= 1;
+    data_ram_usr_if.we <= 0; // Default: no write; overridden below on a user write
+
+    // Write access
+    if (data_usr_if.wr) begin
+      data_ram_usr_if.a <= data_usr_if.addr >> DATA_RAM_ALIGN_BYTE/DATA_RAM_USR_WIDTH; // NOTE(review): '/' binds tighter than '>>', so this shifts by ALIGN_BYTE/USR_WIDTH bits (16 if these resolve to the package values 64 and 4) - confirm a divide by the alignment was not intended
+      data_ram_usr_if.d <= data_usr_if.wr_dat << (data_usr_if.addr % DATA_RAM_ALIGN_BYTE)*8; // Move write data to the byte offset within the aligned word
+      data_ram_usr_if.we <= {8{1'd1}}  << (data_usr_if.addr % DATA_RAM_ALIGN_BYTE); // NOTE(review): enables 8 byte lanes at the offset - confirm 8 matches the width of wr_dat
    end
+
+    if (inst_usr_if.wr) begin
+      inst_ram_usr_if.a <= inst_usr_if.addr >> INST_RAM_ALIGN_BYTE/INST_RAM_USR_WIDTH; // NOTE(review): same precedence question as the data RAM address above
+      inst_ram_usr_if.d <= inst_usr_if.wr_dat;
+      inst_ram_usr_if.we <= 1;
+    end
+
+    if (cfg_usr_if.wr) begin
+    // Currently no write supported
+    end
+
+    // Read access
+    if (data_usr_if.rd) begin
+      data_usr_ram_read[0] <= 1; // Start the read-latency shift register
+      data_ram_usr_if.a <= data_usr_if.addr >> DATA_RAM_ALIGN_BYTE/DATA_RAM_USR_WIDTH; // NOTE(review): same precedence question as the write path
+    end
+
+    if (inst_usr_if.rd) begin
+      inst_usr_ram_read[0] <= 1; // Start the read-latency shift register
+      inst_ram_usr_if.a <= inst_usr_if.addr >> INST_RAM_ALIGN_BYTE/INST_RAM_USR_WIDTH; // NOTE(review): same precedence question as the write path
+    end
+
    if (cfg_usr_if.rd) begin
      cfg_usr_if.rd_dat_val <= 1;
+      case(cfg_usr_if.addr)
+        0: begin
+          cfg_usr_if.rd_dat <= inst_ram_sys_if.a; // Config address 0 reads back the current system-side instruction RAM address
+        end
+      endcase
    end
  end
 end
@ -159,7 +239,7 @@ end
 uram_reset #(
  .RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH),
  .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH),
-  .PIPELINES( INST_READ_CYCLE - 2 )
+  .PIPELINES( READ_CYCLE - 2 )
 )
 inst_uram_reset (
  .a ( inst_ram_usr_if ),
@ -169,7 +249,7 @@ inst_uram_reset (
 uram_reset #(
  .RAM_WIDTH(bls12_381_pkg::DATA_RAM_WIDTH),
  .RAM_DEPTH(bls12_381_pkg::DATA_RAM_DEPTH),
-  .PIPELINES( DATA_READ_CYCLE - 2 )
+  .PIPELINES( READ_CYCLE - 2 )
 )
 data_uram_reset (
  .a ( data_ram_usr_if ),
--- a/zcash_fpga/src/tb/bls12_381_top_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_top_tb.sv
@ -36,9 +36,10 @@ initial begin
 end

 if_axi_stream #(.DAT_BYTS(8)) out_if(clk);
-if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_usr_if(.i_clk(clk), .i_rst(rst));
-if_ram #(.RAM_WIDTH(bls12_381_pkg::DATA_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::DATA_RAM_DEPTH)) data_ram_usr_if(.i_clk(clk), .i_rst(rst));
-if_axi_mm #(.D_BITS(64), .A_BITS(8)) cfg_usr_if(clk);
+
+if_axi_mm #(.D_BITS(32), .A_BITS(8)) cfg_usr_if(clk);
+if_axi_mm #(.D_BITS(32), .A_BITS(DATA_RAM_DEPTH * DATA_RAM_ALIGN_BYTE)) data_usr_if(clk);
+if_axi_mm #(.D_BITS(32), .A_BITS(INST_RAM_DEPTH * INST_RAM_ALIGN_BYTE)) inst_usr_if(clk);

 bls12_381_top bls12_381_top (
  .i_clk ( clk ),
@ -46,23 +47,28 @@ bls12_381_top bls12_381_top (
  // Only tx interface is used to send messages to SW on a SEND-INTERRUPT instruction
  .tx_if ( out_if ),
  // User access to the instruction and register RAM
-  .inst_ram_usr_if ( inst_ram_usr_if ),
-  .data_ram_usr_if ( data_ram_usr_if ),
+  .inst_usr_if ( inst_usr_if ),
+  .data_usr_if ( data_usr_if ),
  // Configuration memory
  .cfg_usr_if ( cfg_usr_if )
 );


-task test_0();
+task test_fp_point_mult();
 begin
  integer signed get_len;
  logic [common_pkg::MAX_SIM_BYTS*8-1:0] expected,  get_dat;
  inst_t inst;
+  point_type_t pt;
+  logic [DAT_BITS-1:0] data = 0;
+
  $display("Running test_0...");
-  
-  inst = '{code:FP_POINT_MULT, a:0, b:0, c:0};
-  data_ram_usr_if.write_data(0, 100);
-  inst_ram_usr_if.write_data(0, inst);
+  pt = SCALAR;
+  data = 0;
+  inst = '{code:FP_FPOINT_MULT, a:0, b:0, c:0};
+
+  data_usr_if.put_data_multiple({pt, data}, 0);
+  inst_usr_if.put_data_multiple(inst, 0);

  $display("test_0 PASSED");
 end
@ -70,15 +76,15 @@ endtask;


 initial begin
-  inst_ram_usr_if.reset_source();
-  data_ram_usr_if.reset_source();
+  inst_usr_if.reset_source();
+  data_usr_if.reset_source();
  cfg_usr_if.reset_source();
  #100ns;
  // Wait for memories to reset
  while(!bls12_381_top.inst_uram_reset.reset_done ||
       !bls12_381_top.data_uram_reset.reset_done)
    @(posedge clk);
-    
+
  test_0();

  #1us $finish();