Updates for bls12-381 Fp12 inversion and wrapper files.

2019-08-10 23:31:58 +08:00 · 2019-08-10 23:31:58 +08:00 · 891dd214dc
parent 3faee7b780
commit 891dd214dc
10 changed files with 1769 additions and 69 deletions
--- a/ip_cores/ec/src/rtl/ec_fe12_inv_s.sv
+++ b/ip_cores/ec/src/rtl/ec_fe12_inv_s.sv
@ -0,0 +1,318 @@
+/*
+  This provides the interface to perform
+  Fp^12 inverse
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fe12_inv_s
+#(
+  parameter type FE_TYPE,
+  parameter OVR_WRT_BIT = 8       // From this bit NUM_OVR_WRT_BIT (3) bits of ctl are used to tag the internal equation number
+)(
+  input i_clk, i_rst,
+  // Interface to Fp6 multiplier (mod P), two FE_TYPE operands per beat, 6 beats per operation
+  if_axi_stream.source o_mul_fe6_if,
+  if_axi_stream.sink   i_mul_fe6_if,
+  // Interface to FE_TYPE subtractor (mod P), two FE_TYPE operands per beat, 6 beats per Fp6 operation
+  if_axi_stream.source o_sub_fe_if,
+  if_axi_stream.sink   i_sub_fe_if,
+  // Interface to Fp6 multiply by non-residue, one FE_TYPE per beat, 6 beats per operation
+  if_axi_stream.source o_mnr_fe6_if,
+  if_axi_stream.sink   i_mnr_fe6_if,
+  // Interface to Fp6 inverse (mod P), one FE_TYPE per beat, 6 beats per operation
+  if_axi_stream.source o_inv_fe6_if,
+  if_axi_stream.sink   i_inv_fe6_if,
+  // Interface for the Fp12 inverse itself: 12 FE_TYPE beats in (i_), 12 FE_TYPE beats out (o_)
+  if_axi_stream.source o_inv_fe12_if,
+  if_axi_stream.sink   i_inv_fe12_if
+);
+
+// Number of ctl bits (starting at OVR_WRT_BIT) that carry the equation number 0..7
+localparam NUM_OVR_WRT_BIT = 3;
+
+// Equations issued by this module (operand order as passed to the helper tasks):
+//   0: t[0] = mul(a[0], a[0])    1: t[1] = mul(a[1], a[1])
+//   2: t[1] = mnr(t[1])          3: t[0] = sub(t[0], t[1])
+//   4: t[0] = inv(t[0])          5: t[1] = mul(a[0], t[0])
+//   6: t[0] = mul(a[1], t[0])    7: t[0] = sub(0, t[0])
+// NOTE(review): presumably this follows bls12_381.pkg::fe12_inv() - confirm against the package.
+FE_TYPE [1:0][5:0] t;   // Temporary Fp6 values, also hold the final result (t[1] is streamed out first, then t[0])
+FE_TYPE [1:0][5:0] a;   // Latched input, a[0] holds the first six input words and a[1] the last six
+
+// eq_wait[n] is set once equation n has started being issued, eq_val[n] once its last result beat has returned
+logic [7:0] eq_val, eq_wait;
+// Beat counters (0..5) for each outgoing request stream
+logic [2:0] mul_cnt, sub_cnt, mnr_cnt, inv_cnt;
+// High while the corresponding unit has an equation selected and is issuing it
+logic mul_en, sub_en, mnr_en, inv_en;
+// Equation number currently selected for each unit (nxt_inv is reset but otherwise unused, only equation 4 uses the inverter)
+logic [2:0] nxt_mul, nxt_mnr, nxt_sub, nxt_inv;
+// Output beat counter (0..11)
+logic [3:0] out_cnt;
+
+// Set once a full input has been latched; cleared when the result has been streamed out
+logic rdy_l;
+
+// Main control process: latches the 12-word input, schedules the eight internal
+// equations over the shared Fp6 units, collects their results into t[], and
+// finally streams the 12-word result out on o_inv_fe12_if.
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_inv_fe12_if.reset_source();
+    o_mnr_fe6_if.reset_source();
+    o_mul_fe6_if.reset_source();
+    o_inv_fe6_if.reset_source();
+    o_sub_fe_if.reset_source();
+    i_inv_fe12_if.rdy <= 0;
+    i_mul_fe6_if.rdy <= 0;
+    i_sub_fe_if.rdy <= 0;
+    i_mnr_fe6_if.rdy <= 0;
+    i_inv_fe6_if.rdy <= 0;
+    eq_val <= 0;
+    eq_wait <= 0;
+    rdy_l <= 0;
+    t <= 0;
+    a <= 0;
+    {out_cnt, mul_cnt, sub_cnt, mnr_cnt, inv_cnt} <= 0;
+    {nxt_mul, nxt_mnr, nxt_sub, nxt_inv} <= 0;
+    {mul_en, sub_en, mnr_en, inv_en} <= 0;
+  end else begin
+
+    // Results from the shared units are always accepted
+    i_mul_fe6_if.rdy <= 1;
+    i_inv_fe6_if.rdy <= 1;
+    i_sub_fe_if.rdy <= 1;
+    i_mnr_fe6_if.rdy <= 1;
+
+    // Default: drop val once a beat has been accepted (the issue tasks below may re-assert it)
+    if (o_inv_fe12_if.rdy) o_inv_fe12_if.val <= 0;
+    if (o_mul_fe6_if.rdy) o_mul_fe6_if.val <= 0;
+    if (o_sub_fe_if.rdy) o_sub_fe_if.val <= 0;
+    if (o_mnr_fe6_if.rdy) o_mnr_fe6_if.val <= 0;
+    if (o_inv_fe6_if.rdy) o_inv_fe6_if.val <= 0;
+
+    // Idle units look for an equation whose operands have become available
+    if (~sub_en) get_next_sub();
+    if (~mul_en) get_next_mul();
+    if (~mnr_en) get_next_mnr();
+    if (~inv_en) get_next_inv();
+
+    // Accept a new input only while no operation is latched
+    if (rdy_l == 0) i_inv_fe12_if.rdy <= 1;
+
+    // Final output flow: first six words from t[1] (equation 5), last six from t[0] (equation 7)
+    if (~o_inv_fe12_if.val || (o_inv_fe12_if.val && o_inv_fe12_if.rdy)) begin
+
+      o_inv_fe12_if.sop <= out_cnt == 0;
+      o_inv_fe12_if.eop <= out_cnt == 11;
+
+      if (eq_val[5] && out_cnt < 6) begin
+        o_inv_fe12_if.val <= 1;
+        out_cnt <= out_cnt + 1;
+        o_inv_fe12_if.dat <= t[1][out_cnt%6];
+      end else
+      if (eq_val[7] && out_cnt >= 6) begin
+        o_inv_fe12_if.val <= 1;
+        out_cnt <= out_cnt + 1;
+        o_inv_fe12_if.dat <= t[0][out_cnt%6];
+      end
+
+      // Last word is being driven - return all state to idle for the next operation.
+      // mnr_cnt is cleared here as well so this matches the reset branch.
+      if (out_cnt == 11) begin
+        eq_val <= 0;
+        eq_wait <= 0;
+        rdy_l <= 0;
+        t <= 0;
+        a <= 0;
+        {out_cnt, mul_cnt, sub_cnt, mnr_cnt, inv_cnt} <= 0;
+        {nxt_mul, nxt_mnr, nxt_sub, nxt_inv} <= 0;
+        {mul_en, sub_en, mnr_en, inv_en} <= 0;
+      end
+    end
+
+    // Latch input (shift register - first word ends up in a[0][0])
+    if (i_inv_fe12_if.rdy && i_inv_fe12_if.val) begin
+      a <= {i_inv_fe12_if.dat, a[1], a[0][5:1]};
+      if (i_inv_fe12_if.eop) begin
+        i_inv_fe12_if.rdy <= 0;
+        rdy_l <= 1;
+        // Carry the caller's ctl through to the Fp12 result stream
+        // (was previously written to o_inv_fe6_if.ctl, leaving the result ctl at its reset value)
+        o_inv_fe12_if.ctl <= i_inv_fe12_if.ctl;
+      end
+    end
+
+    // Check any results from multiplier
+    if (i_mul_fe6_if.val && i_mul_fe6_if.rdy) begin
+      if (i_mul_fe6_if.eop) eq_val[i_mul_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mul_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        0: t[0] <= {i_mul_fe6_if.dat, t[0][5:1]};
+        1: t[1] <= {i_mul_fe6_if.dat, t[1][5:1]};
+        5: t[1] <= {i_mul_fe6_if.dat, t[1][5:1]};
+        6: t[0] <= {i_mul_fe6_if.dat, t[0][5:1]};
+        default: o_inv_fe12_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from mnr
+    if (i_mnr_fe6_if.val && i_mnr_fe6_if.rdy) begin
+      if(i_mnr_fe6_if.eop) eq_val[i_mnr_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mnr_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        2: t[1] <= {i_mnr_fe6_if.dat, t[1][5:1]};
+        default: o_inv_fe12_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from sub
+    if (i_sub_fe_if.val && i_sub_fe_if.rdy) begin
+      if(i_sub_fe_if.eop) eq_val[i_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        3: t[0] <= {i_sub_fe_if.dat, t[0][5:1]};
+        7: t[0] <= {i_sub_fe_if.dat, t[0][5:1]};
+        default: o_inv_fe12_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from inv_fe6
+    if (i_inv_fe6_if.val && i_inv_fe6_if.rdy) begin
+      if (i_inv_fe6_if.eop) eq_val[i_inv_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_inv_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        4:  t[0] <= {i_inv_fe6_if.dat, t[0][5:1]};
+        default: o_inv_fe12_if.err <= 1;
+      endcase
+    end
+
+    // Issue new multiplies
+    if (mul_en)
+      case(nxt_mul)
+        0: fe6_multiply(0, a[0], a[0]);
+        1: fe6_multiply(1, a[1], a[1]);
+        5: fe6_multiply(5, a[0], t[0]);
+        6: fe6_multiply(6, a[1], t[0]);
+      endcase
+
+
+    // Issue new sub
+    if (sub_en)
+      case(nxt_sub)
+        3: fe6_subtraction(3, t[0], t[1]);
+        7: fe6_subtraction(7, 0, t[0]);
+      endcase
+
+    // Issue new mnr
+    if (mnr_en)
+      case(nxt_mnr)
+        2: fe6_mnr(2, t[1]);
+      endcase
+
+    // Issue new inv (only equation 4 uses the Fp6 inverter)
+    if (inv_en)
+     fe6_inv(4, t[0]);
+
+  end
+end
+
+// Streams one beat of an Fp6 subtraction request (operand a in the low half of
+// dat, b in the high half) tagged with equation number ctl. Six accepted beats
+// make up one request; after the last one the next subtraction is selected.
+task fe6_subtraction(input int unsigned ctl, input FE_TYPE [5:0] a, b);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_sub_fe_if.val || o_sub_fe_if.rdy) begin
+    o_sub_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b[sub_cnt];
+    o_sub_fe_if.dat[0 +: $bits(FE_TYPE)] <= a[sub_cnt];
+    o_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_sub_fe_if.sop <= (sub_cnt == 0);
+    o_sub_fe_if.eop <= (sub_cnt == 5);
+    o_sub_fe_if.val <= 1;
+    eq_wait[ctl] <= 1;   // Mark this equation as issued
+    if (sub_cnt == 5) begin
+      sub_cnt <= 0;
+      get_next_sub();
+    end else begin
+      sub_cnt <= sub_cnt + 1;
+    end
+  end
+endtask
+
+
+// Streams one beat of an Fp6 multiplication request (operand a in the low half
+// of dat, b in the high half) tagged with equation number ctl. Six accepted
+// beats make up one request; after the last one the next multiply is selected.
+task fe6_multiply(input int unsigned ctl, input FE_TYPE [5:0] a, b);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_mul_fe6_if.val || o_mul_fe6_if.rdy) begin
+    o_mul_fe6_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b[mul_cnt];
+    o_mul_fe6_if.dat[0 +: $bits(FE_TYPE)] <= a[mul_cnt];
+    o_mul_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mul_fe6_if.sop <= (mul_cnt == 0);
+    o_mul_fe6_if.eop <= (mul_cnt == 5);
+    o_mul_fe6_if.val <= 1;
+    eq_wait[ctl] <= 1;   // Mark this equation as issued
+    if (mul_cnt == 5) begin
+      mul_cnt <= 0;
+      get_next_mul();
+    end else begin
+      mul_cnt <= mul_cnt + 1;
+    end
+  end
+endtask
+
+// Streams one beat of an Fp6 multiply-by-non-residue request tagged with
+// equation number ctl. Six accepted beats make up one request; after the last
+// one the next mnr equation is selected.
+task fe6_mnr(input int unsigned ctl, input FE_TYPE [5:0] a);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_mnr_fe6_if.val || o_mnr_fe6_if.rdy) begin
+    o_mnr_fe6_if.dat <= a[mnr_cnt];
+    o_mnr_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mnr_fe6_if.sop <= (mnr_cnt == 0);
+    o_mnr_fe6_if.eop <= (mnr_cnt == 5);
+    o_mnr_fe6_if.val <= 1;
+    eq_wait[ctl] <= 1;   // Mark this equation as issued
+    if (mnr_cnt == 5) begin
+      mnr_cnt <= 0;
+      get_next_mnr();
+    end else begin
+      mnr_cnt <= mnr_cnt + 1;
+    end
+  end
+endtask
+
+// Streams one beat of an Fp6 inversion request tagged with equation number
+// ctl. Six accepted beats make up one request; after the last one the
+// inverter enable is re-evaluated.
+task fe6_inv(input int unsigned ctl, input FE_TYPE [5:0] a);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_inv_fe6_if.val || o_inv_fe6_if.rdy) begin
+    o_inv_fe6_if.dat <= a[inv_cnt];
+    o_inv_fe6_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_inv_fe6_if.sop <= (inv_cnt == 0);
+    o_inv_fe6_if.eop <= (inv_cnt == 5);
+    o_inv_fe6_if.val <= 1;
+    eq_wait[ctl] <= 1;   // Mark this equation as issued
+    if (inv_cnt == 5) begin
+      inv_cnt <= 0;
+      get_next_inv();
+    end else begin
+      inv_cnt <= inv_cnt + 1;
+    end
+  end
+endtask
+
+// Picks the lowest-numbered multiply equation that has not been issued yet and
+// whose operands are available; mul_en stays low when nothing can be issued.
+task get_next_mul();
+  mul_en <= 0;
+  if (rdy_l && ~eq_wait[0]) begin
+    nxt_mul <= 0;
+    mul_en <= 1;
+  end else if (rdy_l && ~eq_wait[1]) begin
+    nxt_mul <= 1;
+    mul_en <= 1;
+  end else if (eq_val[4] && ~eq_wait[5]) begin
+    nxt_mul <= 5;
+    mul_en <= 1;
+  end else if (eq_val[4] && eq_wait[5] && ~eq_wait[6]) begin
+    // Equation 6 overwrites t[0], so it is held back until equation 5 has been issued
+    nxt_mul <= 6;
+    mul_en <= 1;
+  end
+endtask
+
+
+// Picks the next subtraction equation (3, then 7) whose operands are
+// available; sub_en stays low when nothing can be issued.
+task get_next_sub();
+  sub_en <= 0;
+  if (eq_val[0] && eq_val[2] && ~eq_wait[3]) begin
+    nxt_sub <= 3;
+    sub_en <= 1;
+  end else if (eq_val[6] && ~eq_wait[7]) begin
+    nxt_sub <= 7;
+    sub_en <= 1;
+  end
+endtask
+
+// Enables the mnr unit for equation 2 once equation 1 has completed and
+// equation 2 has not been issued yet; otherwise disables it.
+task get_next_mnr();
+  if (eq_val[1] && ~eq_wait[2]) begin
+    nxt_mnr <= 2;
+    mnr_en <= 1;
+  end else begin
+    mnr_en <= 0;
+  end
+endtask
+
+// Enables the inverter (equation 4) once equation 3 has completed and
+// equation 4 has not been issued yet; otherwise disables it.
+task get_next_inv();
+  if (eq_val[3] && ~eq_wait[4]) begin
+    inv_en <= 1;
+  end else begin
+    inv_en <= 0;
+  end
+endtask
+
+endmodule
--- a/ip_cores/ec/src/rtl/ec_fe2_inv_s.sv
+++ b/ip_cores/ec/src/rtl/ec_fe2_inv_s.sv
@ -0,0 +1,231 @@
+/*
+  This provides the interface to perform Fp2 inversion
+
+  The Fp2 input is streamed one coefficient per clock starting at c0
+  (i.e. clock 0 = a.c0, clock 1 = a.c1 with eop set).
+  _s in the name represents the input is a stream starting at c0.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fe2_inv_s
+#(
+  parameter type FE_TYPE,                // Base field element type
+  parameter      OVR_WRT_BIT = 8    // We use 2 bits (NUM_OVR_WRT) of ctl from this bit to tag requests
+)(
+  input i_clk, i_rst,
+  // Interface for the Fp2 inverse itself (mod P) FE_TYPE data width, two beats in (i_) and two beats out (o_)
+  if_axi_stream.source o_inv_fe2_if,
+  if_axi_stream.sink   i_inv_fe2_if,
+  // Interface to FE_TYPE inverter (mod P) FE_TYPE data width
+  if_axi_stream.source o_inv_fe_if,
+  if_axi_stream.sink   i_inv_fe_if,  
+  // Interface to FE_TYPE mul (mod P) 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if,
+  // Interface to FE_TYPE add (mod P) 2*FE_TYPE data width
+  if_axi_stream.source o_add_fe_if,
+  if_axi_stream.sink   i_add_fe_if,
+  // Interface to FE_TYPE sub (mod P) 2*FE_TYPE data width
+  if_axi_stream.source o_sub_fe_if,
+  if_axi_stream.sink   i_sub_fe_if
+);
+
+// Number of ctl bits (starting at OVR_WRT_BIT) used to tag each request with its sequence number
+localparam NUM_OVR_WRT = 2;
+
+// Temp storage:
+//   t[0], t[1] - latched input words (t[1] is later overwritten with the subtractor result)
+//   t[2]       - result of multiply 0 (t[0]*t[0]), later overwritten with the base field inverter result
+//   t[3]       - result of multiply 1 (t[1]*t[1])
+FE_TYPE [3:0] t; // Temp storage
+// Number of requests issued so far on each unit, and number of output words sent
+logic [2:0] add_cnt, sub_cnt, inv_cnt, mul_cnt, out_cnt;
+// start      - a complete input has been latched
+// t_val      - t[2] holds fresh data (both squares ready, or the inverter result has returned)
+// t1_sub_val - t[1] holds the subtractor result
+logic start, t_val, t1_sub_val;
+
+// Back-pressure towards the shared units and the upstream requester
+always_comb begin
+  // Subtractor results are always accepted
+  i_sub_fe_if.rdy = 1;
+
+  // Inverter results are only accepted while an operation is in flight
+  i_inv_fe_if.rdy = start;
+
+  // A new input is only accepted while no operation is in flight
+  i_inv_fe2_if.rdy = ~start;
+
+  // The adder result is forwarded straight into the inverter request register
+  i_add_fe_if.rdy = ~o_inv_fe_if.val || o_inv_fe_if.rdy;
+
+  // Multiplies 0 and 1 are latched locally, 2 and 3 are forwarded to the output register
+  i_mul_fe_if.rdy = 0;
+  case (i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT]) inside
+    0,1: i_mul_fe_if.rdy = 1;
+    2,3: i_mul_fe_if.rdy = ~o_inv_fe2_if.val || o_inv_fe2_if.rdy;
+  endcase
+end
+
+// Main control process. Sequence of requests (operand order as passed to the tasks):
+//   mul 0: t[2] = t[0]*t[0]        mul 1: t[3] = t[1]*t[1]
+//   add 0: t[2] + t[3], result is forwarded directly to the base field inverter
+//   inv 0: result latched into t[2]
+//   sub 0: fe_sub(0, t[1]), result latched into t[1]
+//   mul 2: t[0]*t[2] -> first output word   mul 3: t[1]*t[2] -> second output word
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_inv_fe2_if.reset_source();
+    o_add_fe_if.reset_source();
+    o_sub_fe_if.reset_source();
+    o_mul_fe_if.reset_source();
+    o_inv_fe_if.reset_source();
+    t <= 0;
+    t_val <= 0;
+    t1_sub_val <= 0;
+    // out_cnt must be reset too, otherwise the output flow below starts from an undefined state
+    {add_cnt, sub_cnt, inv_cnt, mul_cnt, out_cnt} <= 0;
+    start <= 0;
+  end else begin
+
+    // Default: drop val once a beat has been accepted (the tasks below may re-assert it)
+    if (o_inv_fe2_if.rdy) o_inv_fe2_if.val <= 0;
+    if (o_add_fe_if.rdy) o_add_fe_if.val <= 0;
+    if (o_sub_fe_if.rdy) o_sub_fe_if.val <= 0;
+    if (o_mul_fe_if.rdy) o_mul_fe_if.val <= 0;
+    if (o_inv_fe_if.rdy) o_inv_fe_if.val <= 0;
+
+
+    if (i_inv_fe2_if.val && i_inv_fe2_if.rdy) begin
+      if(i_inv_fe2_if.eop) start <= 1;
+      if(i_inv_fe2_if.sop) begin
+        // Carry the caller's ctl through to the result stream
+        o_inv_fe2_if.ctl <= i_inv_fe2_if.ctl;
+      end
+      t[1:0] <= {i_inv_fe2_if.dat, t[1]}; // Latch input
+    end
+
+
+    // Latch t0 and t1
+    if (i_mul_fe_if.val && i_mul_fe_if.rdy && i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] == 0) begin
+      t[2] <= i_mul_fe_if.dat;
+    end
+
+    if (i_mul_fe_if.val && i_mul_fe_if.rdy && i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] == 1) begin
+      t[3] <= i_mul_fe_if.dat;
+      t_val <= 1;
+    end
+
+    // Base field inverse result replaces t[2]
+    if (i_inv_fe_if.val && i_inv_fe_if.rdy) begin
+      t[2] <= i_inv_fe_if.dat;
+      t_val <= 1;
+    end
+
+    // Subtractor result replaces t[1]
+    if (i_sub_fe_if.val && i_sub_fe_if.rdy) begin
+      t[1] <= i_sub_fe_if.dat;
+      t1_sub_val <= 1;
+    end
+
+    // Issue new operations
+    case (mul_cnt) inside
+      0: fe_mul(start, t[0], t[0]);
+      1: fe_mul(1, t[1], t[1]);
+      2: fe_mul(inv_cnt >= 1 && t_val, t[0], t[2]);
+      3: fe_mul(t1_sub_val, t[1], t[2]);
+    endcase
+
+     case (add_cnt) inside
+      0: begin
+        fe_add(t_val, t[2], t[3]);
+        // t_val is re-used later to flag the inverter result
+        if (t_val) t_val <= 0;
+      end
+    endcase
+
+     case (inv_cnt) inside
+      0: begin
+        fe_inv(i_add_fe_if.val, i_add_fe_if.dat);
+      end
+    endcase
+
+     case (sub_cnt) inside
+      0: begin
+        fe_sub(add_cnt >= 1, 0, t[1]);
+      end
+    endcase
+
+    // Final output flow
+    if (~o_inv_fe2_if.val || (o_inv_fe2_if.val && o_inv_fe2_if.rdy)) begin
+      o_inv_fe2_if.sop <= out_cnt == 0;
+      o_inv_fe2_if.eop <= out_cnt == 1;
+      case (out_cnt) inside
+        0: begin
+          o_inv_fe2_if.dat <= i_mul_fe_if.dat;
+          if (i_mul_fe_if.val && i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] == 2) begin
+            o_inv_fe2_if.val <= 1;
+            out_cnt <= out_cnt + 1;
+          end
+        end
+        1: begin
+          o_inv_fe2_if.dat <= i_mul_fe_if.dat;
+          // Only multiply 3 carries the second output word (mirrors the ctl check on the first word)
+          if (i_mul_fe_if.val && i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] == 3) begin
+            o_inv_fe2_if.val <= 1;
+            out_cnt <= out_cnt + 1;
+          end
+        end
+        // Both words sent and accepted - return to idle for the next operation
+        default: begin
+          t <= 0;
+          inv_cnt <= 0;
+          mul_cnt <= 0;
+          add_cnt <= 0;
+          sub_cnt <= 0;
+          out_cnt <= 0;
+          start <= 0;
+          t_val <= 0;
+          t1_sub_val <= 0;
+        end
+      endcase
+    end
+  end
+end
+
+
+// Loads the multiplier request register with the single-beat request {b, a},
+// tagged with the current mul_cnt. val decides whether the request is actually
+// presented; mul_cnt only advances when it is.
+task fe_mul(input logic val, input logic [$bits(FE_TYPE)-1:0] a, b);
+  // Only touch the request register when it is empty or being consumed
+  if (~o_mul_fe_if.val || o_mul_fe_if.rdy) begin
+    o_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] <= mul_cnt;
+    o_mul_fe_if.dat <= {b, a};
+    o_mul_fe_if.val <= val;
+    o_mul_fe_if.eop <= 1;
+    o_mul_fe_if.sop <= 1;
+    if (val) begin
+      mul_cnt <= mul_cnt + 1;
+    end
+  end
+endtask
+
+// Loads the adder request register with the single-beat request {b, a},
+// tagged with the current add_cnt. val decides whether the request is actually
+// presented; add_cnt only advances when it is.
+task  fe_add(input logic val, input logic [$bits(FE_TYPE)-1:0] a, b);
+  // Only touch the request register when it is empty or being consumed
+  if (~o_add_fe_if.val || o_add_fe_if.rdy) begin
+    o_add_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] <= add_cnt;
+    o_add_fe_if.dat <= {b, a};
+    o_add_fe_if.val <= val;
+    o_add_fe_if.eop <= 1;
+    o_add_fe_if.sop <= 1;
+    if (val) begin
+      add_cnt <= add_cnt + 1;
+    end
+  end
+endtask
+
+// Loads the subtractor request register with the single-beat request {b, a},
+// tagged with the current sub_cnt. val decides whether the request is actually
+// presented; sub_cnt only advances when it is.
+task fe_sub(input logic val, input logic [$bits(FE_TYPE)-1:0] a, b);
+  // Only touch the request register when it is empty or being consumed
+  if (~o_sub_fe_if.val || o_sub_fe_if.rdy) begin
+    o_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] <= sub_cnt;
+    o_sub_fe_if.dat <= {b, a};
+    o_sub_fe_if.val <= val;
+    o_sub_fe_if.eop <= 1;
+    o_sub_fe_if.sop <= 1;
+    if (val) begin
+      sub_cnt <= sub_cnt + 1;
+    end
+  end
+endtask
+
+// Loads the base field inverter request register with the single-beat operand
+// a, tagged with the current inv_cnt. val decides whether the request is
+// actually presented; inv_cnt only advances when it is.
+task fe_inv(input logic val, input logic [$bits(FE_TYPE)-1:0] a);
+  // Only touch the request register when it is empty or being consumed
+  if (~o_inv_fe_if.val || o_inv_fe_if.rdy) begin
+    o_inv_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT] <= inv_cnt;
+    o_inv_fe_if.dat <= a;
+    o_inv_fe_if.val <= val;
+    o_inv_fe_if.eop <= 1;
+    o_inv_fe_if.sop <= 1;
+    if (val) begin
+      inv_cnt <= inv_cnt + 1;
+    end
+  end
+endtask
+
+
+endmodule
--- a/ip_cores/ec/src/rtl/ec_fe6_inv_s.sv
+++ b/ip_cores/ec/src/rtl/ec_fe6_inv_s.sv
@ -0,0 +1,418 @@
+/*
+  This provides the interface to perform
+  Fp^6 inverse
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fe6_inv_s
+#(
+  parameter type FE_TYPE,
+  parameter type FE2_TYPE,
+  parameter OVR_WRT_BIT = 8       // From this bit 5 bits are used for internal control
+)(
+  input i_clk, i_rst,
+  // Interface to FE2_TYPE multiplier (mod P), two FE_TYPE operands per beat, 2 beats per operation
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if,
+  // Interface to FE_TYPE adder (mod P), two FE_TYPE operands per beat, 2 beats per FE2_TYPE operation
+  if_axi_stream.source o_add_fe_if,
+  if_axi_stream.sink   i_add_fe_if,
+  // Interface to FE_TYPE subtractor (mod P), two FE_TYPE operands per beat, 2 beats per FE2_TYPE operation
+  if_axi_stream.source o_sub_fe_if,
+  if_axi_stream.sink   i_sub_fe_if,
+  // Interface to FE2_TYPE multiply by non-residue, one FE_TYPE per beat, 2 beats per operation
+  if_axi_stream.source o_mnr_fe2_if,
+  if_axi_stream.sink   i_mnr_fe2_if,
+  // Interface to FE2_TYPE inverse (mod P), one FE_TYPE per beat, 2 beats per operation
+  if_axi_stream.source o_inv_fe2_if,
+  if_axi_stream.sink   i_inv_fe2_if,
+  // Interface for the Fp6 inverse itself: 6 FE_TYPE beats in (i_), 6 FE_TYPE beats out (o_)
+  if_axi_stream.source o_inv_fe6_if,
+  if_axi_stream.sink   i_inv_fe6_if
+);
+
+// Number of ctl bits (starting at OVR_WRT_BIT) that carry the equation number 0..21
+localparam NUM_OVR_WRT_BIT = 5;
+
+// Multiplications are calculated using the formula in bls12_381.pkg::fe6_inv()
+// Equations issued by this module (operand order as passed to the helper tasks):
+//    0: t3 = mnr(a2)      1: t3 = mul(t3, a1)   2: t3 = sub(0, t3)    3: t0 = mul(a0, a0)
+//    4: t3 = add(t0, t3)  5: t4 = mul(a2, a2)   6: t4 = mnr(t4)       7: t2 = mul(a0, a1)
+//    8: t4 = sub(t4, t2)  9: t5 = mul(a1, a1)  10: t2 = mul(a2, a0)  11: t5 = sub(t5, t2)
+//   12: t0 = mul(a2, t4) 13: t1 = mul(a1, t5)  14: t1 = add(t0, t1)  15: t1 = mnr(t1)
+//   16: t0 = mul(a0, t3) 17: t1 = add(t1, t0)  18: t1 = inv(t1)      19: t3 = mul(t3, t1)
+//   20: t4 = mul(t4, t1) 21: t5 = mul(t5, t1)
+FE2_TYPE [5:0] t;   // Temporary Fp2 values, t[3], t[4], t[5] hold the final result
+FE2_TYPE [2:0] a;   // Latched input
+
+// eq_wait[n] is set once equation n has started being issued, eq_val[n] once its last result beat has returned
+logic [21:0] eq_val, eq_wait;
+// Beat counters (0..1) for each outgoing request stream
+logic mul_cnt, add_cnt, sub_cnt, mnr_cnt, inv_cnt;
+// High while the corresponding unit has an equation selected and is issuing it
+logic mul_en, add_en, sub_en, mnr_en, inv_en;
+// Equation number currently selected for each unit (nxt_fe2_inv is reset but otherwise unused, only equation 18 uses the inverter)
+logic [4:0] nxt_fe2_mul, nxt_fe2_mnr, nxt_fe_add, nxt_fe_sub, nxt_fe2_inv;
+// Output beat counter (0..5)
+logic [2:0] out_cnt;
+
+// Set once a full input has been latched; cleared when the result has been streamed out
+logic rdy_l;
+
+// Main control process: latches the 6-word input, schedules the 22 internal
+// equations over the shared Fp/Fp2 units, collects their results into t[], and
+// finally streams the 6-word result (t[3], t[4], t[5]) out on o_inv_fe6_if.
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_inv_fe6_if.reset_source();
+    o_mnr_fe2_if.reset_source();
+    o_mul_fe2_if.reset_source();
+    o_inv_fe2_if.reset_source();
+    o_sub_fe_if.reset_source();
+    o_add_fe_if.reset_source();
+    i_inv_fe6_if.rdy <= 0;
+    i_mul_fe2_if.rdy <= 0;
+    i_sub_fe_if.rdy <= 0;
+    i_add_fe_if.rdy <= 0;
+    i_mnr_fe2_if.rdy <= 0;
+    i_inv_fe2_if.rdy <= 0;
+    eq_val <= 0;
+    eq_wait <= 0;
+    rdy_l <= 0;
+    t <= 0;
+    a <= 0;
+    {out_cnt, mul_cnt, add_cnt, sub_cnt, mnr_cnt, inv_cnt} <= 0;
+    {nxt_fe2_mul, nxt_fe2_mnr, nxt_fe_add, nxt_fe_sub, nxt_fe2_inv} <= 0;
+    {mul_en, add_en, sub_en, mnr_en, inv_en} <= 0;
+  end else begin
+
+    // Results from the shared units are always accepted
+    i_mul_fe2_if.rdy <= 1;
+    i_inv_fe2_if.rdy <= 1;
+    i_sub_fe_if.rdy <= 1;
+    i_add_fe_if.rdy <= 1;
+    i_mnr_fe2_if.rdy <= 1;
+
+    // Default: drop val once a beat has been accepted (the issue tasks below may re-assert it)
+    if (o_inv_fe6_if.rdy) o_inv_fe6_if.val <= 0;
+    if (o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0;
+    if (o_sub_fe_if.rdy) o_sub_fe_if.val <= 0;
+    if (o_add_fe_if.rdy) o_add_fe_if.val <= 0;
+    if (o_mnr_fe2_if.rdy) o_mnr_fe2_if.val <= 0;
+    if (o_inv_fe2_if.rdy) o_inv_fe2_if.val <= 0;
+
+    // Idle units look for an equation whose operands have become available
+    if (~sub_en) get_next_sub();
+    if (~add_en) get_next_add();
+    if (~mul_en) get_next_fe2_mul();
+    if (~mnr_en) get_next_fe2_mnr();
+    if (~inv_en) get_next_fe2_inv();
+
+    // Accept a new input only while no operation is latched
+    if (rdy_l == 0) i_inv_fe6_if.rdy <= 1;
+
+    // Final output flow: t[3] (equation 19), then t[4] (equation 20), then t[5] (equation 21)
+    if (~o_inv_fe6_if.val || (o_inv_fe6_if.val && o_inv_fe6_if.rdy)) begin
+
+      o_inv_fe6_if.sop <= out_cnt == 0;
+      o_inv_fe6_if.eop <= out_cnt == 5;
+
+      if (eq_val[19] && out_cnt/2 == 0) begin
+        o_inv_fe6_if.val <= 1;
+        out_cnt <= out_cnt + 1;
+        o_inv_fe6_if.dat <= t[3][out_cnt%2];
+      end
+      if (eq_val[20] && out_cnt/2 == 1) begin
+        o_inv_fe6_if.val <= 1;
+        out_cnt <= out_cnt + 1;
+        o_inv_fe6_if.dat <= t[4][out_cnt%2];
+      end
+      if (eq_val[21] && out_cnt/2 == 2) begin
+        o_inv_fe6_if.val <= 1;
+        out_cnt <= out_cnt + 1;
+        o_inv_fe6_if.dat <= t[5][out_cnt%2];
+      end
+
+      // Last word is being driven - return all state to idle for the next operation.
+      // mnr_cnt is cleared here as well so this matches the reset branch.
+      if (out_cnt == 5) begin
+        eq_val <= 0;
+        eq_wait <= 0;
+        rdy_l <= 0;
+        t <= 0;
+        a <= 0;
+        {out_cnt, mul_cnt, add_cnt, sub_cnt, mnr_cnt, inv_cnt} <= 0;
+        {nxt_fe2_mul, nxt_fe_add, nxt_fe_sub, nxt_fe2_mnr, nxt_fe2_inv} <= 0;
+        {mul_en, add_en, sub_en, mnr_en, inv_en} <= 0;
+      end
+    end
+
+    // Latch input (shift register - first word ends up in a[0][0])
+    if (i_inv_fe6_if.rdy && i_inv_fe6_if.val) begin
+      a <= {i_inv_fe6_if.dat, a[2:1], a[0][1]};
+      if (i_inv_fe6_if.eop) begin
+        i_inv_fe6_if.rdy <= 0;
+        rdy_l <= 1;
+        // Carry the caller's ctl through to the result stream
+        o_inv_fe6_if.ctl <= i_inv_fe6_if.ctl;
+      end
+    end
+
+    // Check any results from multiplier (eop selects which half of the Fp2 value is written)
+    if (i_mul_fe2_if.val && i_mul_fe2_if.rdy) begin
+      if (i_mul_fe2_if.eop) eq_val[i_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        1: t[3][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        3: t[0][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        5: t[4][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        7: t[2][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        9: t[5][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        10: t[2][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        12: t[0][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        13: t[1][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        16: t[0][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        19: t[3][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        20: t[4][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        21: t[5][i_mul_fe2_if.eop] <= i_mul_fe2_if.dat;
+        default: o_inv_fe6_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from mnr
+    if (i_mnr_fe2_if.val && i_mnr_fe2_if.rdy) begin
+      if(i_mnr_fe2_if.eop) eq_val[i_mnr_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mnr_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        0: t[3][i_mnr_fe2_if.eop] <= i_mnr_fe2_if.dat;
+        6: t[4][i_mnr_fe2_if.eop] <= i_mnr_fe2_if.dat;
+        15: t[1][i_mnr_fe2_if.eop] <= i_mnr_fe2_if.dat;
+        default: o_inv_fe6_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from sub
+    if (i_sub_fe_if.val && i_sub_fe_if.rdy) begin
+      if(i_sub_fe_if.eop) eq_val[i_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        2: t[3][i_sub_fe_if.eop] <= i_sub_fe_if.dat;
+        8: t[4][i_sub_fe_if.eop] <= i_sub_fe_if.dat;
+        11: t[5][i_sub_fe_if.eop] <= i_sub_fe_if.dat;
+        default: o_inv_fe6_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from add
+    if (i_add_fe_if.val && i_add_fe_if.rdy) begin
+      if (i_add_fe_if.eop) eq_val[i_add_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_add_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        4: t[3][i_add_fe_if.eop] <= i_add_fe_if.dat;
+        14: t[1][i_add_fe_if.eop] <= i_add_fe_if.dat;
+        17: t[1][i_add_fe_if.eop] <= i_add_fe_if.dat;
+        default: o_inv_fe6_if.err <= 1;
+      endcase
+    end
+
+    // Check any results from inv_fe2
+    if (i_inv_fe2_if.val && i_inv_fe2_if.rdy) begin
+      if (i_inv_fe2_if.eop) eq_val[i_inv_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_inv_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        18: t[1][i_inv_fe2_if.eop] <= i_inv_fe2_if.dat;
+        default: o_inv_fe6_if.err <= 1;
+      endcase
+    end
+
+    // Issue new multiplies
+    if (mul_en)
+      case(nxt_fe2_mul)
+        1: fe2_multiply(1, t[3], a[1]);
+        3: fe2_multiply(3, a[0], a[0]);
+        5: fe2_multiply(5, a[2], a[2]);
+        7: fe2_multiply(7, a[0], a[1]);
+        9: fe2_multiply(9, a[1], a[1]);
+        10: fe2_multiply(10, a[2], a[0]);
+        12: fe2_multiply(12, a[2], t[4]);
+        13: fe2_multiply(13, a[1], t[5]);
+        16: fe2_multiply(16, a[0], t[3]);
+        19: fe2_multiply(19, t[3], t[1]);
+        20: fe2_multiply(20, t[4], t[1]);
+        21: fe2_multiply(21, t[5], t[1]);
+      endcase
+
+    // Issue new adds
+    if (add_en)
+      case(nxt_fe_add)
+        4: fe2_addition(4, t[0], t[3]);
+        14: fe2_addition(14, t[0], t[1]);
+        17: fe2_addition(17, t[1], t[0]);
+      endcase
+
+    // Issue new sub
+    if (sub_en)
+      case(nxt_fe_sub)
+        2: fe2_subtraction(2, 0, t[3]);
+        8: fe2_subtraction(8, t[4], t[2]);
+        11: fe2_subtraction(11, t[5], t[2]);
+      endcase
+
+    // Issue new mnr
+    if (mnr_en)
+      case(nxt_fe2_mnr)
+        0: fe2_mnr(0, a[2]);
+        6: fe2_mnr(6, t[4]);
+        15: fe2_mnr(15, t[1]);
+      endcase
+
+    // Issue new inv (only equation 18 uses the Fp2 inverter)
+    if (inv_en)
+     fe2_inv(18, t[1]);
+
+  end
+end
+
+// Streams one beat of an Fp2 subtraction request (operand a in the low half of
+// dat, b in the high half) tagged with equation number ctl. Two accepted beats
+// make up one request; on the second one the next subtraction is selected.
+task fe2_subtraction(input int unsigned ctl, input FE2_TYPE a, b);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_sub_fe_if.val || o_sub_fe_if.rdy) begin
+    o_sub_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b[sub_cnt];
+    o_sub_fe_if.dat[0 +: $bits(FE_TYPE)] <= a[sub_cnt];
+    o_sub_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_sub_fe_if.sop <= (sub_cnt == 0);
+    o_sub_fe_if.eop <= (sub_cnt == 1);
+    o_sub_fe_if.val <= 1;
+    eq_wait[ctl] <= 1;        // Mark this equation as issued
+    sub_cnt <= sub_cnt + 1;   // 1-bit counter, wraps back to 0 after the second beat
+    if (sub_cnt == 1) get_next_sub();
+  end
+endtask
+
+// Streams one beat of an Fp2 addition request (operand a in the low half of
+// dat, b in the high half) tagged with equation number ctl. Two accepted beats
+// make up one request; on the second one the next addition is selected.
+task fe2_addition(input int unsigned ctl, input FE2_TYPE a, b);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_add_fe_if.val || o_add_fe_if.rdy) begin
+    o_add_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b[add_cnt];
+    o_add_fe_if.dat[0 +: $bits(FE_TYPE)] <= a[add_cnt];
+    o_add_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_add_fe_if.sop <= (add_cnt == 0);
+    o_add_fe_if.eop <= (add_cnt == 1);
+    o_add_fe_if.val <= 1;
+    eq_wait[ctl] <= 1;        // Mark this equation as issued
+    add_cnt <= add_cnt + 1;   // 1-bit counter, wraps back to 0 after the second beat
+    if (add_cnt == 1) get_next_add();
+  end
+endtask
+
+// Streams one beat of an Fp2 multiplication request (operand a in the low half
+// of dat, b in the high half) tagged with equation number ctl. Two accepted
+// beats make up one request; on the second one the next multiply is selected.
+task fe2_multiply(input int unsigned ctl, input FE2_TYPE a, b);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_mul_fe2_if.val || o_mul_fe2_if.rdy) begin
+    o_mul_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b[mul_cnt];
+    o_mul_fe2_if.dat[0 +: $bits(FE_TYPE)] <= a[mul_cnt];
+    o_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mul_fe2_if.sop <= (mul_cnt == 0);
+    o_mul_fe2_if.eop <= (mul_cnt == 1);
+    o_mul_fe2_if.val <= 1;
+    eq_wait[ctl] <= 1;        // Mark this equation as issued
+    mul_cnt <= mul_cnt + 1;   // 1-bit counter, wraps back to 0 after the second beat
+    if (mul_cnt == 1) get_next_fe2_mul();
+  end
+endtask
+
+// Streams one beat of an Fp2 multiply-by-non-residue request tagged with
+// equation number ctl. Two accepted beats make up one request; on the second
+// one the next mnr equation is selected.
+task fe2_mnr(input int unsigned ctl, input FE2_TYPE a);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_mnr_fe2_if.val || o_mnr_fe2_if.rdy) begin
+    o_mnr_fe2_if.dat <= a[mnr_cnt];
+    o_mnr_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mnr_fe2_if.sop <= (mnr_cnt == 0);
+    o_mnr_fe2_if.eop <= (mnr_cnt == 1);
+    o_mnr_fe2_if.val <= 1;
+    eq_wait[ctl] <= 1;        // Mark this equation as issued
+    mnr_cnt <= mnr_cnt + 1;   // 1-bit counter, wraps back to 0 after the second beat
+    if (mnr_cnt == 1) get_next_fe2_mnr();
+  end
+endtask
+
+// Streams one beat of an Fp2 inversion request tagged with equation number
+// ctl. Two accepted beats make up one request; on the second one the inverter
+// enable is re-evaluated.
+task fe2_inv(input int unsigned ctl, input FE2_TYPE a);
+  // Drive a new beat only when the output register is empty or being consumed
+  if (~o_inv_fe2_if.val || o_inv_fe2_if.rdy) begin
+    o_inv_fe2_if.dat <= a[inv_cnt];
+    o_inv_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_inv_fe2_if.sop <= (inv_cnt == 0);
+    o_inv_fe2_if.eop <= (inv_cnt == 1);
+    o_inv_fe2_if.val <= 1;
+    eq_wait[ctl] <= 1;        // Mark this equation as issued
+    inv_cnt <= inv_cnt + 1;   // 1-bit counter, wraps back to 0 after the second beat
+    if (inv_cnt == 1) get_next_fe2_inv();
+  end
+endtask
+
+// Picks, in fixed priority order, the first multiply equation that has not
+// been issued yet and whose operands are available; mul_en stays low when
+// nothing can be issued.
+task get_next_fe2_mul();
+  mul_en <= 0;
+  if (eq_val[0] && ~eq_wait[1]) begin
+    nxt_fe2_mul <= 1;
+    mul_en <= 1;
+  end else if (rdy_l && ~eq_wait[3]) begin
+    nxt_fe2_mul <= 3;
+    mul_en <= 1;
+  end else if (rdy_l && ~eq_wait[5]) begin
+    nxt_fe2_mul <= 5;
+    mul_en <= 1;
+  end else if (rdy_l && ~eq_wait[7]) begin
+    nxt_fe2_mul <= 7;
+    mul_en <= 1;
+  end else if (rdy_l && ~eq_wait[9]) begin
+    nxt_fe2_mul <= 9;
+    mul_en <= 1;
+  end else if (rdy_l && eq_wait[8] && ~eq_wait[10]) begin
+    // Equation 10 overwrites t[2], so it is held back until equation 8 (which reads t[2]) has been issued
+    nxt_fe2_mul <= 10;
+    mul_en <= 1;
+  end else if (eq_val[8] && eq_wait[4] && ~eq_wait[12]) begin
+    // Equation 12 overwrites t[0], so it is held back until equation 4 (which reads t[0]) has been issued
+    nxt_fe2_mul <= 12;
+    mul_en <= 1;
+  end else if (eq_val[11] && ~eq_wait[13]) begin
+    nxt_fe2_mul <= 13;
+    mul_en <= 1;
+  end else if (eq_val[4] && eq_wait[14] && ~eq_wait[16]) begin
+    // Equation 16 overwrites t[0], so it is held back until equation 14 (which reads t[0]) has been issued
+    nxt_fe2_mul <= 16;
+    mul_en <= 1;
+  end else if (eq_val[4] && eq_val[18] && ~eq_wait[19]) begin
+    nxt_fe2_mul <= 19;
+    mul_en <= 1;
+  end else if (eq_val[8] && eq_val[18] && ~eq_wait[20]) begin
+    nxt_fe2_mul <= 20;
+    mul_en <= 1;
+  end else if (eq_val[11] && eq_val[18] && ~eq_wait[21]) begin
+    nxt_fe2_mul <= 21;
+    mul_en <= 1;
+  end
+endtask
+
+
+// Picks the next addition equation (4, 14, 17 in priority order) whose
+// operands are available; add_en stays low when nothing can be issued.
+task get_next_add();
+  add_en <= 0;
+  if (eq_val[2] && eq_val[3] && ~eq_wait[4]) begin
+    nxt_fe_add <= 4;
+    add_en <= 1;
+  end else if (eq_val[12] && eq_val[13] && ~eq_wait[14]) begin
+    nxt_fe_add <= 14;
+    add_en <= 1;
+  end else if (eq_val[16] && eq_val[15] && ~eq_wait[17]) begin
+    nxt_fe_add <= 17;
+    add_en <= 1;
+  end
+endtask
+
+// Picks the next subtraction equation (2, 8, 11 in priority order) whose
+// operands are available; sub_en stays low when nothing can be issued.
+task get_next_sub();
+  sub_en <= 0;
+  if (eq_val[1] && ~eq_wait[2]) begin
+    nxt_fe_sub <= 2;
+    sub_en <= 1;
+  end else if (eq_val[6] && eq_val[7] && ~eq_wait[8]) begin
+    nxt_fe_sub <= 8;
+    sub_en <= 1;
+  end else if (eq_val[9] && eq_val[10] && ~eq_wait[11]) begin
+    nxt_fe_sub <= 11;
+    sub_en <= 1;
+  end
+endtask
+
+// Picks the next multiply-by-non-residue equation (0, 6, 15 in priority order)
+// whose operand is available; mnr_en stays low when nothing can be issued.
+task get_next_fe2_mnr();
+  mnr_en <= 0;
+  if (rdy_l && ~eq_wait[0]) begin
+    nxt_fe2_mnr <= 0;
+    mnr_en <= 1;
+  end else if (eq_val[5] && ~eq_wait[6]) begin
+    nxt_fe2_mnr <= 6;
+    mnr_en <= 1;
+  end else if (eq_val[14] && ~eq_wait[15]) begin
+    nxt_fe2_mnr <= 15;
+    mnr_en <= 1;
+  end
+endtask
+
+// Enables the Fp2 inverter (equation 18) once equation 17 has completed and
+// equation 18 has not been issued yet; otherwise disables it.
+task get_next_fe2_inv();
+  if (eq_val[17] && ~eq_wait[18]) begin
+    inv_en <= 1;
+  end else begin
+    inv_en <= 0;
+  end
+endtask
+
+endmodule
--- a/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
+++ b/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
@ -23,7 +23,6 @@ module ec_fe6_mul_s
  parameter type FE_TYPE,
  parameter type FE2_TYPE,
  parameter type FE6_TYPE,
-  parameter CTL_BITS    = 14,
  parameter OVR_WRT_BIT = 8       // From this bit 4 bits are used for internal control, 2 bits for resource sharing - 6 total
 )(
  input i_clk, i_rst,
--- a/ip_cores/util/src/rtl/bin_inv_s.sv
+++ b/ip_cores/util/src/rtl/bin_inv_s.sv
@ -0,0 +1,268 @@
+/*
+  Calculates inversion mod p using binary gcd algorithm.
+
+  Streaming version with internal adder and sub module to improve
+  critical path.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Streaming modular inverse using the binary extended GCD algorithm.
+//
+// One element is taken on i_dat_if, then the FSM iterates on the registers
+// (u, v, x1, x2), starting from (input, P, 1, 0):
+//   U_STATE   - while u is even: u <= u/2 and x1 <= x1/2, or (x1 + P)/2 when x1 is odd
+//   V_STATE   - same for v and x2
+//   UPDATE_X1 - taken when u >= v: u <= u - v, x1 <= x1 - x2
+//   UPDATE_X2 - taken when u <  v: v <= v - u, x2 <= x2 - x1
+//   FINISHED  - once u == 1 or v == 1: drive x1 (u == 1) or x2 on o_dat_if
+// Additions and subtractions go through adder_pipe / subtractor_pipe so the
+// wide carry chains stay out of the FSM's critical path.
+//
+// Assumes P is odd (x + P must be even before it is halved).
+// NOTE(review): an input of 0 never makes u odd, so the FSM stays in U_STATE
+// forever - callers must not request the inverse of zero.
+module bin_inv_s #(
+  parameter P,
+  parameter BITS  = $clog2(P),
+  parameter LEVEL = 1 // Pipelines when adding / subtracting / comparing
+)(
+  input                i_clk,
+  input                i_rst,
+  if_axi_stream.source o_dat_if,
+  if_axi_stream.sink   i_dat_if
+);
+
+logic [BITS:0] x1, x2, u, v;
+logic wait_add;       // Set while an x + P addition is outstanding
+logic [1:0] wait_sub; // Number of subtract requests issued for the current update (0..2)
+logic sub_out;        // Subtract result expected next: 0 = u (or v), 1 = x1 (or x2)
+
+if_axi_stream #(.DAT_BYTS(2*(BITS+8)/8), .DAT_BITS(2*(BITS+1)), .CTL_BITS(1)) add_i_if (i_clk);
+if_axi_stream #(.DAT_BYTS((BITS+8)/8), .DAT_BITS(BITS+1), .CTL_BITS(1))       add_o_if (i_clk);
+
+if_axi_stream #(.DAT_BYTS(2*(BITS+8)/8), .DAT_BITS(2*(BITS+1)), .CTL_BITS(1)) sub_i_if (i_clk);
+if_axi_stream #(.DAT_BYTS((BITS+8)/8), .DAT_BITS(BITS+1), .CTL_BITS(1))       sub_o_if (i_clk);
+
+enum {IDLE,
+      U_STATE,
+      V_STATE,
+      UPDATE_X1,
+      UPDATE_X2,
+      FINISHED} state;
+
+// The adder operands are purely combinational: P in the upper half and the
+// coefficient of the state being processed (x1 in U_STATE, else x2) in the lower.
+always_comb begin
+  add_i_if.dat = 0;
+  add_i_if.dat[BITS+1 +: BITS+1] = P;
+  add_i_if.dat[0 +: BITS+1] = (state == U_STATE) ? x1 : x2;
+
+  add_i_if.sop = 0;
+  add_i_if.eop = 0;
+  add_i_if.err = 0;
+  add_i_if.mod = 0;
+  add_i_if.ctl = 0;
+
+  // The result is a single beat
+  o_dat_if.sop = 1;
+  o_dat_if.eop = 1;
+  o_dat_if.err = 0;
+  o_dat_if.mod = 0;
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    x1 <= 0;
+    x2 <= 0;
+    u <= 0;
+    v <= 0;
+    i_dat_if.rdy <= 0;
+    o_dat_if.val <= 0;
+    o_dat_if.dat <= 0;
+    o_dat_if.ctl <= 0;
+
+    state <= IDLE;
+    add_i_if.val <= 0;
+    add_o_if.rdy <= 0;
+    sub_i_if.reset_source();
+    sub_o_if.rdy <= 0;
+
+    wait_add <= 0;
+    wait_sub <= 0;
+    sub_out <= 0;
+
+  end else begin
+
+    // Drop val once a beat has been accepted; the states below may re-assert it
+    if (o_dat_if.rdy) o_dat_if.val <= 0;
+    if (add_i_if.rdy) add_i_if.val <= 0;
+    if (sub_i_if.rdy) sub_i_if.val <= 0;
+
+    // Results from the adder / subtractor are always accepted
+    add_o_if.rdy <= 1;
+    sub_o_if.rdy <= 1;
+
+    case(state)
+      IDLE: begin
+        i_dat_if.rdy <= 1;
+        if (i_dat_if.val && i_dat_if.rdy) begin
+          i_dat_if.rdy <= 0;
+          u <= i_dat_if.dat;
+          o_dat_if.ctl <= i_dat_if.ctl; // Control word is passed through to the result
+          v <= P;
+          x1 <= 1;
+          x2 <= 0;
+          state <= U_STATE;
+        end
+      end
+      U_STATE: begin
+        if (~wait_add) begin
+          if (u % 2 == 1) begin
+            // u is odd: go on to v, or straight to the subtract step if v is odd too
+            state <= (v % 2 == 1) ? (u >= v) ? UPDATE_X1 : UPDATE_X2 : V_STATE;
+          end else begin
+            u <= u/2;
+            if (x1 % 2 == 0) begin
+              x1 <= x1/2;
+              // Look ahead at the halved u so no cycle is spent re-entering this state
+              if ((u/2) % 2 == 1) state <= (v % 2 == 1) ? (u/2 >= v) ? UPDATE_X1 : UPDATE_X2 : V_STATE;
+            end else begin
+              // x1 is odd: request x1 + P and halve the sum when it returns
+              wait_add <= 1;
+              add_i_if.val <= 1;
+            end
+          end
+        end else begin
+          if (add_o_if.val && add_o_if.rdy) begin
+            x1 <= add_o_if.dat/2;
+            wait_add <= 0;
+            if (u % 2 == 1) state <= (v % 2 == 1) ? (u >= v) ? UPDATE_X1 : UPDATE_X2 : V_STATE;
+          end
+        end
+      end
+      V_STATE: begin
+        if (~wait_add) begin
+          if (v % 2 == 1) begin
+            state <= (u >= v) ? UPDATE_X1 : UPDATE_X2;
+          end else begin
+            v <= v/2;
+            if (x2 % 2 == 0) begin
+              x2 <= x2/2;
+              if ((v/2) % 2 == 1) state <= (u >= v/2) ? UPDATE_X1 : UPDATE_X2;
+            end else begin
+              // x2 is odd: request x2 + P and halve the sum when it returns
+              wait_add <= 1;
+              add_i_if.val <= 1;
+            end
+          end
+        end else begin
+          if (add_o_if.val && add_o_if.rdy) begin
+            x2 <= add_o_if.dat/2;
+            wait_add <= 0;
+            if (v % 2 == 1) state <= (u >= v) ? UPDATE_X1 : UPDATE_X2;
+          end
+        end
+      end
+      UPDATE_X1: begin
+        case(wait_sub)
+          0: begin // u <= u - v;
+            sub_i_if.dat[0 +: BITS+1] <= u;
+            sub_i_if.dat[BITS+1 +: BITS+1] <= v;
+            sub_i_if.val <= 1;
+            wait_sub <= wait_sub + 1;
+          end
+          1: begin // x1 <= x1 - x2;
+            // Only load the second request once the first one has been taken,
+            // otherwise a stalled subtractor would never see the u - v operands.
+            if (~sub_i_if.val || sub_i_if.rdy) begin
+              sub_i_if.dat[0 +: BITS+1] <= x1;
+              sub_i_if.dat[BITS+1 +: BITS+1] <= x2;
+              sub_i_if.val <= 1;
+              wait_sub <= wait_sub + 1;
+            end
+          end
+          2: begin
+            // Wait
+          end
+        endcase
+
+        // Results come back in request order; sub_out tracks which one this is
+        if (sub_o_if.val && sub_o_if.rdy) begin
+          sub_out <= sub_out + 1;
+          case(sub_out)
+            0: begin
+              u <= sub_o_if.dat;
+            end
+            1: begin
+              x1 <= sub_o_if.dat;
+              wait_sub <= 0;
+              // u already holds the new difference here (written by result 0)
+              if (u == 1 || v == 1)
+                state <= FINISHED;
+              else
+                state <= (u % 2 == 1) ? (v % 2 == 1) ? (u >= v) ? UPDATE_X1 : UPDATE_X2 : V_STATE : U_STATE;
+            end
+          endcase
+        end
+      end
+      UPDATE_X2: begin
+        case(wait_sub)
+          0: begin // v <= v - u;
+            sub_i_if.dat[0 +: BITS+1] <= v;
+            sub_i_if.dat[BITS+1 +: BITS+1] <= u;
+            sub_i_if.val <= 1;
+            wait_sub <= wait_sub + 1;
+          end
+          1: begin // x2 <= x2 - x1;
+            // Only load the second request once the first one has been taken,
+            // otherwise a stalled subtractor would never see the v - u operands.
+            if (~sub_i_if.val || sub_i_if.rdy) begin
+              sub_i_if.dat[0 +: BITS+1] <= x2;
+              sub_i_if.dat[BITS+1 +: BITS+1] <= x1;
+              sub_i_if.val <= 1;
+              wait_sub <= wait_sub + 1;
+            end
+          end
+          2: begin
+            // Wait
+          end
+        endcase
+
+        // Results come back in request order; sub_out tracks which one this is
+        if (sub_o_if.val && sub_o_if.rdy) begin
+          sub_out <= sub_out + 1;
+          case(sub_out)
+            0: begin
+              v <= sub_o_if.dat;
+            end
+            1: begin
+              wait_sub <= 0;
+              x2 <= sub_o_if.dat;
+              // v already holds the new difference here (written by result 0)
+              if (u == 1 || v == 1)
+                state <= FINISHED;
+              else
+                state <= (u % 2 == 1) ? (v % 2 == 1) ? (u >= v) ? UPDATE_X1 : UPDATE_X2 : V_STATE : U_STATE;
+            end
+          endcase
+        end
+      end
+      FINISHED: begin
+        o_dat_if.val <= 1;
+        o_dat_if.dat <= (u == 1) ? x1 : x2;
+        if (o_dat_if.val && o_dat_if.rdy) begin
+          o_dat_if.val <= 0;
+          i_dat_if.rdy <= 1;
+          state <= IDLE;
+        end
+      end
+    endcase
+  end
+end
+
+// Adder does not use modulus
+adder_pipe # (
+  .P        ( 0      ),
+  .BITS     ( BITS+1 ),
+  .CTL_BITS ( 1      ),
+  .LEVEL    ( LEVEL  )
+)
+adder_pipe (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_add ( add_i_if ),
+  .o_add ( add_o_if )
+);
+
+// Subtractor is given the modulus P (used for x1 - x2 / x2 - x1, which may go negative)
+subtractor_pipe # (
+  .P        ( P      ),
+  .BITS     ( BITS+1 ),
+  .CTL_BITS ( 1      ),
+  .LEVEL    ( LEVEL  )
+)
+subtractor_pipe (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_sub ( sub_i_if ),
+  .o_sub ( sub_o_if )
+);
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_fe12_inv_wrapper.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_fe12_inv_wrapper.sv
@ -0,0 +1,313 @@
+/*
+  This does the Fp12 inversion required in the final exponentiation.
+
+  Input is expected to be streamed in with Fp .c0 in the first clock cycle
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_fe12_inv_wrapper // Wires ec_fe12_inv_s to the Fp6/Fp2/Fp helper cores it needs; only the Fp multiplier stays external
+  import bls12_381_pkg::*;
+#(
+  parameter type FE_TYPE  = fe_t,
+  parameter type FE2_TYPE = fe2_t,
+  parameter type FE6_TYPE = fe6_t,
+  parameter      CTL_BITS = 12, // NOTE(review): offsets up to OVR_WRT_BIT + 24 are used below, so the default 12 looks too small - confirm callers override it
+  parameter      OVR_WRT_BIT = 8 // Base control bit handed (with offsets) to the cores below; need 32 bits for control
+)(
+  input i_clk, i_rst,
+  // Input/Output interfaces for inversion result, FE_TYPE data width
+  if_axi_stream.source o_inv_fe12_if,
+  if_axi_stream.sink   i_inv_fe12_if,
+  // Interface to FE_TYPE mul (mod P), 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if
+);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if [1:0] (i_clk); // Fp mul requests: [0] ec_fe2_inv_s, [1] ec_fe2_mul_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe_i_if [1:0] (i_clk); // Fp mul results, same indexing
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_o_if [5:0] (i_clk); // Fp add requests: [4:0] requesters, [5] shared adder_pipe
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   add_fe_i_if [5:0] (i_clk); // Fp add results, same indexing
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_o_if [6:0] (i_clk); // Fp sub requests: [5:0] requesters, [6] shared subtractor_pipe
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   sub_fe_i_if [6:0] (i_clk); // Fp sub results, same indexing
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_o_if [2:0] (i_clk); // Fp2 mul requests: [1:0] requesters, [2] shared ec_fe2_mul_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe2_i_if [2:0] (i_clk); // Fp2 mul results, same indexing
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe2_o_if [3:0] (i_clk); // Fp2 mul-by-nonresidue requests: [2:0] requesters, [3] shared core
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe2_i_if [3:0] (i_clk); // Fp2 mul-by-nonresidue results, same indexing
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe6_o_if       (i_clk); // ec_fe12_inv_s -> ec_fe6_mul_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe6_i_if       (i_clk); // ec_fe6_mul_s -> ec_fe12_inv_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_o_if       (i_clk); // ec_fe12_inv_s -> fe6_mul_by_nonresidue_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_i_if       (i_clk); // fe6_mul_by_nonresidue_s -> ec_fe12_inv_s
+
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_o_if          (i_clk); // ec_fe2_inv_s -> bin_inv_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_i_if          (i_clk); // bin_inv_s -> ec_fe2_inv_s
+
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_o_if         (i_clk); // ec_fe6_inv_s -> ec_fe2_inv_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_i_if         (i_clk); // ec_fe2_inv_s -> ec_fe6_inv_s
+
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe6_o_if         (i_clk); // ec_fe12_inv_s -> ec_fe6_inv_s
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe6_i_if         (i_clk); // ec_fe6_inv_s -> ec_fe12_inv_s
+
+bin_inv_s #( // Fp inversion; the only requester is ec_fe2_inv_s
+  .P     ( bls12_381_pkg::P ),
+  .LEVEL ( 2                )
+)
+bin_inv_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_dat_if ( inv_fe_i_if ),
+  .i_dat_if ( inv_fe_o_if )
+);
+
+ec_fe2_inv_s #( // Fp2 inversion, serving ec_fe6_inv_s
+  .FE_TYPE     ( FE_TYPE          ),
+  .OVR_WRT_BIT ( OVR_WRT_BIT      )
+)
+ec_fe2_inv_s(
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_inv_fe2_if ( inv_fe2_i_if   ),
+  .i_inv_fe2_if ( inv_fe2_o_if   ),
+  .o_inv_fe_if  ( inv_fe_o_if    ),
+  .i_inv_fe_if  ( inv_fe_i_if    ),  // Result returned by bin_inv_s
+  .o_mul_fe_if  ( mul_fe_o_if[0] ),
+  .i_mul_fe_if  ( mul_fe_i_if[0] ),
+  .o_add_fe_if  ( add_fe_o_if[0] ),
+  .i_add_fe_if  ( add_fe_i_if[0] ),
+  .o_sub_fe_if  ( sub_fe_o_if[0] ),
+  .i_sub_fe_if  ( sub_fe_i_if[0] )
+);
+
+ec_fe2_mul_s #( // Shared Fp2 multiplier, fed by resource_share_fe2_mul
+  .FE_TYPE  ( FE_TYPE  ),
+  .CTL_BITS ( CTL_BITS )
+)
+ec_fe2_mul_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_mul_fe2_if ( mul_fe2_i_if[2] ),
+  .i_mul_fe2_if ( mul_fe2_o_if[2] ),
+  .o_add_fe_if ( add_fe_o_if[1] ),
+  .i_add_fe_if ( add_fe_i_if[1] ),
+  .o_sub_fe_if ( sub_fe_o_if[1] ),
+  .i_sub_fe_if ( sub_fe_i_if[1] ),
+  .o_mul_fe_if ( mul_fe_o_if[1] ),
+  .i_mul_fe_if ( mul_fe_i_if[1] )
+);
+
+fe2_mul_by_nonresidue_s #( // Shared Fp2 multiply-by-nonresidue, fed by resource_share_fe2_mnr
+  .FE_TYPE  ( FE_TYPE  )
+)
+fe2_mul_by_nonresidue_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_mnr_fe2_if ( mnr_fe2_i_if[3] ),
+  .i_mnr_fe2_if ( mnr_fe2_o_if[3] ),
+  .o_add_fe_if ( add_fe_o_if[2] ),
+  .i_add_fe_if ( add_fe_i_if[2] ),
+  .o_sub_fe_if ( sub_fe_o_if[2] ),
+  .i_sub_fe_if ( sub_fe_i_if[2] )
+);
+
+ec_fe6_inv_s // Fp6 inversion, serving ec_fe12_inv_s
+#(
+  .FE_TYPE     ( FE_TYPE          ),
+  .FE2_TYPE    ( FE2_TYPE         ),
+  .OVR_WRT_BIT ( OVR_WRT_BIT + 2  )
+)
+ec_fe6_inv_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_mul_fe2_if ( mul_fe2_o_if[0] ),
+  .i_mul_fe2_if ( mul_fe2_i_if[0] ),
+  .o_add_fe_if  ( add_fe_o_if[3]  ),
+  .i_add_fe_if  ( add_fe_i_if[3]  ),
+  .o_sub_fe_if  ( sub_fe_o_if[3]  ),
+  .i_sub_fe_if  ( sub_fe_i_if[3]  ),
+  .o_mnr_fe2_if ( mnr_fe2_o_if[0] ),
+  .i_mnr_fe2_if ( mnr_fe2_i_if[0] ),
+  .o_inv_fe2_if ( inv_fe2_o_if    ),
+  .i_inv_fe2_if ( inv_fe2_i_if    ),
+  .o_inv_fe6_if ( inv_fe6_i_if    ),
+  .i_inv_fe6_if ( inv_fe6_o_if    )
+);
+
+ec_fe6_mul_s #( // Fp6 multiplier, serving ec_fe12_inv_s
+  .FE_TYPE  ( FE_TYPE  ),
+  .FE2_TYPE ( FE2_TYPE ),
+  .FE6_TYPE ( FE6_TYPE ),
+  .OVR_WRT_BIT ( OVR_WRT_BIT + 7 )
+)
+ec_fe6_mul_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_mul_fe2_if ( mul_fe2_o_if[1] ),
+  .i_mul_fe2_if ( mul_fe2_i_if[1] ),
+  .o_add_fe_if  ( add_fe_o_if[4]  ),
+  .i_add_fe_if  ( add_fe_i_if[4]  ),
+  .o_sub_fe_if  ( sub_fe_o_if[4]  ),
+  .i_sub_fe_if  ( sub_fe_i_if[4]  ),
+  .o_mnr_fe2_if ( mnr_fe2_o_if[1] ),
+  .i_mnr_fe2_if ( mnr_fe2_i_if[1] ),
+  .o_mul_fe6_if ( mul_fe6_i_if    ),
+  .i_mul_fe6_if ( mul_fe6_o_if    )
+);
+
+fe6_mul_by_nonresidue_s #( // Fp6 multiply-by-nonresidue, serving ec_fe12_inv_s
+  .FE_TYPE  ( FE_TYPE  )
+)
+fe6_mul_by_nonresidue_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_mnr_fe2_if ( mnr_fe2_o_if[2] ),
+  .i_mnr_fe2_if ( mnr_fe2_i_if[2] ),
+  .o_mnr_fe6_if ( mnr_fe6_i_if ),
+  .i_mnr_fe6_if ( mnr_fe6_o_if )
+);
+
+ec_fe12_inv_s #( // Top-level Fp12 inversion sequencer, connected to the wrapper's external ports
+  .FE_TYPE  ( FE_TYPE  ),
+  .OVR_WRT_BIT ( OVR_WRT_BIT + 14 )
+)
+ec_fe12_inv_s (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_mul_fe6_if ( mul_fe6_o_if   ),
+  .i_mul_fe6_if ( mul_fe6_i_if   ),
+  .o_sub_fe_if  ( sub_fe_o_if[5] ),
+  .i_sub_fe_if  ( sub_fe_i_if[5] ),
+  .o_mnr_fe6_if ( mnr_fe6_o_if   ),
+  .i_mnr_fe6_if ( mnr_fe6_i_if   ),
+  .o_inv_fe6_if ( inv_fe6_o_if   ),
+  .i_inv_fe6_if ( inv_fe6_i_if   ),
+  .o_inv_fe12_if ( o_inv_fe12_if ),
+  .i_inv_fe12_if ( i_inv_fe12_if )
+);
+
+adder_pipe # ( // Single Fp adder shared through resource_share_fe_add
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( bls12_381_pkg::P        ),
+  .CTL_BITS ( CTL_BITS ),
+  .LEVEL    ( 2        )
+)
+adder_pipe (
+  .i_clk ( i_clk          ),
+  .i_rst ( i_rst          ),
+  .i_add ( add_fe_o_if[5] ),
+  .o_add ( add_fe_i_if[5] )
+);
+
+subtractor_pipe # ( // Single Fp subtractor shared through resource_share_fe_sub
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( bls12_381_pkg::P        ),
+  .CTL_BITS ( CTL_BITS ),
+  .LEVEL    ( 2        )
+)
+subtractor_pipe (
+  .i_clk ( i_clk          ),
+  .i_rst ( i_rst          ),
+  .i_sub ( sub_fe_o_if[6] ),
+  .o_sub ( sub_fe_i_if[6] )
+);
+
+resource_share # ( // Multiplexes the 5 Fp add requesters onto adder_pipe
+  .NUM_IN       ( 5                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 18 ),
+  .PIPELINE_IN  ( 1                ),
+  .PIPELINE_OUT ( 1                )
+)
+resource_share_fe_add (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( add_fe_o_if[4:0] ),
+  .o_res ( add_fe_o_if[5]   ),
+  .i_res ( add_fe_i_if[5]   ),
+  .o_axi ( add_fe_i_if[4:0] )
+);
+
+resource_share # ( // Multiplexes the 6 Fp sub requesters onto subtractor_pipe
+  .NUM_IN       ( 6                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 18 ),
+  .PIPELINE_IN  ( 1                ),
+  .PIPELINE_OUT ( 1                )
+)
+resource_share_fe_sub (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( sub_fe_o_if[5:0] ),
+  .o_res ( sub_fe_o_if[6]   ),
+  .i_res ( sub_fe_i_if[6]   ),
+  .o_axi ( sub_fe_i_if[5:0] )
+);
+
+resource_share # ( // Multiplexes the 2 Fp mul requesters onto the external multiplier port
+  .NUM_IN       ( 2                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 18 ),
+  .PIPELINE_IN  ( 1                ),
+  .PIPELINE_OUT ( 1                )
+)
+resource_share_fe_mul (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( mul_fe_o_if[1:0] ),
+  .o_res ( o_mul_fe_if      ),
+  .i_res ( i_mul_fe_if      ),
+  .o_axi ( mul_fe_i_if[1:0] )
+);
+
+resource_share # ( // Multiplexes the 3 Fp2 mul-by-nonresidue requesters onto fe2_mul_by_nonresidue_s
+  .NUM_IN       ( 3                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ), // NOTE(review): mnr_fe2_*_if are declared with $bits(FE_TYPE) data bits - confirm the width intended here
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 24 ),
+  .PIPELINE_IN  ( 1                ),
+  .PIPELINE_OUT ( 1                )
+)
+resource_share_fe2_mnr (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( mnr_fe2_o_if[2:0] ),
+  .o_res ( mnr_fe2_o_if[3]   ),
+  .i_res ( mnr_fe2_i_if[3]   ),
+  .o_axi ( mnr_fe2_i_if[2:0] )
+);
+
+resource_share # ( // Multiplexes the 2 Fp2 mul requesters onto ec_fe2_mul_s
+  .NUM_IN       ( 2                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 24 ),
+  .PIPELINE_IN  ( 1                ),
+  .PIPELINE_OUT ( 1                )
+)
+resource_share_fe2_mul (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( mul_fe2_o_if[1:0] ),
+  .o_res ( mul_fe2_o_if[2]   ),
+  .i_res ( mul_fe2_i_if[2]   ),
+  .o_axi ( mul_fe2_i_if[1:0] )
+);
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv
@ -39,21 +39,15 @@ module bls12_381_pairing_wrapper
  if_axi_stream.source o_fe12_if,
  // Interface to FE_TYPE multiplier (mod P)
  if_axi_stream.source o_mul_fe_if,
-  if_axi_stream.sink   i_mul_fe_if,
-  // Interface to FE_TYPE adder (mod P)
-  if_axi_stream.source o_add_fe_if,
-  if_axi_stream.sink   i_add_fe_if,
-  // Interface to FE_TYPE subtractor (mod P)
-  if_axi_stream.source o_sub_fe_if,
-  if_axi_stream.sink   i_sub_fe_if
+  if_axi_stream.sink   i_mul_fe_if
 );

 if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if [1:0] (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe_i_if [1:0] (i_clk);
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_o_if [4:0] (i_clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   add_fe_i_if [4:0] (i_clk);
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_o_if [4:0] (i_clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   sub_fe_i_if [4:0] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_o_if [5:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   add_fe_i_if [5:0] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_o_if [5:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   sub_fe_i_if [5:0] (i_clk);

 if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_o_if [2:0] (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe2_i_if [2:0] (i_clk);
@ -75,7 +69,7 @@ bls12_381_pairing #(
  .FE12_TYPE   ( FE12_TYPE ),
  .CTL_BITS    ( CTL_BITS  ),
  .OVR_WRT_BIT ( OVR_WRT_BIT + 0 ),// 0 to 15
-  .SQ_BIT      ( OVR_WRT_BIT + 2 ) 
+  .SQ_BIT      ( OVR_WRT_BIT + 2 )
 )
 bls12_381_pairing (
  .i_clk ( i_clk ),
@ -165,7 +159,7 @@ fe6_mul_by_nonresidue_s (
 ec_fe12_mul_s #(
  .FE_TYPE  ( FE_TYPE  ),
  .OVR_WRT_BIT ( OVR_WRT_BIT + 20 ), // 20 to 23
-  .SQ_BIT      ( OVR_WRT_BIT + 2 )   
+  .SQ_BIT      ( OVR_WRT_BIT + 2 )
 )
 ec_fe12_mul_s (
  .i_clk ( i_clk ),
@ -182,6 +176,32 @@ ec_fe12_mul_s (
  .i_mul_fe12_if ( mul_fe12_o_if )
 );

+adder_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( bls12_381_pkg::P        ),
+  .CTL_BITS ( CTL_BITS ),
+  .LEVEL    ( 2        )
+)
+adder_pipe (
+  .i_clk ( i_clk        ),
+  .i_rst ( i_rst        ),
+  .i_add ( add_fe_o_if[5] ),
+  .o_add ( add_fe_i_if[5] )
+);
+
+subtractor_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( bls12_381_pkg::P        ),
+  .CTL_BITS ( CTL_BITS ),
+  .LEVEL    ( 2        )
+)
+subtractor_pipe (
+  .i_clk ( i_clk          ),
+  .i_rst ( i_rst          ),
+  .i_sub ( sub_fe_o_if[5] ),
+  .o_sub ( sub_fe_i_if[5] )
+);
+
 resource_share # (
  .NUM_IN       ( 5                ),
  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
@ -194,8 +214,8 @@ resource_share_fe_add (
  .i_clk ( i_clk ),
  .i_rst ( i_rst ),
  .i_axi ( add_fe_o_if[4:0] ),
-  .o_res ( o_add_fe_if      ),
-  .i_res ( i_add_fe_if      ),
+  .o_res ( add_fe_o_if[5]   ),
+  .i_res ( add_fe_i_if[5]   ),
  .o_axi ( add_fe_i_if[4:0] )
 );

@ -211,8 +231,8 @@ resource_share_fe_sub (
  .i_clk ( i_clk ),
  .i_rst ( i_rst ),
  .i_axi ( sub_fe_o_if[4:0] ),
-  .o_res ( o_sub_fe_if      ),
-  .i_res ( i_sub_fe_if      ),
+  .o_res ( sub_fe_o_if[5]   ),
+  .i_res ( sub_fe_i_if[5]   ),
  .o_axi ( sub_fe_i_if[4:0] )
 );

--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@ -467,47 +467,47 @@ package bls12_381_pkg;
   endfunction

   function fe6_t fe6_inv(fe6_t a);
-     fe2_t add_i0, add_i1, sub_i0, mul_i0;
-     fe6_inv[0] = fe2_mul_by_nonresidue(a[2]);
-     fe6_inv[0] = fe2_mul(fe6_inv[0], a[1]);
-     fe6_inv[0] = fe2_sub(0, fe6_inv[0]);
-     add_i0 =  fe2_mul(a[0], a[0]);
-     fe6_inv[0] = fe2_add(add_i0, fe6_inv[0]);
+     fe2_t t0, t1, t2, t3, t4, t5; // Scratch Fp2 values. Comments read "step. [steps it depends on]"; "wait N" presumably means step N must be issued first because it still reads the temporary being overwritten - confirm against ec_fe6_inv_s

-     fe6_inv[1] = fe2_mul(a[2], a[2]);
-     fe6_inv[1] = fe2_mul_by_nonresidue(fe6_inv[1]);
-     sub_i0 = fe2_mul(a[0], a[1]);
-     fe6_inv[1] = fe2_sub(fe6_inv[1], sub_i0);
-
-     fe6_inv[2] = fe2_mul(a[1], a[1]);
-     sub_i0 = fe2_mul(a[2], a[0]);
-     fe6_inv[2] = fe2_sub(fe6_inv[2], sub_i0);
-
-     add_i0 = fe2_mul(a[2], fe6_inv[1]);
-     add_i1 = fe2_mul(a[1], fe6_inv[2]);
-     add_i1 = fe2_add(add_i0, add_i1);
-     add_i1 = fe2_mul_by_nonresidue(add_i1);
-     add_i0 = fe2_mul(a[0], fe6_inv[0]);
-     add_i1 = fe2_add(add_i1, add_i0);
-
-     mul_i0 = fe2_inv(add_i1);
-
-     fe6_inv[0] = fe2_mul(fe6_inv[0], mul_i0);
-     fe6_inv[1] = fe2_mul(fe6_inv[1], mul_i0);
-     fe6_inv[2] = fe2_mul(fe6_inv[2], mul_i0);
+     t3 = fe2_mul_by_nonresidue(a[2]);  // 0. [a]  t3 = nr*a2
+     t3 = fe2_mul(t3, a[1]); // 1. [0]  t3 = nr*a2*a1
+     t3 = fe2_sub(0, t3); // 2. [1]  t3 = -nr*a1*a2
+     t0 =  fe2_mul(a[0], a[0]); // 3. [a]  t0 = a0^2
+     t3 = fe2_add(t0, t3); // 4. [2,3]  t3 = a0^2 - nr*a1*a2
+     t4 = fe2_mul(a[2], a[2]); // 5. [a]  t4 = a2^2
+     t4 = fe2_mul_by_nonresidue(t4); // 6. [5]  t4 = nr*a2^2
+     t2 = fe2_mul(a[0], a[1]); // 7. [a]  t2 = a0*a1
+     t4 = fe2_sub(t4, t2); // 8. [6,7]  t4 = nr*a2^2 - a0*a1
+     t5 = fe2_mul(a[1], a[1]); // 9. [a]  t5 = a1^2
+     t2 = fe2_mul(a[2], a[0]); // 10. [a, wait 8]  t2 = a2*a0 (t2 is reused, step 8 reads the old value)
+     t5 = fe2_sub(t5, t2); // 11. [9, 10]  t5 = a1^2 - a0*a2
+     t0 = fe2_mul(a[2], t4); // 12. [8, wait 4]  t0 = a2*t4 (t0 is reused, step 4 reads the old value)
+     t1 = fe2_mul(a[1], t5); // 13. [11]  t1 = a1*t5
+     t1 = fe2_add(t0, t1); // 14. [13, 12]  t1 = a2*t4 + a1*t5
+     t1 = fe2_mul_by_nonresidue(t1); // 15. [14]  t1 = nr*(a2*t4 + a1*t5)
+     t0 = fe2_mul(a[0], t3); // 16. [4, wait 14]  t0 = a0*t3 (t0 is reused, step 14 reads the old value)
+     t1 = fe2_add(t1, t0); // 17. [16, 15]  t1 = nr*(a2*t4 + a1*t5) + a0*t3
+     t1 = fe2_inv(t1); // 18. [17]  t1 = 1/t1, the common scale factor
+     t3 = fe2_mul(t3, t1); // 19. [18, 4]  result element 0
+     t4 = fe2_mul(t4, t1); // 20. [18, 8]  result element 1
+     t5 = fe2_mul(t5, t1); // 21. [18, 11]  result element 2
+     fe6_inv = {t5, t4, t3}; // t3 lands in element [0], t5 in element [2]

   endfunction

   function fe12_t fe12_inv(fe12_t a);
-     fe12_t  sub_i0, sub_i1, mul_i0;
-     sub_i0 = fe6_mul(a[0], a[0]);
-     sub_i1 = fe6_mul(a[1], a[1]);
-     sub_i1 = fe6_mul_by_nonresidue(sub_i1);
-     sub_i0 = fe6_sub(sub_i0,sub_i1);
-     sub_i0 = fe6_inv(sub_i0);
-     fe12_inv[0] = fe6_mul(a[0], sub_i0);
-     fe12_inv[1] = fe6_mul(a[1], sub_i0);
-     fe12_inv[1] = fe6_sub(0, fe12_inv[1]);
+     fe6_t  t0, t1; // Scratch Fp6 values. Comments read "step. [steps it depends on]"
+     
+     t0 = fe6_mul(a[0], a[0]);    // 0. [a]  t0 = a0^2
+     t1 = fe6_mul(a[1], a[1]); // 1. [a]  t1 = a1^2
+     t1 = fe6_mul_by_nonresidue(t1); // 2. [1]  t1 = nr*a1^2
+     t0 = fe6_sub(t0, t1); // 3. [0, 2]  t0 = a0^2 - nr*a1^2
+     t0 = fe6_inv(t0); // 4. [3]  t0 = 1/(a0^2 - nr*a1^2)
+     t1 = fe6_mul(a[0], t0); // 5. [4]  t1 = a0*t0
+     t0 = fe6_mul(a[1], t0); // 6. [4, wait 5]  t0 = a1*t0 (t0 is overwritten, step 5 reads the old value)
+     t0 = fe6_sub(0, t0); // 7. [6]  t0 = -a1*t0
+     fe12_inv[0] = t1; // c0 of the inverse
+     fe12_inv[1] = t0; // c1 of the inverse
   endfunction

   function fe6_t fe6_add(fe6_t a, b);
@ -571,34 +571,34 @@ package bls12_381_pkg;
     fe6_t aa, bb;
     aa = fe6_mul(a[0], b[0]);  // 0. add_i0 = mul(a[0], b[0])
     bb = fe6_mul(a[1], b[1]);  // 1. bb = mul(a[1], b[1])
-     
+
     fe12_mul[1] = fe6_add(a[1], a[0]); // 2. fe6_mul[1] = add(a[1], a[0])
     fe12_mul[0] = fe6_add(b[0], b[1]);  // 3. fe6_mul[0] = add(b[0], b[1])
-     
+
     fe12_mul[1] = fe6_mul(fe12_mul[1], fe12_mul[0]); // 4. fe6_mul[1] = mul(fe6_mul[1], fe6_mul[0])  [2, 3]
-     
+
     fe12_mul[1] = fe6_sub(fe12_mul[1], aa); // 5. fe6_mul[1] = sub(fe6_mul[1], add_i0) [4, 0]
     fe12_mul[1] = fe6_sub(fe12_mul[1], bb); // 6. fe6_mul[1] = sub(fe6_mul[1], bb) [5, 1]

     bb = fe6_mul_by_nonresidue(bb); // 7. bb = mnr(bb) [6]
-     
+
     fe12_mul[0] = fe6_add(bb, aa); // 8. fe6_mul[0] = add(add_i0, bb) [0, 1, 7]
   endfunction

   function fe12_t fe12_sqr(fe12_t a);
     fe6_t ab, c0c1;
-     
+
     ab = fe6_mul(a[0], a[1]);  // 0.
     c0c1 = fe6_add(a[0], a[1]);  // 1.   (wait eq0)
-     
+
     fe12_sqr[0] = fe6_mul_by_nonresidue(a[1]);
-     
+
     fe12_sqr[0] = fe6_add(fe12_sqr[0], a[0]);
     fe12_sqr[0] = fe6_mul(fe12_sqr[0], c0c1);
-     
+
     fe12_sqr[0] = fe6_sub(fe12_sqr[0], ab);
     fe12_sqr[1] = fe6_add(ab, ab);
-     
+
     ab = fe6_mul_by_nonresidue(ab);
     fe12_sqr[0] = fe6_sub(fe12_sqr[0], ab);
   endfunction
@ -821,10 +821,8 @@ package bls12_381_pkg;

     y0 = fe12_mul(r, r);

-
     y1 = fe12_pow(y0, bls_x);

-
     bls_x = bls_x >> 1;
     y2 = fe12_pow(y1, bls_x);
     bls_x = bls_x << 1;
@ -838,8 +836,6 @@ package bls12_381_pkg;
     y2 = fe12_pow(y1, bls_x);
     y3 = fe12_pow(y2, bls_x);

-
-
     y1[1] = fe6_sub(0, y1[1]);
     y3 = fe12_mul(y3, y1);

--- a/zcash_fpga/src/tb/bls12_381_fe12_inv_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_fe12_inv_tb.sv
@ -0,0 +1,137 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+// Testbench for bls12_381_fe12_inv_wrapper.
+// Streams random Fp12 elements into the wrapper and checks each result both
+// against the package model fe12_inv() and by multiplying it back with the
+// input (which must give FE12_one). The Fp multiplier the wrapper needs is
+// provided here by ec_fp_mult_mod.
+module bls12_381_fe12_inv_tb ();
+
+import common_pkg::*;
+import bls12_381_pkg::*;
+
+parameter type FE_TYPE   = bls12_381_pkg::fe_t;
+parameter type FE2_TYPE  = bls12_381_pkg::fe2_t;
+parameter type FE6_TYPE  = bls12_381_pkg::fe6_t;
+parameter type FE12_TYPE = bls12_381_pkg::fe12_t;
+parameter P              = bls12_381_pkg::P;
+
+localparam CTL_BITS = 64;
+
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+// Reset pulse: low, then high after 20 clock periods, then low again after 20 more
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin
+  clk = 0;
+  forever #(CLK_PERIOD/2) clk = ~clk;
+end
+
+// Named from the testbench's point of view: o_* is driven by the test, i_* is received
+if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) i_inv_fe12_if(clk);
+if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) o_inv_fe12_if(clk);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe_i_if(clk);
+
+// External Fp multiplier used by the wrapper
+ec_fp_mult_mod #(
+  .P             ( P        ),
+  .KARATSUBA_LVL ( 3        ),
+  .CTL_BITS      ( CTL_BITS )
+)
+ec_fp_mult_mod (
+  .i_clk( clk          ),
+  .i_rst( rst          ),
+  .i_mul ( mul_fe_o_if ),
+  .o_mul ( mul_fe_i_if )
+);
+
+// Device under test
+bls12_381_fe12_inv_wrapper #(
+  .FE_TYPE  ( FE_TYPE ),
+  .CTL_BITS ( CTL_BITS ),
+  .OVR_WRT_BIT ( 0 )
+)
+bls12_381_fe12_inv_wrapper (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .o_inv_fe12_if ( i_inv_fe12_if ),
+  .i_inv_fe12_if ( o_inv_fe12_if ),
+  .o_mul_fe_if   ( mul_fe_o_if   ),
+  .i_mul_fe_if   ( mul_fe_i_if   )
+);
+
+
+// Runs 10 random inversions; calls $fatal on the first mismatch.
+task test();
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in, get_dat;
+  integer start_time, finish_time;
+  FE12_TYPE  f_in, f_out, f_exp;
+  $display("Running test ...");
+
+  for (int lp = 0; lp < 10; lp++) begin
+    $display("Loop %d", lp);
+    dat_in = 0;
+    // Pack the 12 Fp coefficients into 384-bit slots, coefficient [0][0][0] lowest
+    for (int i = 0; i < 2; i++)
+      for (int j = 0; j < 3; j++)
+        for (int k = 0; k < 2; k++) begin
+          f_in[i][j][k] = random_vector(384/8) % P;
+          dat_in[(i*6+j*2+k)*384 +: $bits(FE_TYPE)] = {f_in[i][j][k]};
+        end
+
+    f_exp = fe12_inv(f_in);
+
+    start_time = $time;
+    fork
+      o_inv_fe12_if.put_stream(dat_in, 12*384/8);
+      i_inv_fe12_if.get_stream(get_dat, get_len);
+    join
+    finish_time = $time;
+
+    // Unpack the result using the same slot layout as the input
+    for (int i = 0; i < 2; i++)
+      for (int j = 0; j < 3; j++)
+        for (int k = 0; k < 2; k++)
+          f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
+
+    $display("test finished in %d clocks", (finish_time-start_time)/(CLK_PERIOD));
+
+    // Case inequality (!==) so that X/Z bits in the DUT output fail the check.
+    // With != an unknown result evaluates to X, which "if" treats as false,
+    // and the test would pass silently.
+    if (f_exp !== f_out) begin
+      $fatal(1, "%m %t ERROR: output was wrong", $time);
+    end
+
+    if (fe12_mul(f_out, f_in) !== FE12_one) begin
+      $fatal(1, "%m %t ERROR: output did not reduce to one", $time);
+    end
+
+  end
+
+  $display("all tests PASSED");
+end
+endtask
+
+initial begin
+  o_inv_fe12_if.reset_source();
+  i_inv_fe12_if.rdy = 0;
+  #100ns;
+
+  test();
+
+  #1us $finish();
+end
+
+endmodule
--- a/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
@ -92,11 +92,11 @@ subtractor_pipe (
  .o_sub ( sub_fe_i_if )
 );

-ec_fe2_mul #(
+ec_fe2_mul_s #(
  .FE_TYPE  ( FE_TYPE  ),
  .CTL_BITS ( CTL_BITS )
 )
-ec_fe2_mul (
+ec_fe2_mul_s (
  .i_clk ( clk ),
  .i_rst ( rst ),
  .o_mul_fe2_if ( mul_fe2_i_if ),