Add files for calculating the Frobenius map used in ate pairing.

2019-08-01 20:24:46 +08:00 · 2019-08-01 20:24:46 +08:00 · f8371eba2e
parent 3b4693d407
commit f8371eba2e
7 changed files with 963 additions and 27 deletions
--- a/ip_cores/ec/src/rtl/ec_fe2_mul.sv
+++ b/ip_cores/ec/src/rtl/ec_fe2_mul.sv
@ -0,0 +1,164 @@
+/*
+  This provides the interface to perform Fp2 field element mul. Uses the schoolbook method (four Fp multiplications: a0*b0, a1*b1, a1*b0, a0*b1), not the Karatsuba algorithm.
+
+  Inputs must be interleaved starting at c0 (i.e. clock 0 = {b.c0, a.c0})
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module ec_fe2_mul
+#(
+  parameter type FE_TYPE,                // Base field element type
+  parameter      CTL_BITS    = 12
+)(
+  input i_clk, i_rst,
+  // Fp2 multiplier interface. Operands arrive on i_mul_fe2_if as {b, a} words (c0 word first,
+  // then the c1 word); the product is returned on o_mul_fe2_if (c0 word first, then c1)
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if,
+  // Interface to FE_TYPE adder (mod P) 2*FE_TYPE data width
+  if_axi_stream.source o_add_fe_if,
+  if_axi_stream.sink   i_add_fe_if,
+  // Interface to FE_TYPE subtractor (mod P) 2*FE_TYPE data width
+  if_axi_stream.source o_sub_fe_if,
+  if_axi_stream.sink   i_sub_fe_if,
+  // Interface to FE_TYPE mul (mod P) 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if
+);
+
+FE_TYPE a, b; // Temp storage (hold a0 / b0, later b1, so the cross products can be issued)
+// mul_cnt     - which of the four Fp multiplies is issued next
+// add_sub_cnt - which of the four Fp products is expected next from the multiplier
+// out_cnt     - 0: next output word is the subtractor result (c0), 1: the adder result (c1)
+logic [1:0] mul_cnt, add_sub_cnt;
+logic out_cnt;
+
+// Back-pressure: each input is only ready when the output register it feeds is free
+// (not valid, or valid and being accepted this cycle)
+always_comb begin
+  // Fp2 operands are only consumed while issuing the first two multiplies
+  i_mul_fe2_if.rdy = (mul_cnt == 0 || mul_cnt == 1) && (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy));
+  // Products 0 and 1 feed the subtractor, products 2 and 3 feed the adder
+  i_mul_fe_if.rdy = (add_sub_cnt == 0 || add_sub_cnt == 1) ? ~o_sub_fe_if.val || (o_sub_fe_if.val && o_sub_fe_if.rdy) : 
+                     ~o_add_fe_if.val || (o_add_fe_if.val && o_add_fe_if.rdy);
+  i_add_fe_if.rdy = out_cnt == 1 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy));
+  i_sub_fe_if.rdy = out_cnt == 0 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy));
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_mul_fe2_if.reset_source();
+    o_add_fe_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_sub_fe_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_mul_fe_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    a <= 0;
+    b <= 0;
+    mul_cnt <= 0;
+    add_sub_cnt <= 0;
+    out_cnt <= 0;
+  end else begin
+
+    // Default: drop valid once a word has been accepted. The case statements below may
+    // override this in the same cycle (last non-blocking assignment wins).
+    if (o_add_fe_if.val && o_add_fe_if.rdy) o_add_fe_if.val <= 0;
+    if (o_sub_fe_if.val && o_sub_fe_if.rdy) o_sub_fe_if.val <= 0;
+    if (o_mul_fe_if.val && o_mul_fe_if.rdy) o_mul_fe_if.val <= 0;
+    if (o_mul_fe2_if.val && o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0;
+
+    // Stage 1: issue the four Fp multiplies
+    case(mul_cnt)
+      0: begin
+        if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+          o_mul_fe_if.dat <= i_mul_fe2_if.dat;  // a0 * b0
+          o_mul_fe_if.val <= i_mul_fe2_if.val;
+          o_mul_fe_if.ctl <= i_mul_fe2_if.ctl;  // ctl is captured on the first word only
+          {b, a} <= i_mul_fe2_if.dat;        
+          if (i_mul_fe2_if.val) mul_cnt <= mul_cnt + 1;
+        end
+      end
+      1: begin
+        if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+          o_mul_fe_if.dat <= i_mul_fe2_if.dat;  // a1 * b1
+          o_mul_fe_if.val <= i_mul_fe2_if.val;                                            
+          if (i_mul_fe2_if.val) mul_cnt <= mul_cnt + 1;
+        end
+      end
+      2: begin
+        if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+          // Lower half still holds a1 from the previous word; replace the upper half with the stored b0
+          o_mul_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= b; // a1 * b0
+          o_mul_fe_if.val <= 1;
+          mul_cnt <= mul_cnt + 1;
+          // Save b1 (upper half of the previous multiplier word) for the last product
+          b <= o_mul_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)];
+        end
+      end
+      3: begin
+        if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+          o_mul_fe_if.dat <= {a, b};  // b1 * a0
+          o_mul_fe_if.val <= 1;
+          mul_cnt <= 0;
+        end
+      end
+    endcase
+
+
+    // Stage 2: pair up the returned products and send them to the subtractor / adder.
+    // NOTE(review): assumes the multiplier returns products in the order they were issued - confirm
+    case(add_sub_cnt)
+      0: begin
+        if (~o_sub_fe_if.val || (o_sub_fe_if.val && o_sub_fe_if.rdy)) begin
+          o_sub_fe_if.dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat; 
+          if (i_mul_fe_if.val) add_sub_cnt <= add_sub_cnt + 1;
+        end
+      end
+      1: begin
+        o_sub_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
+        o_sub_fe_if.ctl <= i_mul_fe_if.ctl; // a0b0 - a1b1
+        if (i_mul_fe_if.val) begin
+          o_sub_fe_if.val <= 1;
+          add_sub_cnt <= add_sub_cnt + 1;
+        end
+      end
+      2: begin
+        if (~o_add_fe_if.val || (o_add_fe_if.val && o_add_fe_if.rdy)) begin
+          o_add_fe_if.dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
+          if (i_mul_fe_if.val) add_sub_cnt <= add_sub_cnt + 1;
+        end
+      end
+      3: begin        
+        o_add_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_fe_if.dat;
+        o_add_fe_if.ctl <= i_mul_fe_if.ctl; // a1b0 + a0b1
+        if (i_mul_fe_if.val) begin
+          o_add_fe_if.val <= 1;
+          add_sub_cnt <= add_sub_cnt + 1; // 2-bit counter wraps back to 0
+        end
+      end
+    endcase
+    
+    // Stage 3: interleave the results onto the Fp2 output - subtractor result first (sop),
+    // adder result second (eop)
+    case(out_cnt)
+      0: begin
+        if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin
+          o_mul_fe2_if.dat <= i_sub_fe_if.dat;
+          o_mul_fe2_if.sop <= 1;
+          o_mul_fe2_if.eop <= 0;
+          o_mul_fe2_if.ctl <= i_sub_fe_if.ctl;
+          o_mul_fe2_if.val <= i_sub_fe_if.val;
+          if (i_sub_fe_if.val) out_cnt <= out_cnt + 1;
+        end
+      end
+      1: begin
+        if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin
+          o_mul_fe2_if.dat <= i_add_fe_if.dat;
+          o_mul_fe2_if.sop <= 0;
+          o_mul_fe2_if.eop <= 1;
+          o_mul_fe2_if.ctl <= i_add_fe_if.ctl;
+          o_mul_fe2_if.val <= i_add_fe_if.val;
+          if (i_add_fe_if.val) out_cnt <= out_cnt + 1; // 1-bit counter wraps back to 0
+        end
+      end
+    endcase
+
+  end
+end
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_fe12_fmap.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_fe12_fmap.sv
@ -0,0 +1,100 @@
+/*
+  This does the Frobenius map calculation required in the final
+  exponentiation of the ate pairing on a Fp^12 element.
+
+  Input is expected to be streamed in with Fp .c0 in the first clock cycle
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_fe12_fmap
+  import bls12_381_pkg::*;
+#(
+  parameter type FE_TYPE = fe_t,                     // Base field element type
+  parameter      CTL_BITS    = 12,
+  parameter      CTL_BIT_POW = 8         // This is where we encode the power value with 2 bits - only 0,1,2,3 are supported
+)(
+  input i_clk, i_rst,
+  // Input/Output interfaces for fmap result, FE_TYPE data width
+  if_axi_stream.source o_fmap_fe12_if,
+  if_axi_stream.sink   i_fmap_fe12_if,
+  // Interface to FE6_TYPE fmap block, FE_TYPE data width
+  if_axi_stream.source o_fmap_fe6_if,
+  if_axi_stream.sink   i_fmap_fe6_if,
+  // Interface to FE2_TYPE mul (mod P), 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if
+);
+
+// out_cnt2 - counts input words forwarded to the Fp6 fmap block (0..5, frames 6-word packets)
+// out_cnt  - counts words returned by the Fp6 fmap block (0..11), selects the multiplier coefficient
+// out_cnt1 - counts words written to the final output (0..11, frames the 12-word result)
+logic [4:0] out_cnt, out_cnt1, out_cnt2;
+
+// Each input is ready when the output register it feeds is free or being accepted this cycle
+always_comb begin
+  i_fmap_fe12_if.rdy = ~o_fmap_fe6_if.val || (o_fmap_fe6_if.val && o_fmap_fe6_if.rdy);
+  i_fmap_fe6_if.rdy = ~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy);
+  i_mul_fe2_if.rdy = ~o_fmap_fe12_if.val || (o_fmap_fe12_if.val && o_fmap_fe12_if.rdy);
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_fmap_fe12_if.reset_source();
+    o_fmap_fe6_if.reset_source();
+    o_mul_fe2_if.reset_source();
+    out_cnt <= 0;
+    out_cnt1 <= 0;
+    out_cnt2 <= 0;
+  end else begin
+
+    // Default: clear valid after a word is accepted (may be overridden below in the same cycle)
+    if (o_fmap_fe12_if.val && o_fmap_fe12_if.rdy) o_fmap_fe12_if.val <= 0;
+    if (o_fmap_fe6_if.val && o_fmap_fe6_if.rdy) o_fmap_fe6_if.val <= 0;
+    if (o_mul_fe2_if.val && o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0;
+
+    // Stage 1: pass the input stream to the Fp6 fmap block, marking sop/eop every 6 words
+    if (~o_fmap_fe6_if.val || (o_fmap_fe6_if.val && o_fmap_fe6_if.rdy)) begin
+      o_fmap_fe6_if.val <= i_fmap_fe12_if.val;
+      o_fmap_fe6_if.sop <= out_cnt2 == 0;
+      o_fmap_fe6_if.eop <= out_cnt2 == 5;
+      o_fmap_fe6_if.ctl <= i_fmap_fe12_if.ctl;
+      o_fmap_fe6_if.dat <= i_fmap_fe12_if.dat;
+      out_cnt2 <= i_fmap_fe12_if.val ? out_cnt2 == 5 ? 0 : out_cnt2 + 1 : out_cnt2;
+    end
+
+    // Stage 2: send the Fp6 fmap results through the Fp2 multiplier, sop/eop on each word pair
+    if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin
+      o_mul_fe2_if.val <= i_fmap_fe6_if.val;
+      o_mul_fe2_if.sop <= out_cnt % 2 == 0;
+      o_mul_fe2_if.eop <= out_cnt % 2 == 1;
+      o_mul_fe2_if.ctl <= i_fmap_fe6_if.ctl;
+      case (out_cnt) inside
+        // First 6 words: second operand is 1 on the even word and 0 on the odd word (Fp2 one)
+        0,1,2,3,4,5: begin
+          o_mul_fe2_if.dat[0 +: $bits(FE_TYPE)] <= i_fmap_fe6_if.dat;
+          o_mul_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <=  out_cnt % 2 == 0 ? 1 : 0;
+        end
+        // Last 6 words: second operand is the Fq12 coefficient selected by the power bits in ctl
+        6,7,8,9,10,11: o_mul_fe2_if.dat <= {FROBENIUS_COEFF_FQ12_C1[i_fmap_fe6_if.ctl[CTL_BIT_POW +: 2]][out_cnt % 2], i_fmap_fe6_if.dat};
+      endcase
+      out_cnt <= i_fmap_fe6_if.val ? out_cnt == 11 ? 0 : out_cnt + 1 : out_cnt;
+    end
+
+
+    // Stage 3: collect the multiplier results into one 12-word output packet
+    if (~o_fmap_fe12_if.val || (o_fmap_fe12_if.val && o_fmap_fe12_if.rdy)) begin
+      o_fmap_fe12_if.val <= i_mul_fe2_if.val;
+      o_fmap_fe12_if.sop <= out_cnt1 == 0;
+      o_fmap_fe12_if.eop <= out_cnt1 == 11;
+      o_fmap_fe12_if.ctl <= i_mul_fe2_if.ctl;
+      o_fmap_fe12_if.dat <= i_mul_fe2_if.dat;
+      out_cnt1 <= i_mul_fe2_if.val ? out_cnt1 == 11 ? 0 : out_cnt1 + 1 : out_cnt1;
+    end
+  end
+end
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_fe12_fmap_wrapper.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_fe12_fmap_wrapper.sv
@ -0,0 +1,114 @@
+/*
+  This wraps the Fp^2, Fp^6 and Fp^12 Frobenius map blocks used for the Frobenius map
+  calculation required in the final exponentiation of the ate pairing on a Fp^12 element.
+
+  Input is expected to be streamed in with Fp .c0 in the first clock cycle
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_fe12_fmap_wrapper
+  import bls12_381_pkg::*;
+#(
+  parameter type FE_TYPE = fe_t,     
+  parameter      CTL_BITS    = 12,
+  parameter      CTL_BIT_POW = 8         // This is where we encode the power value with 2 bits - only 0,1,2,3 are supported - 1 extra bit required after this for control
+)(
+  input i_clk, i_rst,
+  // Input/Output interfaces for fmap result, FE_TYPE data width
+  if_axi_stream.source o_fmap_fe12_if,
+  if_axi_stream.sink   i_fmap_fe12_if,
+  // Interface to FE2_TYPE mul (mod P), 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if,
+  // Interface to FE_TYPE mul (mod P), 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if
+);
+
+// Fp2 multiplier request ([*_o]) / result ([*_i]) streams: index 0 is the Fp6 block, index 1 the Fp12 block
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_if_o [1:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe2_if_i [1:0] (i_clk);
+
+// Fp12 block -> Fp6 block (fmap_fe6_if_o) and Fp6 block -> Fp12 block (fmap_fe6_if_i)
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe6_if_o (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe6_if_i (i_clk);
+
+// Fp6 block -> Fp2 block (fmap_fe2_if_o) and Fp2 block -> Fp6 block (fmap_fe2_if_i)
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe2_if_o (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe2_if_i (i_clk);
+
+// Fp2 Frobenius map - uses the external Fp multiplier directly
+bls12_381_fe2_fmap #(
+  .FE_TYPE     ( FE_TYPE     ),
+  .CTL_BITS    ( CTL_BITS    ),
+  .CTL_BIT_POW ( CTL_BIT_POW )
+)
+bls12_381_fe2_fmap (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_fmap_fe2_if ( fmap_fe2_if_i ),
+  .i_fmap_fe2_if ( fmap_fe2_if_o ),
+  .o_mul_fe_if   ( o_mul_fe_if   ),
+  .i_mul_fe_if   ( i_mul_fe_if   )
+);
+
+// Fp6 Frobenius map - uses the Fp2 block and port 0 of the shared Fp2 multiplier
+bls12_381_fe6_fmap #(
+  .FE_TYPE     ( FE_TYPE     ),
+  .CTL_BITS    ( CTL_BITS    ),
+  .CTL_BIT_POW ( CTL_BIT_POW )
+)
+bls12_381_fe6_fmap (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_fmap_fe6_if ( fmap_fe6_if_i   ),
+  .i_fmap_fe6_if ( fmap_fe6_if_o   ),
+  .o_fmap_fe2_if ( fmap_fe2_if_o   ),
+  .i_fmap_fe2_if ( fmap_fe2_if_i   ),
+  .o_mul_fe2_if  ( mul_fe2_if_o[0] ),
+  .i_mul_fe2_if  ( mul_fe2_if_i[0] )
+);
+
+// Fp12 Frobenius map - uses the Fp6 block and port 1 of the shared Fp2 multiplier
+bls12_381_fe12_fmap #(
+  .FE_TYPE     ( FE_TYPE     ),
+  .CTL_BITS    ( CTL_BITS    ),
+  .CTL_BIT_POW ( CTL_BIT_POW )
+)
+bls12_381_fe12_fmap (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .o_fmap_fe12_if ( o_fmap_fe12_if  ),
+  .i_fmap_fe12_if ( i_fmap_fe12_if  ),
+  .o_fmap_fe6_if  ( fmap_fe6_if_o   ),
+  .i_fmap_fe6_if  ( fmap_fe6_if_i   ),
+  .o_mul_fe2_if   ( mul_fe2_if_o[1] ),
+  .i_mul_fe2_if   ( mul_fe2_if_i[1] )
+);
+
+// Shares the single external Fp2 multiplier between the Fp6 and Fp12 blocks.
+// NOTE(review): OVR_WRT_BIT presumably selects the ctl bit(s) resource_share overwrites to route
+// results back - it sits directly above the 2 power bits at CTL_BIT_POW; confirm against resource_share
+resource_share # (
+  .NUM_IN       ( 2                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( CTL_BIT_POW+2    ),
+  .PIPELINE_IN  ( 0                ),
+  .PIPELINE_OUT ( 0                )
+)
+resource_share_fe2_mul (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_axi ( mul_fe2_if_o[1:0] ),
+  .o_res ( o_mul_fe2_if      ),
+  .i_res ( i_mul_fe2_if      ),
+  .o_axi ( mul_fe2_if_i[1:0] )
+);
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_fe2_fmap.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_fe2_fmap.sv
@ -0,0 +1,84 @@
+/*
+  This does the Frobenius map calculation required in the final
+  exponentiation of the ate pairing on a Fp^2 element.
+
+  Input is expected to be streamed in with Fp .c0 in the first clock cycle
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_fe2_fmap
+  import bls12_381_pkg::*;
+#(
+  parameter type FE_TYPE = fe_t,         // Base field element type
+  parameter      CTL_BITS    = 12,
+  parameter      CTL_BIT_POW = 8         // This is where we encode the power value with 2 bits - only 0,1,2,3 are supported
+)(
+  input i_clk, i_rst,
+  // Input/Output interfaces for fmap result, FE_TYPE data width
+  if_axi_stream.source o_fmap_fe2_if,
+  if_axi_stream.sink   i_fmap_fe2_if,
+  // Interface to FE_TYPE mul (mod P), 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if
+);
+
+// 0: next input word is c0 (multiplied by 1), 1: next input word is c1 (multiplied by the coefficient)
+logic mul_cnt;
+
+// Each input is ready when the output register it feeds is free or being accepted this cycle
+always_comb begin
+  i_fmap_fe2_if.rdy = ~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy);
+  i_mul_fe_if.rdy = ~o_fmap_fe2_if.val || (o_fmap_fe2_if.val && o_fmap_fe2_if.rdy);
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_fmap_fe2_if.reset_source();
+    o_mul_fe_if.reset_source();
+    mul_cnt <= 0;
+  end else begin
+
+    // Default: clear valid after a word is accepted (may be overridden below in the same cycle)
+    if (o_mul_fe_if.val && o_mul_fe_if.rdy) o_mul_fe_if.val <= 0;
+    if (o_fmap_fe2_if.val && o_fmap_fe2_if.rdy) o_fmap_fe2_if.val <= 0;
+
+    // Stage 1: send each input word to the Fp multiplier together with its coefficient
+    if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+      case(mul_cnt)
+        0: begin
+          // c0 is unchanged by the map, so it is multiplied by 1
+          o_mul_fe_if.dat[0 +: $bits(FE_TYPE)] <= i_fmap_fe2_if.dat;
+          o_mul_fe_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= 1;
+        end
+        1: begin
+          // FROBENIUS_COEFF_FQ2_C1 only has two entries (indexed by power % 2, as in the
+          // fe2_fmap() model in bls12_381_pkg), so only the LSB of the 2-bit power field may be
+          // used as the index - indexing with both bits reads out of range for powers 2 and 3.
+          o_mul_fe_if.dat <= {i_fmap_fe2_if.dat, FROBENIUS_COEFF_FQ2_C1[i_fmap_fe2_if.ctl[CTL_BIT_POW]]};
+        end
+      endcase
+      o_mul_fe_if.val <= i_fmap_fe2_if.val;
+      o_mul_fe_if.ctl <= i_fmap_fe2_if.ctl;
+      o_mul_fe_if.sop <= 1;
+      o_mul_fe_if.eop <= 1;
+      mul_cnt <= i_fmap_fe2_if.val ? mul_cnt + 1 : mul_cnt; // 1-bit counter wraps back to 0
+    end
+
+    // Stage 2: forward multiplier results, toggling sop/eop on every valid word
+    if (~o_fmap_fe2_if.val || (o_fmap_fe2_if.val && o_fmap_fe2_if.rdy)) begin
+      o_fmap_fe2_if.val <= i_mul_fe_if.val;
+      o_fmap_fe2_if.eop <= i_mul_fe_if.val ? o_fmap_fe2_if.sop : o_fmap_fe2_if.eop;
+      o_fmap_fe2_if.sop <= i_mul_fe_if.val ? ~o_fmap_fe2_if.sop : o_fmap_fe2_if.sop;
+      o_fmap_fe2_if.dat <= i_mul_fe_if.dat;
+      o_fmap_fe2_if.ctl <= i_mul_fe_if.ctl;
+    end
+  end
+end
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_fe6_fmap.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_fe6_fmap.sv
@ -0,0 +1,99 @@
+/*
+  This does the Frobenius map calculation required in the final
+  exponentiation of the ate pairing on a Fp^6 element.
+
+  Input is expected to be streamed in with Fp .c0 in the first clock cycle
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_fe6_fmap
+  import bls12_381_pkg::*;
+#(
+  parameter type FE_TYPE     = fe_t,     // Base field element type
+  parameter      CTL_BITS    = 12,
+  parameter      CTL_BIT_POW = 8         // This is where we encode the power value with 2 bits - only 0,1,2,3 are supported
+)(
+  input i_clk, i_rst,
+  // Input/Output interfaces for fmap result, FE_TYPE data width
+  if_axi_stream.source o_fmap_fe6_if,
+  if_axi_stream.sink   i_fmap_fe6_if,
+  // Interface to FE2_TYPE fmap block, FE_TYPE data width
+  if_axi_stream.source o_fmap_fe2_if,
+  if_axi_stream.sink   i_fmap_fe2_if,
+  // Interface to FE2_TYPE mul (mod P), 2*FE_TYPE data width
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if
+);
+
+// out_cnt  - counts words returned by the Fp2 fmap block (0..5), selects the multiplier coefficient
+// out_cnt1 - counts words written to the final output (0..5, frames the 6-word result)
+logic [2:0] out_cnt, out_cnt1;
+
+// Each input is ready when the output register it feeds is free or being accepted this cycle
+always_comb begin
+  i_fmap_fe6_if.rdy = ~o_fmap_fe2_if.val || (o_fmap_fe2_if.val && o_fmap_fe2_if.rdy);
+  i_fmap_fe2_if.rdy = ~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy);
+  i_mul_fe2_if.rdy = ~o_fmap_fe6_if.val || (o_fmap_fe6_if.val && o_fmap_fe6_if.rdy);
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_fmap_fe6_if.reset_source();
+    o_fmap_fe2_if.reset_source();
+    o_mul_fe2_if.reset_source();
+    out_cnt <= 0;
+    out_cnt1 <= 0;
+  end else begin
+
+    // Default: clear valid after a word is accepted (may be overridden below in the same cycle)
+    if (o_fmap_fe6_if.val && o_fmap_fe6_if.rdy) o_fmap_fe6_if.val <= 0;
+    if (o_fmap_fe2_if.val && o_fmap_fe2_if.rdy) o_fmap_fe2_if.val <= 0;
+    if (o_mul_fe2_if.val && o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0;
+
+    // Stage 1: pass the input stream to the Fp2 fmap block, toggling sop/eop on every valid word
+    if (~o_fmap_fe2_if.val || (o_fmap_fe2_if.val && o_fmap_fe2_if.rdy)) begin
+      o_fmap_fe2_if.val <= i_fmap_fe6_if.val;
+      o_fmap_fe2_if.dat <= i_fmap_fe6_if.dat;
+      o_fmap_fe2_if.sop <= i_fmap_fe6_if.val ? ~o_fmap_fe2_if.sop : o_fmap_fe2_if.sop;
+      o_fmap_fe2_if.eop <= i_fmap_fe6_if.val ? o_fmap_fe2_if.sop : o_fmap_fe2_if.eop;
+      o_fmap_fe2_if.ctl <= i_fmap_fe6_if.ctl;
+    end
+
+    // Stage 2: send the Fp2 fmap results through the Fp2 multiplier, sop/eop on each word pair
+    if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin
+      o_mul_fe2_if.val <= i_fmap_fe2_if.val;
+      o_mul_fe2_if.sop <= out_cnt % 2 == 0;
+      o_mul_fe2_if.eop <= out_cnt % 2 == 1;
+      o_mul_fe2_if.ctl <= i_fmap_fe2_if.ctl;
+      case (out_cnt) inside
+        // First Fp2 element: second operand is 1 on word 0 and 0 on word 1 (Fp2 one)
+        0,1: begin
+          o_mul_fe2_if.dat[0 +: $bits(FE_TYPE)] <= i_fmap_fe2_if.dat;
+          o_mul_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= out_cnt == 0 ? 1 : 0;
+        end
+        // Second / third Fp2 element: second operand is the C1 / C2 coefficient selected by the power bits in ctl
+        2,3: o_mul_fe2_if.dat <= {FROBENIUS_COEFF_FQ6_C1[i_fmap_fe2_if.ctl[CTL_BIT_POW +: 2]][out_cnt % 2], i_fmap_fe2_if.dat};
+        4,5: o_mul_fe2_if.dat <= {FROBENIUS_COEFF_FQ6_C2[i_fmap_fe2_if.ctl[CTL_BIT_POW +: 2]][out_cnt % 2], i_fmap_fe2_if.dat};
+      endcase
+      out_cnt <= i_fmap_fe2_if.val ? out_cnt == 5 ? 0 : out_cnt + 1 : out_cnt;
+    end
+
+
+    // Stage 3: collect the multiplier results into one 6-word output packet
+    if (~o_fmap_fe6_if.val || (o_fmap_fe6_if.val && o_fmap_fe6_if.rdy)) begin
+      o_fmap_fe6_if.val <= i_mul_fe2_if.val;
+      o_fmap_fe6_if.sop <= out_cnt1 == 0;
+      o_fmap_fe6_if.eop <= out_cnt1 == 5;
+      o_fmap_fe6_if.ctl <= i_mul_fe2_if.ctl;
+      o_fmap_fe6_if.dat <= i_mul_fe2_if.dat;
+      out_cnt1 <= i_mul_fe2_if.val ? out_cnt1 == 5 ? 0 : out_cnt1 + 1 : out_cnt1;
+    end
+  end
+end
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@ -26,7 +26,7 @@ package bls12_381_pkg;

  fe_t Gx = 381'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
  fe_t Gy = 381'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;
-
+  
  localparam [63:0] ATE_X = 64'hd201000000010000;
  localparam ATE_X_START = 63;

@ -71,7 +71,47 @@ package bls12_381_pkg;
  typedef fe_t  [1:0] fe2_t;
  typedef fe2_t [2:0] fe6_t;
  typedef fe6_t [1:0] fe12_t;
+  
+  // These are used in the final exponentiation of the pairing.
+  // We only list coeff needed for powers of 0,1,2,3
+  // The tables are declared [3:0], so the first entry listed is index (power) 3 and the last
+  // entry listed is index (power) 0. Each fe2_t entry is written as {c1, c0}.
+  parameter fe2_t FROBENIUS_COEFF_FQ12_C1 [3:0] = {
+     {381'h06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09, 
+      381'h135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2},
+     {381'h0,
+      381'h00000000000000005f19672fdf76ce51ba69c6076a0f77eaddb3a93be6f89688de17d813620a00022e01fffffffeffff},
+     {381'h00fc3e2b36c4e03288e9e902231f9fb854a14787b6c7b36fec0c8ec971f63c5f282d5ac14d6c7ec22cf78a126ddc4af3,
+      381'h1904d3bf02bb0667c231beb4202c0d1f0fd603fd3cbd5f4f7b2443d784bab9c4f67ea53d63e7813d8d0775ed92235fb8},
+     {381'h0,
+      381'h1}};
+    
+  // Coefficient applied to the second Fp2 element (index 1) of an Fp6 element
+  parameter fe2_t FROBENIUS_COEFF_FQ6_C1 [3:0] = {
+     {381'h1,
+      381'h0},
+     {381'h0,
+      381'h00000000000000005f19672fdf76ce51ba69c6076a0f77eaddb3a93be6f89688de17d813620a00022e01fffffffefffe},
+     {381'h1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac,
+      381'h0},
+     {381'h0,
+      381'h1}};
+
+
+  // Coefficient applied to the third Fp2 element (index 2) of an Fp6 element
+  parameter fe2_t FROBENIUS_COEFF_FQ6_C2 [3:0] = {
+     {381'h0,
+      381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaaa},
+     {381'h0,
+      381'h1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac},
+     {381'h0,
+      381'h1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad},
+     {381'h0,
+      381'h1}};
+      
+  // Coefficient applied to c1 of an Fp2 element. NOTE: unlike the tables above this one has only
+  // two entries ([1:0]) and must be indexed with (power % 2): index 1 is listed first, index 0 is 1.
+  parameter fe_t FROBENIUS_COEFF_FQ2_C1 [1:0] = {
+      381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaaa,
+      381'h000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001};
+
+        
+        
+  // Generator points for G2
  fe2_t G2x = {381'h13e02b6052719f607dacd3a088274f65596bd0d09920b61ab5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e,
               381'h024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8};

@ -326,7 +366,7 @@ package bls12_381_pkg;

  endfunction

-   function jb_point_t point_mult(logic [DAT_BITS-1:0] c, jb_point_t p);
+   function jb_point_t point_mult(input logic [DAT_BITS-1:0] c, jb_point_t p);
     jb_point_t result, addend;
     result = 0;
     addend = p;
@ -340,7 +380,7 @@ package bls12_381_pkg;
     return result;
   endfunction

-   function fp2_jb_point_t fp2_point_mult(logic [DAT_BITS-1:0] c, fp2_jb_point_t p);
+   function fp2_jb_point_t fp2_point_mult(input logic [DAT_BITS-1:0] c, fp2_jb_point_t p);
     fp2_jb_point_t result, addend;
     result = 0;
     addend = p;
@ -425,6 +465,50 @@ package bls12_381_pkg;
     fe6_mul_by_nonresidue[2] = a[1];
     fe6_mul_by_nonresidue[0] = fe2_mul_by_nonresidue(a[2]);
   endfunction
+   
+   // Inverse of an Fp6 element a = (a[0], a[1], a[2]).
+   // Builds three Fp2 cofactors c0, c1, c2, derives one Fp2 value from them, inverts that
+   // single Fp2 value and scales each cofactor by the inverse.
+   function fe6_t fe6_inv(fe6_t a);
+     fe2_t c0, c1, c2, lhs, rhs, scale;
+
+     // c0 = a[0]^2 - mnr(a[2]) * a[1]
+     c0 = fe2_mul_by_nonresidue(a[2]);
+     c0 = fe2_mul(c0, a[1]);
+     c0 = fe2_sub(0, c0);
+     lhs = fe2_mul(a[0], a[0]);
+     c0 = fe2_add(lhs, c0);
+
+     // c1 = mnr(a[2]^2) - a[0] * a[1]
+     c1 = fe2_mul(a[2], a[2]);
+     c1 = fe2_mul_by_nonresidue(c1);
+     rhs = fe2_mul(a[0], a[1]);
+     c1 = fe2_sub(c1, rhs);
+
+     // c2 = a[1]^2 - a[2] * a[0]
+     c2 = fe2_mul(a[1], a[1]);
+     rhs = fe2_mul(a[2], a[0]);
+     c2 = fe2_sub(c2, rhs);
+
+     // rhs = mnr(a[2] * c1 + a[1] * c2) + a[0] * c0
+     lhs = fe2_mul(a[2], c1);
+     rhs = fe2_mul(a[1], c2);
+     rhs = fe2_add(lhs, rhs);
+     rhs = fe2_mul_by_nonresidue(rhs);
+     lhs = fe2_mul(a[0], c0);
+     rhs = fe2_add(rhs, lhs);
+
+     // Only one Fp2 inversion is needed; every cofactor is scaled by it
+     scale = fe2_inv(rhs);
+
+     fe6_inv[0] = fe2_mul(c0, scale);
+     fe6_inv[1] = fe2_mul(c1, scale);
+     fe6_inv[2] = fe2_mul(c2, scale);
+   endfunction
+   
+   // Inverse of an Fp12 element a = (a[0], a[1]).
+   // Computes t = (a[0]^2 - mnr(a[1]^2))^-1 in Fp6 and returns (a[0] * t, -(a[1] * t)).
+   function fe12_t fe12_inv(fe12_t a);
+     // All intermediates are Fp6 values, so they are declared fe6_t (they were previously
+     // fe12_t and relied on implicit truncation / zero extension at every call).
+     fe6_t sub_i0, sub_i1;
+     sub_i0 = fe6_mul(a[0], a[0]);            // a[0]^2
+     sub_i1 = fe6_mul(a[1], a[1]);            // a[1]^2
+     sub_i1 = fe6_mul_by_nonresidue(sub_i1);  // mnr(a[1]^2)
+     sub_i0 = fe6_sub(sub_i0, sub_i1);
+     sub_i0 = fe6_inv(sub_i0);                // t
+     fe12_inv[0] = fe6_mul(a[0], sub_i0);
+     fe12_inv[1] = fe6_mul(a[1], sub_i0);
+     fe12_inv[1] = fe6_sub(0, fe12_inv[1]);   // negate the second coefficient
+   endfunction

   function fe6_t fe6_add(fe6_t a, b);
     for(int i = 0; i < 3; i++)
@ -495,6 +579,20 @@ package bls12_381_pkg;
     bb = fe6_mul_by_nonresidue(bb); // 7. bb = mnr(bb) [6]
     fe12_mul[0] = fe6_add(bb, aa); // 8. fe6_mul[0] = add(add_i0, bb) [0, 1, 7]
   endfunction
+   
+   // Square of an Fp12 element a = (a[0], a[1]) using one product ab = a[0] * a[1]:
+   //   c0 = (mnr(a[1]) + a[0]) * (a[0] + a[1]) - ab - mnr(ab)   (= a[0]^2 + mnr(a[1]^2))
+   //   c1 = ab + ab
+   function fe12_t fe12_sqr(fe12_t a);
+     fe6_t sub_i1, mul_i0, mul_i1;
+     sub_i1 = fe6_mul(a[0], a[1]);            // ab
+     mul_i0 = fe6_add(a[0], a[1]);            // a[0] + a[1]
+     mul_i1 = fe6_mul_by_nonresidue(a[1]);
+     // Must add a[0] here; adding mul_i1 to itself gives 2 * mnr(a[1]) and a wrong c0
+     mul_i1 = fe6_add(mul_i1, a[0]);          // mnr(a[1]) + a[0]
+     fe12_sqr[0] = fe6_mul(mul_i1, mul_i0);
+     fe12_sqr[0] = fe6_sub(fe12_sqr[0], sub_i1);
+     // Fp6 addition: fe2_add would only add the lowest Fp2 coefficient of ab
+     fe12_sqr[1] = fe6_add(sub_i1, sub_i1);   // c1 = 2 * ab
+     sub_i1 = fe6_mul_by_nonresidue(sub_i1);  // mnr(ab)
+     fe12_sqr[0] = fe6_sub(fe12_sqr[0], sub_i1);
+   endfunction


   // This performs the miller loop
@ -520,7 +618,12 @@ package bls12_381_pkg;
    end

  endtask
-
+  
+  // Full ate pairing of P and Q: runs the miller loop, then applies the final
+  // exponentiation in place on the miller loop result. The pairing value is returned in f.
+  task automatic ate_pairing(input af_point_t P, input fp2_af_point_t Q, ref fe12_t f);
+    miller_loop(P, Q, f);
+    final_exponent(f);
+  endtask : ate_pairing
+  
   // This performs both the line evaluation and the doubling
   // Returns a sparse f12 element
  task automatic miller_double_step(ref fp2_jb_point_t R, input af_point_t P, ref fe12_t f);
@ -656,30 +759,101 @@ package bls12_381_pkg;
     f = {{FE2_zero, t10, FE2_zero}, {FE2_zero, t1, t9}};

   endtask
-
-   // Calculates the final exponent used in ate pairing
-   /*task automatic final_exponent(ref fe12_t f);
-     f = fe12_sub(0, f); // TODO can remove this?
-
-
-   endtask*/
-
-   // Sparse multiplication by coefficients 0,1,4
-   function fe12_t f12_sparse_mul_014(fe12_t f, fe2_t c0, c1, c4);
-     fe6_t aa, bb;
-     fe2_t t;
-     aa = fe6_mul(f[0], {FE2_zero, c1, c0});      // TODO implement sparse fp6
-     bb = fe6_mul(f[1], {FE2_zero, c4, FE2_zero});  // TODO implement sparse fp6
-     t = fe2_add(c1, c4);
-     f[1] = fe6_add(f[1], f[0]);
-     f[1] = fe6_mul(f[1], {FE2_zero, t, c0});
-     f[1] = fe6_sub(f[1], aa);
-     f[1] = fe6_sub(f[1], bb);
-     f[0] = fe6_mul_by_nonresidue(bb);
-     f[0] = fe6_add(f[0], aa);
-
-     return f;
+ 
+   // Frobenius map on an Fp2 element: c0 is passed through unchanged and c1 is multiplied
+   // by the coefficient selected by (pow % 2).
+   function fe2_t fe2_fmap(input fe2_t a, input int pow);
+     fe_t coeff;
+     coeff = FROBENIUS_COEFF_FQ2_C1[pow % 2];
+     fe2_fmap[1] = fe_mul(a[1], coeff);
+     fe2_fmap[0] = a[0];
    endfunction
+     
+   // Frobenius map on an Fp6 element: every Fp2 coefficient goes through fe2_fmap, then
+   // coefficients 1 and 2 are multiplied by their per-power constants.
+   // NOTE(review): the constant tables are declared [3:0] but are indexed with (pow % 6), so
+   // pow % 6 of 4 or 5 reads outside the tables - callers should keep pow in 0..3.
+   function fe6_t fe6_fmap(input fe6_t a, input int pow);
+     for (int i = 0; i < 3; i++)
+       fe6_fmap[i] = fe2_fmap(a[i], pow);
+     fe6_fmap[1] = fe2_mul(fe6_fmap[1], FROBENIUS_COEFF_FQ6_C1[pow % 6]);
+     fe6_fmap[2] = fe2_mul(fe6_fmap[2], FROBENIUS_COEFF_FQ6_C2[pow % 6]);
+   endfunction
+   
+   
+   // Frobenius map on an Fp12 element: both Fp6 halves go through fe6_fmap, then every Fp2
+   // coefficient of the upper half is multiplied by the per-power Fq12 constant.
+   // NOTE(review): the constant table is declared [3:0] but is indexed with (pow % 12), so
+   // pow % 12 above 3 reads outside the table - callers should keep pow in 0..3.
+   function fe12_t fe12_fmap(input fe12_t a, input int pow);
+     for (int h = 0; h < 2; h++)
+       fe12_fmap[h] = fe6_fmap(a[h], pow);
+     for (int k = 0; k < 3; k++)
+       fe12_fmap[1][k] = fe2_mul(fe12_fmap[1][k], FROBENIUS_COEFF_FQ12_C1[pow % 12]);
+   endfunction
+   
+   // Raises a to the power pow (at most a 1024 bit exponent) with right-to-left square and
+   // multiply, then negates the upper Fp6 half of the result before returning it.
+   // NOTE: because of that final negation this is not a plain power function.
+   function fe12_t fe12_pow(input fe12_t a, input logic [1023:0] pow);
+      fe12_t         sq;         // a^(2^i) for the exponent bit currently examined
+      logic [1023:0] remaining;  // exponent bits not yet consumed
+
+      sq = a;
+      remaining = pow;
+      fe12_pow = FE12_one;
+
+      while (remaining != 0) begin
+        if (remaining[0])
+          fe12_pow = fe12_mul(fe12_pow, sq);
+        sq = fe12_mul(sq, sq);
+        remaining = remaining >> 1;
+      end
+
+      fe12_pow[1] = fe6_sub(0, fe12_pow[1]);
+   endfunction
+   
+   // Calculates the final exponent used in ate pairing
+   // f is read as the input value and overwritten with the result.
+   // Below, "conjugate" means negating the upper Fp6 half ([1]) of an Fp12 value, and
+   // fe12_pow(x, e) is x^e followed by a conjugate (see fe12_pow).
+   task automatic final_exponent(ref fe12_t f);
+     fe12_t mul_i1, y0, y1, y2, y3, r, r_inv;
+     logic [63:0] bls_x;
+     bls_x = ATE_X;
+     
+     // r = f * (conjugate(f))^-1
+     r = f;
+     r[1] = fe6_sub(0, r[1]);
+     r_inv = fe12_inv(r); 
+     r = fe12_mul(f, r_inv);
+     
+     // r = fmap(r, 2) * r
+     mul_i1 = fe12_fmap(r, 2);
+     
+     r = fe12_mul(mul_i1, r);
+     
+     // y0 = r^2
+     y0 = fe12_mul(r, r);
+    
+     
+     y1 = fe12_pow(y0, bls_x);
+
+     
+     // y2 = fe12_pow(y1, bls_x / 2); the shift is undone afterwards so bls_x holds ATE_X again
+     // (the low bit of ATE_X is 0, so no bit is lost)
+     bls_x = bls_x >> 1;
+     y2 = fe12_pow(y1, bls_x);
+     bls_x = bls_x << 1;
+ 
+     // y1 = conjugate(y1 * conjugate(r)) * y2
+     y3 = r;
+     y3[1] = fe6_sub(0, y3[1]);
+     y1 = fe12_mul(y1, y3);
+     y1[1] = fe6_sub(0, y1[1]);
+     y1 = fe12_mul(y1, y2);
+     
+     y2 = fe12_pow(y1, bls_x);
+     y3 = fe12_pow(y2, bls_x);
+
+
+     
+     // y3 = y3 * conjugate(y1); y1 is conjugated only temporarily and restored right after
+     y1[1] = fe6_sub(0, y1[1]);
+     y3 = fe12_mul(y3, y1);
+     
+     y1[1] = fe6_sub(0, y1[1]);
+     y1 = fe12_fmap(y1, 3);
+     
+     // y1 = fmap(y1, 3) * fmap(y2, 2)
+     y2 = fe12_fmap(y2, 2);
+     y1 = fe12_mul(y1, y2);
+     
+     // y1 = y1 * (fe12_pow(y3, bls_x) * y0 * r)
+     y2 = fe12_pow(y3, bls_x);
+     
+     y2 = fe12_mul(y2, y0);
+     
+     y2 = fe12_mul(y2, r);
+     y1 = fe12_mul(y1, y2);
+     
+     // y1 = y1 * fmap(y3, 1)
+     y2 = fe12_fmap(y3, 1);
+     y1 = fe12_mul(y1, y2);
+     
+     f = y1;
+   endtask
+

   function af_point_t to_affine(jb_point_t p);
     fe_t z_;
--- a/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
@ -0,0 +1,201 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+// Testbench for bls12_381_fe12_fmap_wrapper. Random fe12 elements are streamed into
+// the wrapper with the power carried in the stream's ctl field, and the streamed
+// result is compared against the fe12_fmap() function from bls12_381_pkg.
+module bls12_381_fmap_tb ();
+
+import common_pkg::*;
+import bls12_381_pkg::*;
+
+parameter type FE_TYPE   = bls12_381_pkg::fe_t;
+parameter P              = bls12_381_pkg::P;
+
+
+localparam CTL_BITS = 32;
+
+// clk toggles every CLK_PERIOD time units, so one full clock cycle is 2*CLK_PERIOD
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+// rst starts low, goes high after 20*CLK_PERIOD and low again after a further 20*CLK_PERIOD
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+// Fe multiplier streams: index [0] is used by the fmap wrapper, [1] by ec_fe2_mul,
+// and [2] is the side of resource_share connected to the single ec_fp_mult_mod
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if [2:0] (clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_i_if [2:0] (clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_o_if (clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_i_if (clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_o_if (clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_i_if (clk);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_o_if (clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_i_if (clk);
+
+// Streams driven / observed by the test task
+if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) fmap_fe12_o_if (clk);
+if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) fmap_fe12_i_if (clk);
+
+ec_fp_mult_mod #(
+  .P             ( P        ),
+  .KARATSUBA_LVL ( 3        ),
+  .CTL_BITS      ( CTL_BITS )
+)
+ec_fp_mult_mod (
+  .i_clk( clk         ),
+  .i_rst( rst         ),
+  .i_mul ( mul_fe_o_if[2] ),
+  .o_mul ( mul_fe_i_if[2] )
+);
+
+adder_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( P        ),
+  .CTL_BITS ( CTL_BITS ),
+  .LEVEL    ( 2        )
+)
+adder_pipe (
+  .i_clk ( clk        ),
+  .i_rst ( rst        ),
+  .i_add ( add_fe_o_if ),
+  .o_add ( add_fe_i_if )
+);
+
+subtractor_pipe # (
+  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .P        ( P        ),
+  .CTL_BITS ( CTL_BITS ),
+  .LEVEL    ( 2        )
+)
+subtractor_pipe (
+  .i_clk ( clk        ),
+  .i_rst ( rst        ),
+  .i_sub ( sub_fe_o_if ),
+  .o_sub ( sub_fe_i_if )
+);
+
+ec_fe2_mul #(
+  .FE_TYPE  ( FE_TYPE  ),
+  .CTL_BITS ( CTL_BITS )
+)
+ec_fe2_mul (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .o_mul_fe2_if ( mul_fe2_i_if ),
+  .i_mul_fe2_if ( mul_fe2_o_if ),
+  .o_add_fe_if ( add_fe_o_if ),
+  .i_add_fe_if ( add_fe_i_if ),
+  .o_sub_fe_if ( sub_fe_o_if),
+  .i_sub_fe_if ( sub_fe_i_if ),
+  .o_mul_fe_if ( mul_fe_o_if[1] ),
+  .i_mul_fe_if ( mul_fe_i_if[1] )
+);
+
+// Connects the two fe multiplier requesters ([1:0]) to the single multiplier port ([2])
+resource_share # (
+  .NUM_IN       ( 2                ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( 4                ),
+  .PIPELINE_IN  ( 0                ),
+  .PIPELINE_OUT ( 0                )
+)
+resource_share_fe_mul (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_axi ( mul_fe_o_if[1:0] ),
+  .o_res ( mul_fe_o_if[2]   ),
+  .i_res ( mul_fe_i_if[2]   ),
+  .o_axi ( mul_fe_i_if[1:0] )
+);
+
+// Device under test
+bls12_381_fe12_fmap_wrapper #(
+  .FE_TYPE     ( FE_TYPE  ),
+  .CTL_BITS    ( CTL_BITS ),
+  .CTL_BIT_POW ( 0        )
+)
+bls12_381_fe12_fmap_wrapper (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .o_fmap_fe12_if ( fmap_fe12_o_if ),
+  .i_fmap_fe12_if ( fmap_fe12_i_if ),
+  .o_mul_fe2_if ( mul_fe2_o_if ),
+  .i_mul_fe2_if ( mul_fe2_i_if ),
+  .o_mul_fe_if ( mul_fe_o_if[0] ),
+  .i_mul_fe_if ( mul_fe_i_if[0] )
+);
+
+// For pow = 0..3: build a random fe12 element, send it through the DUT with pow in the
+// ctl field, and compare the received element with fe12_fmap(f, pow). Any mismatch
+// prints the input, actual and expected values and ends the simulation via $fatal.
+task test();
+  fe12_t f, f_exp, f_out;
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat, dat_in;
+
+  $display("Running test ...");
+  dat_in = 0;
+  
+  for (int pow = 0; pow < 4; pow++) begin
+    // Each of the 12 coefficients occupies its own 384-bit slot of dat_in,
+    // ordered by flat index i*6 + j*2 + k
+    for (int i = 0; i < 2; i++)
+      for (int j = 0; j < 3; j++)
+        for (int k = 0; k < 2; k++) begin
+          dat_in[(i*6+j*2+k)*384 +: $bits(FE_TYPE)] = random_vector(384/8) % P;
+          f[i][j][k] = dat_in[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
+        end
+
+    // Reference result from the package function
+    f_exp = fe12_fmap(f, pow);
+  
+    // Drive the input stream and collect the output stream concurrently
+    fork
+      fmap_fe12_i_if.put_stream(dat_in, 12*384/8, pow);
+      fmap_fe12_o_if.get_stream(get_dat, get_len, 0);
+    join
+  
+    // Unpack the received data using the same slot layout as the input
+    for (int i = 0; i < 2; i++)
+      for (int j = 0; j < 3; j++)
+        for (int k = 0; k < 2; k++)
+          f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
+  
+    if (f_exp != f_out) begin
+      $display("Input  was:");
+      print_fe12(f);  
+      $display("Output  was:");
+      print_fe12(f_out);
+      $display("Output Expected:");
+      print_fe12(f_exp);
+      $fatal(1, "%m %t ERROR: output was wrong", $time);
+    end
+    $display("test OK with pow=%d", pow);
+  end
+  $display("test PASSED");
+
+endtask
+
+
+
+// Put the stimulus interfaces into a known state, wait 10ns (past the reset pulse
+// generated above), run the test and then end the simulation
+initial begin
+  fmap_fe12_i_if.reset_source();
+  fmap_fe12_o_if.rdy = 0;
+  #10ns;
+
+  test();
+
+  #50ns $finish();
+end
+
+endmodule