First cut of pairing engine for bls12-381

2019-07-24 21:37:37 +08:00 · 2019-07-24 21:37:37 +08:00 · 2044d52db1
parent 73037d3612
commit 2044d52db1
5 changed files with 1380 additions and 12 deletions
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
@ -0,0 +1,295 @@
+/*
+  This is the top level for the bls12-381 pairing engine.
+  It performs both the miller loop and final exponentiation required for ate pairing (G2 x G1).
+  Inputs are points in G1 and G2 (affine coordinates)
+  Output is a Fp12 element.
+
+  TODO: Replace multiplications in fe12 with sparse versions.
+  TODO: Implement squaring functions.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_pairing
+  import bls12_381_pkg::*;
+#(
+  parameter type FE_TYPE = fe_t,
+  parameter type FE2_TYPE = fe2_t,
+  parameter type FE12_TYPE = fe12_t,
+  parameter type G1_FP_AF_TYPE = af_point_t,
+  parameter type G2_FP_AF_TYPE = fp2_af_point_t,
+  parameter type G2_FP_JB_TYPE = fp2_jb_point_t,
+  parameter CTL_BITS = 32,
+  parameter OVR_WRT_BIT = 8 // We override 16 bits from here
+)(
+  input i_clk, i_rst,
+  // Inputs
+  input               i_val,
+  output logic        o_rdy,
+  input G1_FP_AF_TYPE i_g1_af,
+  input G2_FP_AF_TYPE i_g2_af,
+  // Outputs
+  output logic     o_val,
+  input            i_rdy,
+  output FE12_TYPE o_fe12,
+  // Interface to FE_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if,
+  // Interface to FE2_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if,
+  // Interface to FE2_TYPE adder (mod P)
+  if_axi_stream.source o_add_fe2_if,
+  if_axi_stream.sink   i_add_fe2_if,
+  // Interface to FE2_TYPE subtractor (mod P)
+  if_axi_stream.source o_sub_fe2_if,
+  if_axi_stream.sink   i_sub_fe2_if,
+  // Interface to FE12_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe12_if,
+  if_axi_stream.sink   i_mul_fe12_if
+);
+
+// Internal AXI-stream interface arrays. Index 0 is wired to the miller doubling block,
+// index 1 to the miller addition block and index 2 to the shared side of resource_share.
+// *_i_if carry operand pairs (two elements wide), *_o_if carry single-element results.
+// All of them are clocked from the module clock i_clk.
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_i_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe_o_if [2:0] (i_clk);
+
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_i_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe2_o_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) add_fe2_i_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BITS))   add_fe2_o_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe2_i_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BITS))   sub_fe2_o_if [2:0] (i_clk);
+
+
+// Handshake and error signals of the doubling (dbl_*) and addition (add_*) step blocks
+logic dbl_i_val, dbl_o_rdy, dbl_o_val, dbl_i_rdy, dbl_o_err;
+logic add_i_val, add_o_rdy, add_o_val, add_i_rdy, add_o_err;
+
+logic wait_dbl, wait_add;
+
+// Registered copies of the input points and the running G2 point fed to both step blocks
+G1_FP_AF_TYPE g1_af_i;
+G2_FP_JB_TYPE g2_r_jb_i, add_g2_o, dbl_g2_o;
+G2_FP_AF_TYPE g2_af_i;
+FE12_TYPE add_f12_o, dbl_f12_o;
+// Bit index into ATE_X; sized with +1 so the start value itself always fits
+logic [$clog2(ATE_X_START+1)-1:0] ate_loop_cnt;
+
+enum {IDLE, MILLER_LOOP, FINAL_EXP} pair_state;
+
+
+// Top-level pairing state machine.
+// IDLE latches the input points and seeds the running G2 point with Z = 1.
+// MILLER_LOOP and FINAL_EXP are still placeholders in this first cut.
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    // o_val and g2_af_i are only ever written in this block, so they need a
+    // defined reset value as well
+    o_val <= 0;
+    o_fe12 <= 0;
+    g1_af_i <= 0;
+    g2_af_i <= 0;
+    g2_r_jb_i <= 0;
+    i_mul_fe12_if.rdy <= 0;
+    o_mul_fe12_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    pair_state <= IDLE;
+    add_i_val <= 0;
+    dbl_i_val <= 0;
+    add_i_rdy <= 0;
+    dbl_i_rdy <= 0;
+    o_rdy <= 0;
+    wait_dbl <= 0;
+    wait_add <= 0;
+    ate_loop_cnt <= ATE_X_START;
+  end else begin
+
+    // Drop each valid flag once the corresponding handshake has completed
+    if (i_rdy && o_val) o_val <= 0;
+    if (add_i_val && add_o_rdy) add_i_val <= 0;
+    if (dbl_i_val && dbl_o_rdy) dbl_i_val <= 0;
+    if (o_mul_fe12_if.val && o_mul_fe12_if.rdy) o_mul_fe12_if.val <= 0;
+
+    i_mul_fe12_if.rdy <= 1;
+
+    case(pair_state)
+      IDLE: begin
+        ate_loop_cnt <= ATE_X_START;
+        o_fe12 <= 0;
+        o_rdy <= 1;
+        add_i_val <= 0;
+        dbl_i_val <= 0;
+        add_i_rdy <= 0;
+        dbl_i_rdy <= 0;
+        wait_dbl <= 0;
+        wait_add <= 0;
+        if (i_val && o_rdy) begin
+          pair_state <= MILLER_LOOP;
+          o_rdy <= 0;
+
+          g1_af_i <= i_g1_af;
+          g2_af_i <= i_g2_af;
+
+          // Running point R starts as the affine G2 input lifted to jacobian (Z = 1)
+          g2_r_jb_i.x <= i_g2_af.x;
+          g2_r_jb_i.y <= i_g2_af.y;
+          g2_r_jb_i.z <= 1;
+        end
+      end
+      MILLER_LOOP: begin
+        // TODO: incomplete - wait_dbl is never set yet, so the doubling request is re-raised
+        if (~wait_dbl) begin
+          dbl_i_val <= 1;
+
+        end
+
+        if (ATE_X[ate_loop_cnt] == 1) begin
+          // Do add step in here as well
+
+        end
+
+        // Also three multiplications
+
+
+        add_i_rdy <= 0;
+        dbl_i_rdy <= 0;
+
+
+      end
+      FINAL_EXP: begin
+        // TODO: final exponentiation not implemented yet
+
+      end
+    endcase
+
+  end
+end
+
+// Miller loop doubling step. Uses index 0 of every shared arithmetic interface array:
+// requests leave on *_i_if[0], results come back on *_o_if[0].
+bls12_381_pairing_miller_dbl #(
+  .FE_TYPE       ( FE_TYPE       ),
+  .FE2_TYPE      ( FE2_TYPE      ),
+  .FE12_TYPE     ( FE12_TYPE     ),
+  .G1_FP_AF_TYPE ( G1_FP_AF_TYPE ),
+  .G2_FP_JB_TYPE ( G2_FP_JB_TYPE ),
+  .OVR_WRT_BIT   ( OVR_WRT_BIT   )
+)
+bls12_381_pairing_miller_dbl (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_val         ( dbl_i_val ),
+  .o_rdy         ( dbl_o_rdy ),
+  .i_g1_af       ( g1_af_i   ),
+  .i_g2_jb       ( g2_r_jb_i ),
+  .o_val         ( dbl_o_val ),
+  .i_rdy         ( dbl_i_rdy ),
+  .o_err         ( dbl_o_err ),
+  .o_res_fe12    ( dbl_f12_o ),
+  .o_g2_jb       ( dbl_g2_o ),
+  .o_mul_fe2_if ( mul_fe2_i_if[0] ),
+  .i_mul_fe2_if ( mul_fe2_o_if[0] ),
+  .o_add_fe2_if ( add_fe2_i_if[0] ),
+  .i_add_fe2_if ( add_fe2_o_if[0] ),
+  .o_sub_fe2_if ( sub_fe2_i_if[0] ),
+  // Result (sink) ports must come from the *_o_if arrays, not loop back the request
+  .i_sub_fe2_if ( sub_fe2_o_if[0] ),
+  .o_mul_fe_if ( mul_fe_i_if[0] ),
+  .i_mul_fe_if ( mul_fe_o_if[0] )
+);
+
+// Miller loop addition step. Uses index 1 of every shared arithmetic interface array:
+// requests leave on *_i_if[1], results come back on *_o_if[1].
+bls12_381_pairing_miller_add #(
+  .FE_TYPE       ( FE_TYPE       ),
+  .FE2_TYPE      ( FE2_TYPE      ),
+  .FE12_TYPE     ( FE12_TYPE     ),
+  .G1_FP_AF_TYPE ( G1_FP_AF_TYPE ),
+  .G2_FP_JB_TYPE ( G2_FP_JB_TYPE ),
+  .G2_FP_AF_TYPE ( G2_FP_AF_TYPE ),
+  .OVR_WRT_BIT   ( OVR_WRT_BIT   )
+)
+bls12_381_pairing_miller_add (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_val         ( add_i_val ),
+  .o_rdy         ( add_o_rdy ),
+  .i_g1_af       ( g1_af_i   ),
+  .i_g2_jb       ( g2_r_jb_i ),
+  .i_g2_q_af     ( g2_af_i   ),
+  .o_val         ( add_o_val ),
+  .i_rdy         ( add_i_rdy ),
+  .o_err         ( add_o_err ),
+  .o_res_fe12    ( add_f12_o ),
+  .o_g2_jb       ( add_g2_o ),
+  .o_mul_fe2_if ( mul_fe2_i_if[1] ),
+  .i_mul_fe2_if ( mul_fe2_o_if[1] ),
+  .o_add_fe2_if ( add_fe2_i_if[1] ),
+  .i_add_fe2_if ( add_fe2_o_if[1] ),
+  .o_sub_fe2_if ( sub_fe2_i_if[1] ),
+  // Result (sink) ports must come from the *_o_if arrays, not loop back the request
+  .i_sub_fe2_if ( sub_fe2_o_if[1] ),
+  .o_mul_fe_if ( mul_fe_i_if[1] ),
+  .i_mul_fe_if ( mul_fe_o_if[1] )
+);
+
+// FE_TYPE multiplier sharing: request streams [1:0] (dbl = 0, add = 1) on the client
+// side, stream [2] on the shared side. The control field handed to resource_share sits
+// 8 bits above OVR_WRT_BIT, clear of the 6-bit equation tag the miller blocks write.
+// NOTE(review): the arbitration itself lives in resource_share, which is not shown here.
+resource_share # (
+  .NUM_IN       ( 2                ),
+  .PIPELINE_IN  ( 0                ),
+  .PIPELINE_OUT ( 0                ),
+  .CTL_BITS     ( CTL_BITS         ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 8  ),
+  .DAT_BITS     ( 2*$bits(FE_TYPE) )
+)
+resource_share_fe_mul (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  // Client side
+  .i_axi ( mul_fe_i_if[1:0] ),
+  .o_axi ( mul_fe_o_if[1:0] ),
+  // Shared side
+  .o_res ( mul_fe_i_if[2]   ),
+  .i_res ( mul_fe_o_if[2]   )
+);
+
+// FE2_TYPE multiplier sharing: request streams [1:0] (dbl = 0, add = 1) on the client
+// side, stream [2] on the shared side.
+// NOTE(review): the arbitration itself lives in resource_share, which is not shown here.
+resource_share # (
+  .NUM_IN       ( 2                 ),
+  .PIPELINE_IN  ( 0                 ),
+  .PIPELINE_OUT ( 0                 ),
+  .CTL_BITS     ( CTL_BITS          ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 8   ),
+  .DAT_BITS     ( 2*$bits(FE2_TYPE) )
+)
+resource_share_fe2_mul (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  // Client side
+  .i_axi ( mul_fe2_i_if[1:0] ),
+  .o_axi ( mul_fe2_o_if[1:0] ),
+  // Shared side
+  .o_res ( mul_fe2_i_if[2]   ),
+  .i_res ( mul_fe2_o_if[2]   )
+);
+
+// FE2_TYPE adder sharing: request streams [1:0] (dbl = 0, add = 1) on the client
+// side, stream [2] on the shared side.
+// NOTE(review): the arbitration itself lives in resource_share, which is not shown here.
+resource_share # (
+  .NUM_IN       ( 2                 ),
+  .PIPELINE_IN  ( 0                 ),
+  .PIPELINE_OUT ( 0                 ),
+  .CTL_BITS     ( CTL_BITS          ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 8   ),
+  .DAT_BITS     ( 2*$bits(FE2_TYPE) )
+)
+resource_share_fe2_add (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  // Client side
+  .i_axi ( add_fe2_i_if[1:0] ),
+  .o_axi ( add_fe2_o_if[1:0] ),
+  // Shared side
+  .o_res ( add_fe2_i_if[2]   ),
+  .i_res ( add_fe2_o_if[2]   )
+);
+
+// FE2_TYPE subtractor sharing: request streams [1:0] (dbl = 0, add = 1) on the client
+// side, stream [2] on the shared side.
+// NOTE(review): the arbitration itself lives in resource_share, which is not shown here.
+resource_share # (
+  .NUM_IN       ( 2                 ),
+  .PIPELINE_IN  ( 0                 ),
+  .PIPELINE_OUT ( 0                 ),
+  .CTL_BITS     ( CTL_BITS          ),
+  .OVR_WRT_BIT  ( OVR_WRT_BIT + 8   ),
+  .DAT_BITS     ( 2*$bits(FE2_TYPE) )
+)
+resource_share_fe2_sub (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  // Client side
+  .i_axi ( sub_fe2_i_if[1:0] ),
+  .o_axi ( sub_fe2_o_if[1:0] ),
+  // Shared side
+  .o_res ( sub_fe2_i_if[2]   ),
+  .i_res ( sub_fe2_o_if[2]   )
+);
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_miller_add.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_miller_add.sv
@ -0,0 +1,382 @@
+/*
+  This performs the line evaluation and add required for the miller loop
+  in the ate pairing.
+
+  Inputs are points in G1 (Fp affine), G2 (Fp2 jacobian), G2_Q (Fp2 affine)
+  The output is a sparse Fe12.
+
+  Equations are mapped to bls12_381_pkg::miller_add_step()
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_pairing_miller_add
+#(
+  parameter type FE_TYPE,
+  parameter type FE2_TYPE,
+  parameter type FE12_TYPE,
+  parameter type G1_FP_AF_TYPE,
+  parameter type G2_FP_JB_TYPE,
+  parameter type G2_FP_AF_TYPE,
+  parameter OVR_WRT_BIT = 8 // Require 6 bits from this for control
+)(
+  input i_clk, i_rst,
+  // Inputs
+  input               i_val,
+  output logic        o_rdy,
+  input G1_FP_AF_TYPE i_g1_af,
+  input G2_FP_JB_TYPE i_g2_jb,
+  input G2_FP_AF_TYPE i_g2_q_af,
+  // Result is sparse Fe12 and added G2 point
+  output logic         o_val,
+  input                i_rdy,
+  output logic         o_err,
+  output FE12_TYPE     o_res_fe12,
+  output G2_FP_JB_TYPE o_g2_jb,
+  // Interface to FE2_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if,
+  // Interface to FE2_TYPE adder (mod P)
+  if_axi_stream.source o_add_fe2_if,
+  if_axi_stream.sink   i_add_fe2_if,
+  // Interface to FE2_TYPE subtractor (mod P)
+  if_axi_stream.source o_sub_fe2_if,
+  if_axi_stream.sink   i_sub_fe2_if,
+  // Interface to FE_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if
+);
+
+// Width of the equation tag written into / read from ctl at OVR_WRT_BIT
+localparam NUM_OVR_WRT_BIT = 6;
+
+// One bit per equation (0..42): eq_wait[n] is set when equation n has been issued,
+// eq_val[n] is set when its result has been received
+logic [42:0] eq_val, eq_wait;
+FE2_TYPE zsquared, ysquared;
+// Intermediate Fp2 values, written by the result-capture case statements
+FE2_TYPE [10:0] t;
+// Latches that o_rdy has already been raised for the current operation
+logic o_rdy_l;
+
+// Output assembly. The sparse Fe12 result is built from t[10], t[1] and t[9].
+// o_val is only raised once every value presented on the outputs is final:
+//   eq 39..42 - Fp multiplications that finish t[10] and t[1]
+//   eq 35     - last write to t[9]
+//   eq 30     - last write to o_g2_jb.y (its inputs depend on the final o_g2_jb.x)
+//   eq 36     - depends on the final o_g2_jb.z
+always_comb begin
+  o_res_fe12 = {$bits(FE2_TYPE)'(0), t[10], $bits(FE2_TYPE)'(0), $bits(FE2_TYPE)'(0), t[1], t[9]};
+  o_val = eq_val[39] && eq_val[40] && eq_val[41] && eq_val[42] &&
+          eq_val[36] && eq_val[35] && eq_val[30];
+end
+
+// Scheduler for the miller addition step.
+// Every arithmetic operation has an equation index n (0..42) that is sent in the ctl
+// field of the request. eq_wait[n] marks it as issued, eq_val[n] marks its result as
+// received. An operation is issued once the eq_val bits of its operands are set (or,
+// where it overwrites a register, once the eq_wait bits of that register's readers are set).
+// The inputs i_g1_af / i_g2_jb / i_g2_q_af are read throughout, so they must be held
+// until o_rdy is raised (after the final Fp multiplications have been issued).
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_mul_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_add_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_sub_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_mul_fe_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_g2_jb <= 0;
+    t <= 0;
+    zsquared <= 0;
+    ysquared <= 0;
+
+    i_mul_fe2_if.rdy <= 0;
+    i_add_fe2_if.rdy <= 0;
+    i_sub_fe2_if.rdy <= 0;
+    i_mul_fe_if.rdy <= 0;
+
+    eq_val <= 0;
+    eq_wait <= 0;
+    o_rdy <= 0;
+    o_rdy_l <= 0;
+    o_err <= 0;
+  end else begin
+
+    i_mul_fe2_if.rdy <= 1;
+    i_add_fe2_if.rdy <= 1;
+    i_sub_fe2_if.rdy <= 1;
+    i_mul_fe_if.rdy <= 1;
+
+    // Requests are cleared once accepted (a task call further down may set val again)
+    if (o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0;
+    if (o_add_fe2_if.rdy) o_add_fe2_if.val <= 0;
+    if (o_sub_fe2_if.rdy) o_sub_fe2_if.val <= 0;
+    if (o_mul_fe_if.rdy) o_mul_fe_if.val <= 0;
+    if (i_val && o_rdy) o_rdy <= 0;
+
+    // Result handed over - clear all state for the next operation
+    if (o_val && i_rdy) begin
+      eq_val <= 0;
+      eq_wait <= 0;
+      t <= 0;
+      zsquared <= 0;
+      ysquared <= 0;
+      o_rdy_l <= 0;
+    end
+
+    // Inputs are no longer needed once the last four Fp multiplications have been issued
+    if (eq_wait[39] && eq_wait[40] && eq_wait[41] && eq_wait[42] && ~o_rdy_l) begin
+       o_rdy <= 1;
+       o_rdy_l <= 1;
+    end
+
+    // Check any results from multiplier
+    if (i_mul_fe2_if.val && i_mul_fe2_if.rdy) begin
+      eq_val[i_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        0: zsquared <= i_mul_fe2_if.dat;
+        1: ysquared <= i_mul_fe2_if.dat;
+        2: t[0] <= i_mul_fe2_if.dat;
+        4: t[1] <= i_mul_fe2_if.dat;
+        7: t[1] <= i_mul_fe2_if.dat;
+        9: t[3] <= i_mul_fe2_if.dat;
+        10: t[4] <= i_mul_fe2_if.dat;
+        11: t[5] <= i_mul_fe2_if.dat;
+        14: t[9] <= i_mul_fe2_if.dat;
+        15: t[7] <= i_mul_fe2_if.dat;
+        16: o_g2_jb.x <= i_mul_fe2_if.dat;
+        21: o_g2_jb.z <= i_mul_fe2_if.dat;
+        24: zsquared <= i_mul_fe2_if.dat;
+        27: t[8] <= i_mul_fe2_if.dat;
+        28: t[0] <= i_mul_fe2_if.dat;
+        31: t[10] <= i_mul_fe2_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Check any results from sub
+    if (i_sub_fe2_if.val && i_sub_fe2_if.rdy) begin
+      eq_val[i_sub_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_sub_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        5: t[1] <= i_sub_fe2_if.dat;
+        6: t[1] <= i_sub_fe2_if.dat;
+        8: t[2] <= i_sub_fe2_if.dat;
+        12: t[6] <= i_sub_fe2_if.dat;
+        13: t[6] <= i_sub_fe2_if.dat;
+        17: o_g2_jb.x <= i_sub_fe2_if.dat;
+        18: o_g2_jb.x <= i_sub_fe2_if.dat;
+        19: o_g2_jb.x <= i_sub_fe2_if.dat;
+        22: o_g2_jb.z <= i_sub_fe2_if.dat;
+        23: o_g2_jb.z <= i_sub_fe2_if.dat;
+        26: t[8] <= i_sub_fe2_if.dat;
+        30: o_g2_jb.y <= i_sub_fe2_if.dat;
+        32: t[10] <= i_sub_fe2_if.dat;
+        33: t[10] <= i_sub_fe2_if.dat;
+        35: t[9] <= i_sub_fe2_if.dat;
+        37: t[6] <= i_sub_fe2_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Check any results from add
+    if (i_add_fe2_if.val && i_add_fe2_if.rdy) begin
+      eq_val[i_add_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_add_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        3: t[1] <= i_add_fe2_if.dat;
+        20: o_g2_jb.z <= i_add_fe2_if.dat;
+        25: t[10] <= i_add_fe2_if.dat;
+        29: t[0] <= i_add_fe2_if.dat;
+        34: t[9] <= i_add_fe2_if.dat;
+        36: t[10] <= i_add_fe2_if.dat;
+        38: t[1] <= i_add_fe2_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Check any results from fe multiplier
+    if (i_mul_fe_if.val && i_mul_fe_if.rdy) begin
+      eq_val[i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        39: t[10][0] <= i_mul_fe_if.dat;
+        40: t[10][1] <= i_mul_fe_if.dat;
+        41: t[1][0] <= i_mul_fe_if.dat;
+        42: t[1][1] <= i_mul_fe_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Issue new multiplies
+    if (~eq_wait[0] && i_val) begin
+      fe2_multiply(0, i_g2_jb.z, i_g2_jb.z);
+    end else
+    if (~eq_wait[1] && i_val) begin
+      fe2_multiply(1, i_g2_q_af.y, i_g2_q_af.y);
+    end else
+    if (~eq_wait[2] && eq_val[0]) begin
+      fe2_multiply(2, zsquared, i_g2_q_af.x);
+    end else
+    if (~eq_wait[4] && eq_val[3]) begin
+      fe2_multiply(4, t[1], t[1]);
+    end else
+    if (~eq_wait[7] && eq_val[6]) begin
+      fe2_multiply(7, t[1], zsquared);
+    end else
+    if (~eq_wait[9] && eq_val[8]) begin
+      fe2_multiply(9, t[2], t[2]);
+    end else
+    if (~eq_wait[10] && eq_val[9]) begin
+      // t4 = 4 * t3 (t3 doubled twice); the constant is the multiplier, not the equation index
+      fe2_multiply(10, t[3], 4);
+    end else
+    if (~eq_wait[11] && eq_val[8] && eq_val[10]) begin
+      fe2_multiply(11, t[2], t[4]);
+    end else
+    if (~eq_wait[14] && eq_val[13]) begin
+      fe2_multiply(14, t[6], i_g2_q_af.x);
+    end else
+    if (~eq_wait[15] && eq_val[10]) begin
+      fe2_multiply(15, t[4], i_g2_jb.x);
+    end else
+    if (~eq_wait[16] && eq_val[13]) begin
+      fe2_multiply(16, t[6], t[6]);
+    end else
+    if (~eq_wait[21] && eq_val[20]) begin
+      fe2_multiply(21, o_g2_jb.z, o_g2_jb.z);
+    end else
+    if (~eq_wait[24] && eq_val[23] && eq_wait[7]) begin
+      // The result of eq 24 overwrites zsquared, so eq 7 (its last reader that is not
+      // already implied by eq 23) must have been issued first
+      fe2_multiply(24, o_g2_jb.z, o_g2_jb.z);
+    end else
+    if (~eq_wait[27] && eq_val[26] && eq_val[13]) begin
+      fe2_multiply(27, t[8], t[6]);
+    end else
+    if (~eq_wait[28] && eq_val[11]) begin
+      fe2_multiply(28, i_g2_jb.y, t[5]);
+    end else
+    if (~eq_wait[31] && eq_val[25]) begin
+      // t[10] is first produced by eq 25, so the squaring must wait for it
+      fe2_multiply(31, t[10], t[10]);
+    end
+
+    // Issue new adds
+    if (~eq_wait[3] && i_val) begin
+      fe2_addition(3, i_g2_jb.z, i_g2_q_af.y);
+    end else
+    if (~eq_wait[20] && eq_val[8]) begin
+      fe2_addition(20, i_g2_jb.z, t[2]);
+    end else
+    if (~eq_wait[25] && eq_val[23]) begin
+      fe2_addition(25, o_g2_jb.z, i_g2_q_af.y);
+    end else
+    if (~eq_wait[29] && eq_val[28]) begin
+      fe2_addition(29, t[0], t[0]);
+    end else
+    if (~eq_wait[34] && eq_val[14]) begin
+      fe2_addition(34, t[9], t[9]);
+    end else
+    if (~eq_wait[36] && eq_val[23] && eq_wait[35]) begin
+      // Overwrites t[10], so eq 35 (which reads t[10]) has to be issued first
+      fe2_addition(36, o_g2_jb.z, o_g2_jb.z);
+    end else
+    if (~eq_wait[38] && eq_val[37]) begin
+      fe2_addition(38, t[6], t[6]);
+    end
+
+    // Issue new sub
+    if (~eq_wait[5] && eq_val[4] && eq_val[1]) begin
+      fe2_subtraction(5, t[1], ysquared);
+    end else
+    if (~eq_wait[6] && eq_val[5] && eq_val[0]) begin
+      fe2_subtraction(6, t[1], zsquared);
+    end else
+    if (~eq_wait[8] && eq_val[2] && i_val) begin
+      fe2_subtraction(8, t[0], i_g2_jb.x);
+    end else
+    if (~eq_wait[12] && eq_val[7]) begin
+      // t6 = t1 - r.y needs the final t1, which is produced by eq 7 (not eq 3)
+      fe2_subtraction(12, t[1], i_g2_jb.y);
+    end else
+    if (~eq_wait[13] && eq_val[12]) begin
+      fe2_subtraction(13, t[6], i_g2_jb.y);
+    end else
+    if (~eq_wait[17] && eq_val[11] && eq_val[16]) begin
+      fe2_subtraction(17, o_g2_jb.x, t[5]);
+    end else
+    if (~eq_wait[18] && eq_val[17] && eq_val[15]) begin
+      // t[7] is produced by eq 15
+      fe2_subtraction(18, o_g2_jb.x, t[7]);
+    end else
+    if (~eq_wait[19] && eq_val[18] && eq_val[15]) begin
+      fe2_subtraction(19, o_g2_jb.x, t[7]);
+    end else
+    if (~eq_wait[22] && eq_val[21] && eq_val[0]) begin
+      fe2_subtraction(22, o_g2_jb.z, zsquared);
+    end else
+    if (~eq_wait[23] && eq_val[22] && eq_val[9]) begin
+      fe2_subtraction(23, o_g2_jb.z, t[3]);
+    end else
+    if (~eq_wait[26] && eq_val[19] && eq_val[15]) begin
+      fe2_subtraction(26, t[7], o_g2_jb.x);
+    end else
+    if (~eq_wait[30] && eq_val[29] && eq_val[27]) begin
+      fe2_subtraction(30, t[8], t[0]);
+    end else
+    if (~eq_wait[32] && eq_val[31] && eq_val[1]) begin
+      fe2_subtraction(32, t[10], ysquared);
+    end else
+    if (~eq_wait[33] && eq_val[32] && eq_val[24]) begin
+      // zsquared holds the square of the new z once eq 24 has returned
+      fe2_subtraction(33, t[10], zsquared);
+    end else
+    if (~eq_wait[35] && eq_val[34] && eq_val[33]) begin
+      fe2_subtraction(35, t[9], t[10]);
+    end else
+    if (~eq_wait[37] && eq_wait[27]) begin
+      // Negates t[6] in place, so eq 27 (which reads t[6]) has to be issued first
+      fe2_subtraction(37, 0, t[6]);
+    end
+
+    // Issue final fe multiplications
+    if (~eq_wait[39] && eq_val[36]) begin
+      fe_multiply(39, t[10][0], i_g1_af.y);
+    end else
+    if (~eq_wait[40] && eq_val[36]) begin
+      fe_multiply(40, t[10][1], i_g1_af.y);
+    end else
+    if (~eq_wait[41] && eq_val[38]) begin
+      fe_multiply(41, t[1][0], i_g1_af.x);
+    end else
+    if (~eq_wait[42] && eq_val[38]) begin
+      fe_multiply(42, t[1][1], i_g1_af.x);
+    end
+
+  end
+end
+
+// Queue an Fp2 subtraction request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe2_subtraction(input int unsigned ctl, input FE2_TYPE a, b);
+  if (~o_sub_fe2_if.val || (o_sub_fe2_if.val && o_sub_fe2_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_sub_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_sub_fe2_if.dat[0 +: 2*$bits(FE2_TYPE)] <= {b, a};
+    o_sub_fe2_if.val <= 1;
+  end
+endtask
+
+// Queue an Fp2 addition request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe2_addition(input int unsigned ctl, input FE2_TYPE a, b);
+  if (~o_add_fe2_if.val || (o_add_fe2_if.val && o_add_fe2_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_add_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_add_fe2_if.dat[0 +: 2*$bits(FE2_TYPE)] <= {b, a};
+    o_add_fe2_if.val <= 1;
+  end
+endtask
+
+// Queue an Fp2 multiplication request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe2_multiply(input int unsigned ctl, input FE2_TYPE a, b);
+  if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mul_fe2_if.dat[0 +: 2*$bits(FE2_TYPE)] <= {b, a};
+    o_mul_fe2_if.val <= 1;
+  end
+endtask
+
+// Queue an Fp multiplication request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe_multiply(input int unsigned ctl, input FE_TYPE a, b);
+  if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mul_fe_if.dat[0 +: 2*$bits(FE_TYPE)] <= {b, a};
+    o_mul_fe_if.val <= 1;
+  end
+endtask
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_miller_dbl.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_miller_dbl.sv
@ -0,0 +1,355 @@
+/*
+  This performs the line evaluation and doubling required for the miller loop
+  in the ate pairing.
+
+  Inputs are points in G1 (Fp affine), G2 (Fp2 jacobian)
+  The output is a sparse Fe12.
+
+  Equations are mapped to bls12_381_pkg::miller_double_step()
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+module bls12_381_pairing_miller_dbl
+#(
+  parameter type FE_TYPE,
+  parameter type FE2_TYPE,
+  parameter type FE12_TYPE,
+  parameter type G1_FP_AF_TYPE,
+  parameter type G2_FP_JB_TYPE,
+  parameter OVR_WRT_BIT = 8 // Require 6 bits from this for control
+)(
+  input i_clk, i_rst,
+  // Inputs
+  input               i_val,
+  output logic        o_rdy,
+  input G1_FP_AF_TYPE i_g1_af,
+  input G2_FP_JB_TYPE i_g2_jb,
+  // Result is sparse Fe12 and doubled G2 point
+  output logic         o_val,
+  input                i_rdy,
+  output logic         o_err,
+  output FE12_TYPE     o_res_fe12,
+  output G2_FP_JB_TYPE o_g2_jb,
+  // Interface to FE2_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe2_if,
+  if_axi_stream.sink   i_mul_fe2_if,
+  // Interface to FE2_TYPE adder (mod P)
+  if_axi_stream.source o_add_fe2_if,
+  if_axi_stream.sink   i_add_fe2_if,
+  // Interface to FE2_TYPE subtractor (mod P)
+  if_axi_stream.source o_sub_fe2_if,
+  if_axi_stream.sink   i_sub_fe2_if,
+  // Interface to FE_TYPE multiplier (mod P)
+  if_axi_stream.source o_mul_fe_if,
+  if_axi_stream.sink   i_mul_fe_if
+);
+
+// Width of the equation tag written into / read from ctl at OVR_WRT_BIT
+localparam NUM_OVR_WRT_BIT = 6;
+
+// One bit per equation (0..36): eq_wait[n] is set when equation n has been issued,
+// eq_val[n] is set when its result has been received
+logic [36:0] eq_val, eq_wait;
+FE2_TYPE zsquared;
+// Intermediate Fp2 values, written by the result-capture case statements
+FE2_TYPE [6:0] t;
+// Latches that o_rdy has already been raised for the current operation
+logic o_rdy_l;
+
+// Output assembly. The sparse Fe12 result is built from t[0], t[3] and t[6].
+// o_val is only raised once every value presented on the outputs is final:
+//   eq 33..36 - Fp multiplications that finish t[0] and t[3]
+//   eq 30     - last write to t[6]
+//   eq 18     - last write to o_g2_jb.z
+//   eq 22     - last write to o_g2_jb.y (its inputs depend on the final o_g2_jb.x)
+always_comb begin
+  o_res_fe12 = {$bits(FE2_TYPE)'(0), t[0], $bits(FE2_TYPE)'(0), $bits(FE2_TYPE)'(0), t[3], t[6]};
+  o_val = eq_val[33] && eq_val[34] && eq_val[35] && eq_val[36] &&
+          eq_val[30] && eq_val[22] && eq_val[18];
+end
+
+// Scheduler for the miller doubling step.
+// Every arithmetic operation has an equation index n (0..36) that is sent in the ctl
+// field of the request. eq_wait[n] marks it as issued, eq_val[n] marks its result as
+// received. An operation is issued once the eq_val bits of its operands are set (or,
+// where it overwrites a register, once the eq_wait bits of that register's readers are set).
+// Register use: t[0] = x^2, t[1] = y^2, t[2] = y^4, t[3] = 2*((x+y^2)^2 - x^2 - y^4),
+// t[4] = 3*x^2, t[5] = (3*x^2)^2, t[6] = x + 3*x^2; several are reused later on.
+// The inputs i_g1_af / i_g2_jb are read throughout, so they must be held until o_rdy
+// is raised (after the final Fp multiplications have been issued).
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    o_mul_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_add_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_sub_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_mul_fe_if.copy_if(0, 0, 1, 1, 0, 0, 0);
+    o_g2_jb <= 0;
+    o_err <= 0;
+
+    t <= 0;
+    zsquared <= 0;
+
+    i_mul_fe2_if.rdy <= 0;
+    i_add_fe2_if.rdy <= 0;
+    i_sub_fe2_if.rdy <= 0;
+    i_mul_fe_if.rdy <= 0;
+
+    eq_val <= 0;
+    eq_wait <= 0;
+    o_rdy <= 0;
+    o_rdy_l <= 0;
+  end else begin
+
+    i_mul_fe2_if.rdy <= 1;
+    i_add_fe2_if.rdy <= 1;
+    i_sub_fe2_if.rdy <= 1;
+    i_mul_fe_if.rdy <= 1;
+
+    // Requests are cleared once accepted (a task call further down may set val again)
+    if (o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0;
+    if (o_add_fe2_if.rdy) o_add_fe2_if.val <= 0;
+    if (o_sub_fe2_if.rdy) o_sub_fe2_if.val <= 0;
+    if (o_mul_fe_if.rdy) o_mul_fe_if.val <= 0;
+    if (i_val && o_rdy) o_rdy <= 0;
+
+    // Result handed over - clear all state for the next operation
+    if (o_val && i_rdy) begin
+      eq_val <= 0;
+      eq_wait <= 0;
+      t <= 0;
+      zsquared <= 0;
+      o_rdy_l <= 0;
+    end
+
+    // Inputs are no longer needed once all four final Fp multiplications have been issued
+    if (eq_wait[33] && eq_wait[34] && eq_wait[35] && eq_wait[36] && ~o_rdy_l) begin
+       o_rdy <= 1;
+       o_rdy_l <= 1;
+    end
+
+    // Check any results from multiplier
+    if (i_mul_fe2_if.val && i_mul_fe2_if.rdy) begin
+      eq_val[i_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        0: zsquared <= i_mul_fe2_if.dat;
+        1: t[0] <= i_mul_fe2_if.dat;
+        4: t[1] <= i_mul_fe2_if.dat;
+        5: t[2] <= i_mul_fe2_if.dat;
+        7: t[3] <= i_mul_fe2_if.dat;
+        12: t[5] <= i_mul_fe2_if.dat;
+        16: o_g2_jb.z <= i_mul_fe2_if.dat;
+        20: o_g2_jb.y <= i_mul_fe2_if.dat;
+        21: t[2] <= i_mul_fe2_if.dat;
+        23: t[3] <= i_mul_fe2_if.dat;
+        26: t[6] <= i_mul_fe2_if.dat;
+        29: t[1] <= i_mul_fe2_if.dat;
+        31: t[0] <= i_mul_fe2_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Check any results from sub
+    if (i_sub_fe2_if.val && i_sub_fe2_if.rdy) begin
+      eq_val[i_sub_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_sub_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        8:  t[3] <= i_sub_fe2_if.dat;
+        9: t[3] <= i_sub_fe2_if.dat;
+        13: o_g2_jb.x <= i_sub_fe2_if.dat;
+        14: o_g2_jb.x <= i_sub_fe2_if.dat;
+        17: o_g2_jb.z <= i_sub_fe2_if.dat;
+        18: o_g2_jb.z <= i_sub_fe2_if.dat;
+        19: o_g2_jb.y <= i_sub_fe2_if.dat;
+        22: o_g2_jb.y <= i_sub_fe2_if.dat;
+        25: t[3] <= i_sub_fe2_if.dat;
+        27: t[6] <= i_sub_fe2_if.dat;
+        28: t[6] <= i_sub_fe2_if.dat;
+        30: t[6] <= i_sub_fe2_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Check any results from add
+    if (i_add_fe2_if.val && i_add_fe2_if.rdy) begin
+      eq_val[i_add_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_add_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        // 3*x^2 lives in t[4]; it must survive eq 12, whose result goes to t[5]
+        2: t[4] <= i_add_fe2_if.dat;
+        3: t[4] <= i_add_fe2_if.dat;
+        6: t[3] <= i_add_fe2_if.dat;
+        10: t[3] <= i_add_fe2_if.dat;
+        11: t[6] <= i_add_fe2_if.dat;
+        15: o_g2_jb.z <= i_add_fe2_if.dat;
+        24: t[3] <= i_add_fe2_if.dat;
+        32: t[0] <= i_add_fe2_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Check any results from fe multiplier
+    if (i_mul_fe_if.val && i_mul_fe_if.rdy) begin
+      eq_val[i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]] <= 1;
+      case(i_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT]) inside
+        33: t[0][0] <= i_mul_fe_if.dat;
+        34: t[0][1] <= i_mul_fe_if.dat;
+        35: t[3][0] <= i_mul_fe_if.dat;
+        36: t[3][1] <= i_mul_fe_if.dat;
+        default: o_err <= 1;
+      endcase
+    end
+
+    // Issue new multiplies
+    if (~eq_wait[0] && i_val) begin
+      fe2_multiply(0, i_g2_jb.z, i_g2_jb.z);
+    end else
+    if (~eq_wait[1] && i_val) begin
+      fe2_multiply(1, i_g2_jb.x, i_g2_jb.x);
+    end else
+    if (~eq_wait[4] && i_val) begin
+      fe2_multiply(4, i_g2_jb.y, i_g2_jb.y);
+    end else
+    if (~eq_wait[5] && eq_val[4]) begin
+      fe2_multiply(5, t[1], t[1]);
+    end else
+    if (~eq_wait[7] && eq_val[6]) begin
+      fe2_multiply(7, t[3], t[3]);
+    end else
+    if (~eq_wait[12] && eq_val[3]) begin
+      // t5 = (3*x^2)^2
+      fe2_multiply(12, t[4], t[4]);
+    end else
+    if (~eq_wait[16] && eq_val[15]) begin
+      fe2_multiply(16, o_g2_jb.z, o_g2_jb.z);
+    end else
+    if (~eq_wait[20] && eq_val[19] && eq_val[3]) begin
+      // y3 = (t3 - x3) * 3*x^2 - the multiplicand is t[4], not its square
+      fe2_multiply(20, o_g2_jb.y, t[4]);
+    end else
+    if (~eq_wait[21] && eq_wait[9]) begin
+      // Overwrites t[2] with 8*t[2], so eq 9 (which reads t[2]) has to be issued first
+      fe2_multiply(21, 8, t[2]);
+    end else
+    if (~eq_wait[23] && eq_val[0] && eq_val[3] && eq_wait[19]) begin
+      // Overwrites t[3]; its readers (eq 13, 14 and finally 19) have to be issued first
+      fe2_multiply(23, t[4], zsquared);
+    end else
+    if (~eq_wait[26] && eq_val[11]) begin
+      fe2_multiply(26, t[6], t[6]);
+    end else
+    if (~eq_wait[29] && eq_wait[17] && eq_val[4] && eq_wait[5] && eq_wait[6]) begin
+      // Overwrites t[1] with 4*t[1] once all of its readers have been issued
+      fe2_multiply(29, 4, t[1]);
+    end else
+    if (~eq_wait[31] && eq_val[0] && eq_val[18] && eq_wait[8] && eq_wait[27]) begin
+      // Needs the final z3 (eq 18) and overwrites t[0], so the readers of t[0]
+      // (eq 8 and eq 27; eq 2/3 are implied by eq 27) have to be issued first
+      fe2_multiply(31, o_g2_jb.z, zsquared);
+    end
+
+    // Issue new adds
+    if (~eq_wait[2] && eq_val[1]) begin
+      fe2_addition(2, t[0], t[0]);
+    end else
+    if (~eq_wait[3] && eq_val[2]) begin
+      fe2_addition(3, t[4], t[0]);
+    end else
+    if (~eq_wait[6] && eq_val[4]) begin
+      fe2_addition(6, i_g2_jb.x, t[1]);
+    end else
+    if (~eq_wait[10] && eq_val[9]) begin
+      fe2_addition(10, t[3], t[3]);
+    end else
+    if (~eq_wait[11] && eq_val[3]) begin
+      fe2_addition(11, i_g2_jb.x, t[4]);
+    end else
+    if (~eq_wait[15] && i_val) begin
+      fe2_addition(15, i_g2_jb.z, i_g2_jb.y);
+    end else
+    if (~eq_wait[24] && eq_val[23]) begin
+      fe2_addition(24, t[3], t[3]);
+    end else
+    if (~eq_wait[32] && eq_val[31]) begin
+      fe2_addition(32, t[0], t[0]);
+    end
+
+    // Issue new sub
+    if (~eq_wait[8] && eq_val[7] && eq_val[1]) begin
+      fe2_subtraction(8, t[3], t[0]);
+    end else
+    if (~eq_wait[9] && eq_val[8] && eq_val[5]) begin
+      fe2_subtraction(9, t[3], t[2]);
+    end else
+    if (~eq_wait[13] && eq_val[12] && eq_val[10]) begin
+      fe2_subtraction(13, t[5], t[3]);
+    end else
+    if (~eq_wait[14] && eq_val[13]) begin
+      fe2_subtraction(14, o_g2_jb.x, t[3]);
+    end else
+    if (~eq_wait[17] && eq_val[16] && eq_val[4]) begin
+      // z3 = (z + y)^2 - y^2 - z^2: the subtraction works on the squared value (eq 16)
+      fe2_subtraction(17, o_g2_jb.z, t[1]);
+    end else
+    if (~eq_wait[18] && eq_val[17] && eq_val[0]) begin
+      fe2_subtraction(18, o_g2_jb.z, zsquared);
+    end else
+    if (~eq_wait[19] && eq_val[14] && eq_val[10]) begin
+      fe2_subtraction(19, t[3], o_g2_jb.x);
+    end else
+    if (~eq_wait[22] && eq_val[20] && eq_val[21]) begin
+      fe2_subtraction(22, o_g2_jb.y, t[2]);
+    end else
+    if (~eq_wait[25] && eq_val[24]) begin
+      fe2_subtraction(25, 0, t[3]);
+    end else
+    if (~eq_wait[27] && eq_val[26] && eq_val[1]) begin
+      fe2_subtraction(27, t[6], t[0]);
+    end else
+    if (~eq_wait[28] && eq_val[27] && eq_val[12]) begin
+      fe2_subtraction(28, t[6], t[5]);
+    end else
+    if (~eq_wait[30] && eq_val[29] && eq_val[28]) begin
+      fe2_subtraction(30, t[6], t[1]);
+    end
+
+    // Issue final fe multiplications
+    // t[0] is only final after the doubling in eq 32
+    if (~eq_wait[33] && eq_val[32]) begin
+      fe_multiply(33, t[0][0], i_g1_af.y);
+    end else
+    if (~eq_wait[34] && eq_val[32]) begin
+      fe_multiply(34, t[0][1], i_g1_af.y);
+    end else
+    if (~eq_wait[35] && eq_val[25]) begin
+      fe_multiply(35, t[3][0], i_g1_af.x);
+    end else
+    if (~eq_wait[36] && eq_val[25]) begin
+      fe_multiply(36, t[3][1], i_g1_af.x);
+    end
+
+  end
+end
+
+// Queue an Fp2 subtraction request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe2_subtraction(input int unsigned ctl, input FE2_TYPE a, b);
+  if (~o_sub_fe2_if.val || (o_sub_fe2_if.val && o_sub_fe2_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_sub_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_sub_fe2_if.dat[0 +: 2*$bits(FE2_TYPE)] <= {b, a};
+    o_sub_fe2_if.val <= 1;
+  end
+endtask
+
+// Queue an Fp2 addition request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe2_addition(input int unsigned ctl, input FE2_TYPE a, b);
+  if (~o_add_fe2_if.val || (o_add_fe2_if.val && o_add_fe2_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_add_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_add_fe2_if.dat[0 +: 2*$bits(FE2_TYPE)] <= {b, a};
+    o_add_fe2_if.val <= 1;
+  end
+endtask
+
+// Queue an Fp2 multiplication request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe2_multiply(input int unsigned ctl, input FE2_TYPE a, b);
+  if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_mul_fe2_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mul_fe2_if.dat[0 +: 2*$bits(FE2_TYPE)] <= {b, a};
+    o_mul_fe2_if.val <= 1;
+  end
+endtask
+
+// Queue an Fp multiplication request tagged with equation index ctl.
+// Operand a goes in the low half of dat and b in the high half; nothing happens if
+// the previous request is still waiting to be accepted.
+task fe_multiply(input int unsigned ctl, input FE_TYPE a, b);
+  if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin
+    eq_wait[ctl] <= 1;
+    o_mul_fe_if.ctl[OVR_WRT_BIT +: NUM_OVR_WRT_BIT] <= ctl;
+    o_mul_fe_if.dat[0 +: 2*$bits(FE_TYPE)] <= {b, a};
+    o_mul_fe_if.val <= 1;
+  end
+endtask
+
+endmodule
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@ -27,7 +27,8 @@ package bls12_381_pkg;
  fe_t Gx = 381'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
  fe_t Gy = 381'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;

-  logic [63:0] ATE_X = 64'hd201000000010000;
+  // 64-bit constant that miller_loop scans one bit at a time, from the top bit down
+  localparam [63:0] ATE_X = 64'hd201000000010000;
+  // Index of the first ATE_X bit inspected by miller_loop (the most significant bit)
+  localparam ATE_X_START = 63;

  typedef enum logic [2:0] {
    SCALAR = 0,
@ -61,27 +62,43 @@ package bls12_381_pkg;
    fe_t x;
  } jb_point_t;

+  // Affine points: an (x, y) pair of Fp elements, with no z coordinate.
+  // Packed struct, so y occupies the upper bits and x the lower bits.
+  typedef struct packed {
+    fe_t y;
+    fe_t x;
+  } af_point_t;
+
  typedef fe_t  [1:0] fe2_t;
  typedef fe2_t [2:0] fe6_t;
  typedef fe6_t [1:0] fe12_t;

-  fe2_t G2x = {381'd3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758,
-               381'd352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160};
+  // x coordinate (Fp^2) of the point used as Q in the pairing testbench;
+  // presumably the G2 generator - confirm. Written as {element [1], element [0]}.
+  fe2_t G2x = {381'h13e02b6052719f607dacd3a088274f65596bd0d09920b61ab5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e,
+               381'h024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8};

-  fe2_t G2y = {381'd927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582,
-               381'd1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905};
+  // y coordinate (Fp^2) of the point used as Q in the pairing testbench;
+  // presumably the G2 generator - confirm. Written as {element [1], element [0]}.
+  fe2_t G2y = {381'h606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be,
+               381'hce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a76d429a695160d12c923ac9cc3baca289e193548608b82801};

-  fe2_t FE2_one =  {381'd0, 381'd1};
+  // Zero and one constants for each level of the extension tower.
+  // Each "one" sets only the lowest Fp coefficient to 1; every other
+  // coefficient is 0. The higher levels are built from the lower ones.
+  fe2_t FE2_one = {381'd0, 381'd1};
+  fe2_t FE2_zero = {381'd0, 381'd0};
+  fe6_t FE6_one = {FE2_zero, FE2_zero, FE2_one};
+  fe6_t FE6_zero = {FE2_zero, FE2_zero, FE2_zero};
+  fe12_t FE12_one = {FE6_zero, FE6_one};
+  fe12_t FE12_zero = {FE6_zero, FE6_zero};

  jb_point_t g_point = '{x:Gx, y:Gy, z:381'd1};

-  // Jacobian coordinates for Fp^2 elements
+  // Jacobian coordinates for Fp^2, Fp^12 elements
  typedef struct packed {
    fe2_t z;
    fe2_t y;
    fe2_t x;
  } fp2_jb_point_t;

+  // Affine coordinates for Fp^2 elements: an (x, y) pair with no z coordinate.
+  // Packed struct, so y occupies the upper bits and x the lower bits.
+  typedef struct packed {
+    fe2_t y;
+    fe2_t x;
+  } fp2_af_point_t;
+
  typedef struct packed {
    fe12_t z;
    fe12_t y;
@ -477,23 +494,213 @@ package bls12_381_pkg;
     fe12_mul[0] = fe6_add(bb, aa); // 8. fe6_mul[0] = add(add_i0, bb) [0, 1, 7]
   endfunction

-   function fp12_jb_point_t untiwst(fp2_jb_point_t P);

+   // This performs the miller loop
+   //   P - affine Fp point in G1
+   //   Q - affine Fp^2 point in G2 on the twisted curve
+   //   f - Fp^12 element, the result of the miller loop
+   // ATE_X is scanned from bit ATE_X_START downwards. Bits up to and including
+   // the first set bit are skipped; every later bit performs a doubling step
+   // (plus an addition step when the bit is set) and folds the resulting line
+   // value into f.
+  task miller_loop(input af_point_t P, input fp2_af_point_t Q, output fe12_t f);
+    fp2_jb_point_t R;
+    fe12_t lv_d, lv_a, f_sq;
+    logic found_one;
+
+    // This task has static lifetime, so a declaration initializer would only
+    // run once at time zero and a second call would start with found_one
+    // still set. Reset it explicitly on every call instead.
+    found_one = 0;
+    f = FE12_one;
+    R.x = Q.x;
+    R.y = Q.y;
+    R.z = 1; // Integer literal zero-extends to {0, 1}, the same value as FE2_one
+
+    for (int i = ATE_X_START; i >= 0; i--) begin
+
+      // Skip leading zero bits and the most significant set bit itself
+      if (found_one == 0) begin
+        found_one = ATE_X[i];
+        continue;
+      end
+
+      miller_double_step(R, P, lv_d);
+
+      if (ATE_X[i] == 1) begin
+        miller_add_step(R, Q, P, lv_a);
+        lv_d = fe12_mul(lv_d, lv_a);  // Very sparse multiplication
+      end
+
+      f_sq = fe12_mul(f, f);    // Full multiplication
+      f = fe12_mul(f_sq, lv_d); // Sparse multiplication
+
+    end
+
+    // Negate the upper Fp^6 half of the result.
+    // NOTE(review): presumably the Fp^12 conjugation needed for a negative
+    // curve parameter - confirm.
+    f[1] = fe6_sub(0, f[1]);
+
+  endtask
+
+   // This performs both the line evaluation and the doubling
+   // Returns a sparse fe12 element
+   //   R - Jacobian Fp^2 point, passed by reference and overwritten with the doubled point
+   //   P - affine Fp point; P.y scales both halves of t0 and P.x both halves of t3 (steps 33-36)
+   //   f - output line value {{0, t0, 0}, {0, t3, t6}}; the other Fp^2 coefficients are FE2_zero
+   // The leading number in each trailing comment is the equation index. The
+   // bracketed list appears to name the steps that must finish first (the RTL
+   // gates equation 35 on eq_val[25]) - confirm before relying on it.
+   // Statement order matters: temporaries and R are reused and overwritten in place.
+  task automatic miller_double_step(ref fp2_jb_point_t R, input af_point_t P, ref fe12_t f);
+    fe2_t t0, t1, t2, t3, t4, t5, t6, zsquared;
+
+     zsquared = fe2_mul(R.z, R.z); // 0.  [R.val]
+     t0 = fe2_mul(R.x, R.x); // 1. [R.val]
+     t4 = fe2_add(t0, t0); // 2. [1]
+     t4 = fe2_add(t4, t0); // 3. [2]
+
+     t1 = fe2_mul(R.y, R.y); // 4. [R.val]
+     t2 = fe2_mul(t1, t1); // 5. [4]
+     t3 = fe2_add(R.x, t1); // 6. [4]
+     t3 = fe2_mul(t3, t3); // 7. [6]
+     t3 = fe2_sub(t3, t0); // 8. [7, 1]
+
+     t3 = fe2_sub(t3, t2); // 9. [8, 5]
+
+     t3 = fe2_add(t3, t3); // 10. [9]
+
+     t6 = fe2_add(R.x, t4); // 11. [3]
+
+     t5 = fe2_mul(t4, t4); // 12. [3]
+
+     R.x = fe2_sub(t5, t3); // 13. [12, 10]
+     R.x = fe2_sub(R.x, t3); // 14. [13]
+
+     R.z = fe2_add(R.z, R.y); // 15. [R.val ]
+     R.z = fe2_mul(R.z, R.z); // 16. [15]
+     R.z = fe2_sub(R.z, t1); // 17. [15, 4]
+     R.z = fe2_sub(R.z, zsquared); // 18. [17, 0]
+
+     R.y = fe2_sub(t3, R.x); // 19. [14, 10]
+     R.y = fe2_mul(R.y, t4); // 20. [19, 2],
+
+     // The integer literal 8 zero-extends to the fe2 value {0, 8}
+     t2 = fe2_mul(t2, 8); // 21. [9 wait]
+
+     R.y = fe2_sub(R.y, t2); // 22. [20, 21]
+
+     t3 = fe2_mul(t4, zsquared); // 23. [0, 2, wait 14]
+     t3 = fe2_add(t3, t3); // 24. [23]
+     t3 = fe2_sub(0, t3); // 25. [24]
+
+     t6 = fe2_mul(t6, t6); // 26. [11]
+     t6 = fe2_sub(t6, t0); // 27. [26, 1]
+     t6 = fe2_sub(t6, t5); // 28. [27, 12]
+
+     // The integer literal 4 zero-extends to the fe2 value {0, 4}
+     t1 = fe2_mul(4, t1); // 29. [wait 17, 4, wait 5, wait 6]
+
+     t6 = fe2_sub(t6, t1); // 30. [29, 28]
+
+     t0 = fe2_mul(R.z, zsquared); // 31. [0]
+     t0 = fe2_add(t0, t0); // 32. [31]
+
+     t0[0]  = fe_mul(t0[0], P.y); // 33. [P val, 31]
+     t0[1]  = fe_mul(t0[1], P.y); // 34. [P val, 31]
+     t3[0]  = fe_mul(t3[0], P.x); // 35. [P val, 25]
+     t3[1]  = fe_mul(t3[1], P.x); // 36. [P val, 25]
+
+     f = {{FE2_zero, t0, FE2_zero}, {FE2_zero, t3, t6}}; // [33, 34, 35, 36, 30]
+   endtask
+
+   // This performs both the line evaluation and the addition
+   //   R - Jacobian Fp^2 point, passed by reference and overwritten with the result of the addition step
+   //   Q - affine Fp^2 point combined with R
+   //   P - affine Fp point; P.y scales both halves of t10 and P.x both halves of t1 (steps 39-42)
+   //   f - output line value {{0, t10, 0}, {0, t1, t9}}; the other Fp^2 coefficients are FE2_zero
+   // The leading number in each trailing comment is the equation index. The
+   // bracketed list appears to name prerequisite steps - confirm before relying on it.
+   // Statement order matters: temporaries, zsquared and R are reused and overwritten in place.
+   task automatic miller_add_step(ref fp2_jb_point_t R, input fp2_af_point_t Q, input af_point_t P, ref fe12_t f);
+     fe2_t zsquared, ysquared, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+
+     zsquared = fe2_mul(R.z, R.z); // 0. [R.val]
+     ysquared = fe2_mul(Q.y, Q.y); // 1. [Q.val]
+
+     t0 = fe2_mul(zsquared, Q.x); // 2. [0]
+
+     t1 = fe2_add(R.z, Q.y); // 3. [R.val]
+     t1 = fe2_mul(t1, t1); // 4. [3]
+     t1 = fe2_sub(t1, ysquared); // 5. [4, 1]
+     t1 = fe2_sub(t1, zsquared); // 6. [5, 0]
+     t1 = fe2_mul(t1, zsquared); // 7. [6]
+
+     t2 = fe2_sub(t0, R.x); // 8. [2, R.val]
+
+     t3 = fe2_mul(t2, t2); // 9. [8]
+
+     // The integer literal 4 zero-extends to the fe2 value {0, 4}
+     t4 = fe2_mul(t3, 4); // 10. [9]
+
+     t5 = fe2_mul(t4, t2); // 11. [10, 8]
+
+     t6 = fe2_sub(t1, R.y); // 12. [3]
+     t6 = fe2_sub(t6, R.y); // 13. [12]
+
+     t9 = fe2_mul(t6, Q.x); // 14. [13]
+
+     t7 = fe2_mul(t4, R.x); // 15. [10]
+
+     R.x = fe2_mul(t6, t6); // 16. [13]
+     R.x = fe2_sub(R.x, t5); // 17. [11, 16]
+     R.x = fe2_sub(R.x, t7); // 18. [17, 10]
+     R.x = fe2_sub(R.x, t7); // 19. [18, 15]
+
+     R.z = fe2_add(R.z, t2); // 20. [8]
+     R.z = fe2_mul(R.z, R.z); // 21. [20]
+     R.z = fe2_sub(R.z, zsquared); // 22. [21, 0]
+     R.z = fe2_sub(R.z, t3); // 23. [22, 9]
+
+     // From here on zsquared holds the square of the updated R.z
+     zsquared = fe2_mul(R.z, R.z);// 24. [23]
+
+     t10 = fe2_add(Q.y, R.z); // 25.[23]
+     t8 = fe2_sub(t7, R.x); // 26. [19, 15]
+     t8 = fe2_mul(t8, t6); // 27. [26, 13]
+
+     t0 = fe2_mul(R.y, t5); // 28. [11]
+     t0 = fe2_add(t0, t0); // 29. [28]
+
+     R.y = fe2_sub(t8, t0); // 30. [29, 27]
+
+     t10 = fe2_mul(t10, t10); // 31. [23]
+     t10 = fe2_sub(t10, ysquared); // 32. [31, 1]
+
+     t10 = fe2_sub(t10, zsquared); // 33. [32, 24]
+
+     t9 = fe2_add(t9, t9); // 34. [14]
+     t9 = fe2_sub(t9, t10); // 35. [34, 33]
+
+     t10 = fe2_add(R.z, R.z); // 36. [wait 35, 23]
+
+     t6 = fe2_sub(0, t6); // 37. [wait 27]
+     t1 = fe2_add(t6, t6); // 38. [37]
+
+     t10[0]  = fe_mul(t10[0], P.y); // 39. [36]
+     t10[1]  = fe_mul(t10[1], P.y); // 40. [36]
+     t1[0]  = fe_mul(t1[0], P.x); // 41. [38]
+     t1[1]  = fe_mul(t1[1], P.x); // 42. [38]
+
+     f = {{FE2_zero, t10, FE2_zero}, {FE2_zero, t1, t9}};
+   endtask
+
+   // Calculates the final exponent used in ate pairing
+   /*task automatic final_exponent(ref fe12_t f);
+     f = fe12_sub(0, f); // TODO can remove this?
+
+
+   endtask*/
+
+   // Multiplies the Fp^12 element f by a sparse Fp^12 element whose only
+   // populated Fp^2 coefficients are c0 (slot [0][0]), c1 (slot [0][1]) and
+   // c4 (slot [1][1]); every other coefficient is FE2_zero.
+   // Karatsuba-style: three Fp^6 products, the cross term being
+   // (f[0] + f[1]) * (low + high operand) minus the two plain products.
+   // Returns the Fp^12 product.
+   function fe12_t f12_sparse_mul_014(fe12_t f, fe2_t c0, c1, c4);
+     fe6_t lo_prod, hi_prod, cross;
+     fe2_t c1_c4;
+
+     lo_prod = fe6_mul(f[0], {FE2_zero, c1, c0});        // TODO implement sparse fp6
+     hi_prod = fe6_mul(f[1], {FE2_zero, c4, FE2_zero});  // TODO implement sparse fp6
+     c1_c4   = fe2_add(c1, c4);
+
+     // Upper half: cross term with both plain products removed
+     cross = fe6_add(f[1], f[0]);
+     cross = fe6_mul(cross, {FE2_zero, c1_c4, c0});
+     cross = fe6_sub(cross, lo_prod);
+     cross = fe6_sub(cross, hi_prod);
+     f12_sparse_mul_014[1] = cross;
+
+     // Lower half: low product plus the high product times the non-residue
+     cross = fe6_mul_by_nonresidue(hi_prod);
+     f12_sparse_mul_014[0] = fe6_add(cross, lo_prod);
   endfunction

-   function jb_point_t to_affine(jb_point_t p);
+   // Converts a Jacobian Fp point to affine coordinates:
+   //   x = p.x * fe_inv(p.z^2),  y = p.y * fe_inv(p.z^3)
+   // NOTE(review): the result for p.z == 0 depends on what fe_inv(0) returns,
+   // which is not visible here - confirm before passing a point at infinity.
+   function af_point_t to_affine(jb_point_t p);
     fe_t z_;
     z_ = fe_mul(p.z, p.z);
-     to_affine.z = 1;
     to_affine.x = fe_mul(p.x, fe_inv(z_));
     z_ = fe_mul(z_, p.z);
     to_affine.y = fe_mul(p.y, fe_inv(z_));
   endfunction

-   function fp2_jb_point_t fp2_to_affine(fp2_jb_point_t p);
+   function fp2_af_point_t fp2_to_affine(fp2_jb_point_t p);
     fe2_t z_;
     z_ = fe2_mul(p.z, p.z);
-     fp2_to_affine.z = FE2_one;
     fp2_to_affine.x = fe2_mul(p.x, fe2_inv(z_));
     z_ = fe2_mul(z_, p.z);
     fp2_to_affine.y = fe2_mul(p.y, fe2_inv(z_));
--- a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
@ -0,0 +1,129 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+
+// Testbench for the bls12-381 pairing engine.
+// Instantiates bls12_381_pairing together with all of its arithmetic stream
+// interfaces, then checks the package-level miller_loop() task against a
+// fixed expected Fp^12 value for the points (Gx, Gy) and (G2x, G2y).
+// NOTE(review): no stimulus is driven into the DUT here and nothing is
+// attached to the far side of the arithmetic interfaces.
+module bls12_381_pairing_tb ();
+
+import common_pkg::*;
+import bls12_381_pkg::*;
+
+parameter type FE_TYPE   = bls12_381_pkg::fe_t;
+parameter type FE2_TYPE  = bls12_381_pkg::fe2_t;
+parameter type FE12_TYPE = bls12_381_pkg::fe12_t;
+parameter P              = bls12_381_pkg::P;
+
+localparam CTL_BITS = 32;
+
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+// rst starts low, rises after 20*CLK_PERIOD and falls again 20*CLK_PERIOD later
+initial begin
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+// clk toggles every CLK_PERIOD, so one full cycle lasts 2*CLK_PERIOD
+initial begin
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;
+end
+
+// DUT input (G1 point in the low bits, G2 point above it) and output (Fp^12) streams
+if_axi_stream #(.DAT_BYTS(($bits(af_point_t) + $bits(fp2_af_point_t)+7)/8), .CTL_BITS(CTL_BITS)) in_if(clk);
+if_axi_stream #(.DAT_BYTS(($bits(fe12_t)+7)/8), .CTL_BITS(CTL_BITS)) out_if(clk);
+
+// Arithmetic request (*_in_if, two operands wide) and result (*_out_if) streams
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_in_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_out_if(clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) add_fe2_in_if (clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) add_fe2_out_if (clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe2_in_if (clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe2_out_if (clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_in_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_out_if(clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE12_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_in_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FE12_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_out_if(clk);
+
+bls12_381_pairing #(
+  .FE_TYPE     ( FE_TYPE   ),
+  .FE2_TYPE    ( FE2_TYPE  ),
+  .FE12_TYPE   ( FE12_TYPE ),
+  .CTL_BITS    ( CTL_BITS  ),
+  .OVR_WRT_BIT ( 0         )
+)
+bls12_381_pairing (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_val ( in_if.val ),
+  .o_rdy ( in_if.rdy ),
+  .i_g1_af ( in_if.dat[0 +: $bits(af_point_t)] ),
+  .i_g2_af ( in_if.dat[$bits(af_point_t) +: $bits(fp2_af_point_t)] ),
+  .o_val  ( out_if.val ),
+  .i_rdy  ( out_if.rdy ),
+  .o_fe12 ( out_if.dat ),
+  .o_mul_fe2_if ( mul_fe2_in_if ),
+  .i_mul_fe2_if ( mul_fe2_out_if ),
+  .o_add_fe2_if ( add_fe2_in_if ),
+  .i_add_fe2_if ( add_fe2_out_if ),
+  .o_sub_fe2_if ( sub_fe2_in_if ),
+  .i_sub_fe2_if ( sub_fe2_out_if ),
+  .o_mul_fe12_if ( mul_fe12_in_if ),
+  .i_mul_fe12_if ( mul_fe12_out_if ),
+  .o_mul_fe_if ( mul_fe_in_if ),
+  .i_mul_fe_if ( mul_fe_out_if )
+);
+
+// The DUT drives only val and dat on the output stream; tie off the framing flags
+always_comb begin
+  out_if.sop = 1;
+  out_if.eop = 1;
+end
+
+initial begin
+  af_point_t P; // Shadows the module parameter P inside this block
+  fp2_af_point_t Q;
+  fe12_t f, f_exp;
+
+  in_if.reset_source();
+  out_if.rdy = 0;
+  #100ns;
+
+  P.x = Gx;
+  P.y = Gy;
+  Q.x = G2x;
+  Q.y = G2y;
+
+  f = FE12_zero;
+  // Expected miller loop output, listed from the most significant Fp coefficient down
+  f_exp =  {381'h1562633d4f2387ff79a0f625a6989072296a946ca6bbfa3fef879defde15ed96d205b2eebb454f48fb76fa8a845bcba7,
+            381'h1868172fbbeb861d69c6c10f315c273d08312812c643dbf60588d0de3d2c4b3e9b21acd402f7ddee53f1c4797646ba96,
+            381'h07508024863ec263bded120e45deb29c1f1303a056b279e116cb5fdb03013db19f81e78fa2b2b409cb2ce8e3ba96f4e6,
+            381'h1431225e128c5e2bfafb9eba23746150907688583f52e07fcde4cc93452b0c2bcd0f0893b48a696c403c6980d0940741,
+            381'h159bfbbdc31bb5cb0082c59e5f744773335ef1fdddb8ed86a1c23f61f18800b647ff7dae335fb9ab5fcf2188cb64d72d,
+            381'h05d928cb508feeb3329e51aa0bec4f33ba865a22da5a4e97eb31b78c0150c0c6134f0f94bd0154b28430ee4c6052e82b,
+            381'h087d1320fe5bad5c2d8e12c49e6aff41a0b80e1497bbe85682e22ed853f256041bdf97ef02bdb5d80a5f9bc31d85f25e,
+            381'h159ef660e2d84185f55c0ccae1dd7f8f71b12c0beb7a431fede9e62794d9154e9a0ce4715f64b032492459076224c99b,
+            381'h0cbc592a19a3f60c9938676b257b9c01ed9d708f9428b29e272a811d13d734485970d9d3f1c097b12bfa3d1678096b1d,
+            381'h0751a051e0beb4a0e2351a7527d813b371e189056307d718a446e4016a3df787568a842f3401768dc03b966bd1db90ac,
+            381'h0e760e96f911ae38a6042da82d7b0e30787864e725e9d5462d224c91c4497104d838d566d894564bc19e09d8af706c3f,
+            381'h05194f5785436c8debf0eb2bab4c6ef3de7dc0633c85769173777b782bf897fa45025fd03e7be941123c4ee19910e62e};
+
+  miller_loop(P, Q, f);
+
+  // Print before checking: $fatal ends the simulation, so printing afterwards
+  // would never show the value that failed to match.
+  print_fe12(f);
+  assert(f == f_exp) else $fatal(1, "Miller loop did not match result");
+
+
+  #1us $finish();
+end
+endmodule