Modify bls12-381 pairing engine .rdy signal so that it is not

combinatorial, to improve timing. Update version to 1.2.2
2019-08-22 16:31:33 +08:00 · 2019-08-22 16:31:33 +08:00 · 3a8c799a74
parent dbed8ccb0d
commit 3a8c799a74
3 changed files with 30 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -1,17 +1,21 @@
 The work in this repo is the result of a Zcash foundation grant to develop open-source FPGA code that can be used to accelerate various aspects of the network.
 **An Architecture document is [here](zcash_fpga_design_doc_v1.1.x.pdf)**.

-While mainly developed for Equihash and the secp256k1 and bls12-381 curves, the code used in this repo can also be applied with minimum modification to other curves.
+While mainly developed for Equihash and the secp256k1 and bls12-381 curves, the code (ip_cores) used in this repo can also be applied to other curves by
+changing parameters and making minimal modifications to the equations.

-** Currently still a work in progress
+# Getting started
+
+The architecture document has instructions for building an AWS image or simulating the top level design. The easiest way to get started is to add all .sv and .xci files to a new Vivado project,
+and then set the simulation top level to the _tb.sv testbench of the module you want to test.

 # Repo folder structure

 Each top level folder is explained below. Inside each folder is source code written in systemverilog, and most blocks have a standalone self-checking testbench.

-## aws
+## AWS

-This contains the top / project files for building on a AWS (Amazon FPGA VU9P w/ 64GB DDR4).
+This contains the top / project files for building on an AWS F1 instance (Amazon FPGA VU9P w/ 64GB DDR4).

 * This contains the zcash_fpga library (aws/cl_zcash/software/runtime/zcash_fpga.hpp) that can be used to interface with the FPGA over PCIe.
 * Instructions on how to build are in the architecture document.
@ -22,7 +26,7 @@ This contains the top / project files for building on the Bittware VVH board (VU

 ## ip_cores

-These contain shared IP cores used by the projects in this repo. These include many functions, such as:
+These contain shared IP cores that are used by the projects in this repo. These include many functions, such as:

 * Hashing
  - Blake2b - single pipe implementation of blake2b and a pipeline-unrolled version for high performance (single clock hash @ 200MHz after initial 52 clock delay).
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
@ -116,9 +116,6 @@ logic point_mul_mode, found_one;
 FE_TYPE key;

 always_comb begin
-  dbl_f12_o_if.rdy = pair_state == POINT_MULT_DBL || (f_val && (~mul_fe12_o_if[0].val || (mul_fe12_o_if[0].val && mul_fe12_o_if[0].rdy)) && ((out_cnt/2 == 0) || (out_cnt/2 == 1) || (out_cnt/2 == 4))); // As this is a sparse f12 using full f12_mul
-  add_f12_o_if.rdy = pair_state == POINT_MULT_ADD || (f_val && (~mul_fe12_o_if[0].val || (mul_fe12_o_if[0].val && mul_fe12_o_if[0].rdy)) && ((out_cnt/2 == 0) || (out_cnt/2 == 1) || (out_cnt/2 == 4))); // As this is a sparse f12 using full f12_mul
-
  final_exp_fe12_o_if.dat = f[0][0][0];
  final_exp_fe12_o_if.err = 0;
  final_exp_fe12_o_if.ctl = 0;
@ -153,6 +150,9 @@ always_ff @ (posedge i_clk) begin
    stage_done <= 0;
    
    o_p_jb_if.reset_source();
+    
+    dbl_f12_o_if.rdy <= 0;
+    add_f12_o_if.rdy <= 0;

  end else begin

@ -166,6 +166,9 @@ always_ff @ (posedge i_clk) begin
      f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
      f_val <= mul_fe12_i_if[0].eop;
    end
+    
+    dbl_f12_o_if.rdy <= 0;
+    add_f12_o_if.rdy <= 0;

    case(pair_state)
      IDLE: begin
@ -229,36 +232,41 @@ always_ff @ (posedge i_clk) begin
            end
          end
          1: begin // Multiply by double result
-            if(~mul_fe12_o_if[0].val || (mul_fe12_o_if[0].val && mul_fe12_o_if[0].rdy)) begin
+            if(~dbl_f12_o_if.rdy && (~mul_fe12_o_if[0].val || (mul_fe12_o_if[0].val && mul_fe12_o_if[0].rdy))) begin
              if ((dbl_f12_o_if.val && f_val) || (out_cnt/2 == 5)) begin
                mul_fe12_o_if[0].sop <= out_cnt == 0;
                mul_fe12_o_if[0].eop <= out_cnt == 11;
-                mul_fe12_o_if[0].val <= 1;
+                mul_fe12_o_if[0].val <= dbl_f12_o_if.val || (out_cnt/2 == 2) || (out_cnt/2 == 3) || (out_cnt/2 == 5);
+                dbl_f12_o_if.rdy <= (out_cnt/2 == 0) || (out_cnt/2 == 1) || (out_cnt/2 == 4);
                case (out_cnt/2) inside
                  0,1,4: mul_fe12_o_if[0].dat <= {dbl_f12_o_if.dat, f[0][0][0]};
                  default: mul_fe12_o_if[0].dat <= {381'd0, f[0][0][0]};
                endcase
+                
                out_cnt <= out_cnt + 1;
                f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
-                mul_fe12_o_if[0].ctl <= miller_mult_cnt;
-                mul_fe12_o_if[0].ctl[SQ_BIT] <= 0;
                if (out_cnt == 11) begin
                  f_val <= 0;
                  out_cnt <= 0;
                  miller_mult_cnt <= ATE_X[ate_loop_cnt] == 0 ? 3 : 2;
-                end
+                end                  
+                 
+                mul_fe12_o_if[0].ctl <= miller_mult_cnt;
+                mul_fe12_o_if[0].ctl[SQ_BIT] <= 0;
+                
              end
            end
          end
          2: begin  // Multiply by add result
-            if(~mul_fe12_o_if[0].val || (mul_fe12_o_if[0].val && mul_fe12_o_if[0].rdy)) begin
+            if(~add_f12_o_if.rdy && (~mul_fe12_o_if[0].val || (mul_fe12_o_if[0].val && mul_fe12_o_if[0].rdy))) begin
              if ((add_f12_o_if.val && f_val) || (out_cnt/2 == 5)) begin
                g2_r_jb_i <= add_g2_o;
                mul_fe12_o_if[0].ctl <= miller_mult_cnt;
                mul_fe12_o_if[0].ctl[SQ_BIT] <= 0;
                mul_fe12_o_if[0].sop <= out_cnt == 0;
                mul_fe12_o_if[0].eop <= out_cnt == 11;
-                mul_fe12_o_if[0].val <= 1;
+                mul_fe12_o_if[0].val <= add_f12_o_if.val || (out_cnt/2 == 2) || (out_cnt/2 == 3) || (out_cnt/2 == 5);
+                add_f12_o_if.rdy <= (out_cnt/2 == 0) || (out_cnt/2 == 1) || (out_cnt/2 == 4);
                out_cnt <= out_cnt + 1;
                case (out_cnt/2) inside
                  0,1,4: mul_fe12_o_if[0].dat <= {add_f12_o_if.dat, f[0][0][0]};
@ -302,6 +310,7 @@ always_ff @ (posedge i_clk) begin
        end
      end
      POINT_MULT_DBL: begin
+        dbl_f12_o_if.rdy <= 1;
        if(found_one == 0) begin
          key <= key << 1;
          ate_loop_cnt <= ate_loop_cnt - 1;
@ -327,6 +336,7 @@ always_ff @ (posedge i_clk) begin
        end
      end
      POINT_MULT_ADD: begin
+        add_f12_o_if.rdy <= 1;
        if (~wait_add) begin
          wait_add <= 1;
          add_i_val <= 1;
--- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
+++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
@ -27,7 +27,7 @@ package zcash_fpga_pkg;

  import bls12_381_pkg::point_type_t;

-  parameter FPGA_VERSION = 32'h01_02_01;  //v1.2.1
+  parameter FPGA_VERSION = 32'h01_02_02;  //v1.2.2

  // What features are enabled in this build
  parameter bit ENB_VERIFY_SECP256K1_SIG = 1;