[Cryptech-Commits] [core/math/modexpng] 58/92: Turns out, fabric addition and subtraction in the general worker module are actually in the critical paths of the ModExpNG core and are plaguing the place and route tools. I was barely able to achieve timing closure at 180 MHz even with the highest Map and PaR effort levels. This means that any further clock frequency increase is effectively impossible, moreover any small change in the design may prevent it from meeting timing constraints. The obvious solution is to use DSP slices not only f [...]

git at cryptech.is git at cryptech.is
Sat Mar 14 18:19:37 UTC 2020


This is an automated email from the git hooks/post-receive script.

paul at psgd.org pushed a commit to branch master
in repository core/math/modexpng.

commit 6a0438e33fa300822216c259668180f177ac0343
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Jan 16 15:09:59 2020 +0300

    Turns out, fabric addition and subtraction in the general worker module are
    actually in the critical paths of the ModExpNG core and are plaguing the place
    and route tools. I was barely able to achieve timing closure at 180 MHz even
    with the highest Map and PaR effort levels. This means that any further clock
    frequency increase is effectively impossible, moreover any small change in the
    design may prevent it from meeting timing constraints. The obvious solution is to
    use DSP slices not only for modular multiplication, but also for supporting
    math operations. When fully pipelined, they can be clocked even faster than the
    block memory, so there definitely should not be any timing problems with them.
    The general worker module does three things that currently involve fabric-based
    math operations:
     * carry propagation (conversion to non-redundant representation)
     * modular subtraction
     * regular addition
    This commit adds four DSP slice instances and makes the carry propagation
    opcode use DSP slice products instead of fabric logic.
---
 rtl/modexpng_general_worker.v | 172 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 161 insertions(+), 11 deletions(-)

diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v
index 0620bd6..684af5a 100644
--- a/rtl/modexpng_general_worker.v
+++ b/rtl/modexpng_general_worker.v
@@ -54,7 +54,9 @@ module modexpng_general_worker
     //
     `include "modexpng_parameters.vh"
     `include "modexpng_microcode.vh"
-
+    `include "modexpng_dsp48e1.vh"
+    `include "modexpng_dsp_slice_primitives.vh"
+    
     
     //
     // Ports
@@ -240,8 +242,8 @@ module modexpng_general_worker
     //
     // Delays
     //
-    reg [OP_ADDR_W -1:0] rd_narrow_addr_x_dly[0:3];
-    reg [OP_ADDR_W -1:0] rd_narrow_addr_y_dly[0:3];
+    reg [OP_ADDR_W -1:0] rd_narrow_addr_x_dly[0:4];
+    reg [OP_ADDR_W -1:0] rd_narrow_addr_y_dly[0:4];
 
     reg [OP_ADDR_W -1:0] rd_wide_addr_x_dly[0:3];
     reg [OP_ADDR_W -1:0] rd_wide_addr_y_dly[0:3];
@@ -255,6 +257,11 @@ module modexpng_general_worker
     reg [WORD_EXT_W -1:0] rd_narrow_x_din_y_dly1;
     reg [WORD_EXT_W -1:0] rd_narrow_y_din_y_dly1;
     
+    reg                   rd_narrow_ena_x_dly1 = 1'b0;
+    reg                   rd_narrow_ena_y_dly1 = 1'b0;
+    reg                   rd_narrow_ena_x_dly2 = 1'b0;
+    reg                   rd_narrow_ena_y_dly2 = 1'b0;
+    
     always @(posedge clk) begin
         //
         {rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x};
@@ -267,12 +274,15 @@ module modexpng_general_worker
         {rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y};
         {rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y};
         //
-        {rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0]} <= {rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0], rd_narrow_addr_x};
-        {rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0]} <= {rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0], rd_narrow_addr_y};
+        {rd_narrow_addr_x_dly[4], rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0]} <= {rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0], rd_narrow_addr_x};
+        {rd_narrow_addr_y_dly[4], rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0]} <= {rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0], rd_narrow_addr_y};
         //
         {rd_wide_addr_x_dly[3], rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0]} <= {rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0], rd_wide_addr_x};
         {rd_wide_addr_y_dly[3], rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0]} <= {rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0], rd_wide_addr_y};
         //
+        {rd_narrow_ena_x_dly2, rd_narrow_ena_x_dly1} <= {rd_narrow_ena_x_dly1, rd_narrow_ena_x};
+        {rd_narrow_ena_y_dly2, rd_narrow_ena_y_dly1} <= {rd_narrow_ena_y_dly1, rd_narrow_ena_y};
+        //
     end
     
   
@@ -376,7 +386,14 @@ module modexpng_general_worker
             //
             case (opcode)
                 //
-                UOP_OPCODE_PROPAGATE_CARRIES,
+                UOP_OPCODE_PROPAGATE_CARRIES:
+                    //
+                    case (wrk_fsm_state)
+                        WRK_FSM_STATE_BUSY2,
+                        WRK_FSM_STATE_LATENCY_POST2,
+                        WRK_FSM_STATE_LATENCY_POST4: enable_narrow_wr_en;
+                    endcase
+                //
                 UOP_OPCODE_MODULAR_SUBTRACT_X,
                 UOP_OPCODE_MERGE_LH,
                 UOP_OPCODE_REGULAR_ADD_UNEVEN:
@@ -729,7 +746,14 @@ module modexpng_general_worker
         //
         case (opcode)
             //
-            UOP_OPCODE_PROPAGATE_CARRIES,
+            UOP_OPCODE_PROPAGATE_CARRIES:
+                //
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2,
+                    WRK_FSM_STATE_LATENCY_POST4: update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_addr_x_dly[4], rd_narrow_addr_y_dly[4]);
+                endcase
+            //
             UOP_OPCODE_MODULAR_SUBTRACT_X,
             UOP_OPCODE_MERGE_LH,
             UOP_OPCODE_REGULAR_ADD_UNEVEN:
@@ -773,6 +797,131 @@ module modexpng_general_worker
     end
  
     
+    
+    //
+    // DSP Slice Array
+    //
+    wire [DSP48E1_C_W-1:0] dsp_x_x_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_x_din_x_dly1}; 
+    wire [DSP48E1_C_W-1:0] dsp_y_x_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_y_din_x_dly1};
+    wire [DSP48E1_C_W-1:0] dsp_x_y_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_x_din_y_dly1};
+    wire [DSP48E1_C_W-1:0] dsp_y_y_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_y_din_y_dly1};
+    
+    wire [DSP48E1_C_W-1:0] dsp_x_x_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_x_din_x_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_x_din_x_dly1[WORD_W-1:0]}; 
+    wire [DSP48E1_C_W-1:0] dsp_y_x_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_y_din_x_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_y_din_x_dly1[WORD_W-1:0]};
+    wire [DSP48E1_C_W-1:0] dsp_x_y_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_x_din_y_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_x_din_y_dly1[WORD_W-1:0]};
+    wire [DSP48E1_C_W-1:0] dsp_y_y_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_y_din_y_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_y_din_y_dly1[WORD_W-1:0]};
+
+    wire [DSP48E1_P_W-1:0] dsp_x_x_p;
+    wire [DSP48E1_P_W-1:0] dsp_y_x_p;
+    wire [DSP48E1_P_W-1:0] dsp_x_y_p;
+    wire [DSP48E1_P_W-1:0] dsp_y_y_p;
+    
+    wire [WORD_EXT_W-1:0] dsp_x_x_p_reduced = {CARRY_ZERO, dsp_x_x_p[WORD_W-1:0]}; 
+    wire [WORD_EXT_W-1:0] dsp_y_x_p_reduced = {CARRY_ZERO, dsp_y_x_p[WORD_W-1:0]};
+    wire [WORD_EXT_W-1:0] dsp_x_y_p_reduced = {CARRY_ZERO, dsp_x_y_p[WORD_W-1:0]};
+    wire [WORD_EXT_W-1:0] dsp_y_y_p_reduced = {CARRY_ZERO, dsp_y_y_p[WORD_W-1:0]};
+    
+    reg                        dsp_ce_x = 1'b0;
+    reg                        dsp_ce_y = 1'b0;
+    reg                        dsp_ce_x_dly = 1'b0;
+    reg                        dsp_ce_y_dly = 1'b0;
+    reg [DSP48E1_OPMODE_W-1:0] dsp_opmode_x;
+    reg [DSP48E1_OPMODE_W-1:0] dsp_opmode_y;
+    
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n) {dsp_ce_x, dsp_ce_y} <= {1'b0, 1'b0};
+        else case (opcode)
+            //
+            UOP_OPCODE_PROPAGATE_CARRIES: {dsp_ce_x, dsp_ce_y} <= {rd_narrow_ena_x_dly2, rd_narrow_ena_y_dly2};
+            default:                      {dsp_ce_x, dsp_ce_y} <= {1'b0, 1'b0};
+            //
+        endcase
+    
+    always @(posedge clk) begin
+        //
+        dsp_opmode_x <= {DSP48E1_OPMODE_W{1'bX}};
+        dsp_opmode_y <= {DSP48E1_OPMODE_W{1'bX}};
+        //
+        if (rd_narrow_ena_x_dly2)
+            //
+            case (opcode)
+                //
+                UOP_OPCODE_PROPAGATE_CARRIES: if (rd_narrow_addr_x_dly[1] == OP_ADDR_ZERO) dsp_opmode_x <= DSP48E1_OPMODE_Z0_YC_X0;
+                                              else                                         dsp_opmode_x <= DSP48E1_OPMODE_ZP17_YC_X0;
+                //
+            endcase
+        //
+        if (rd_narrow_ena_y_dly2)
+            //
+            case (opcode)
+                //
+                UOP_OPCODE_PROPAGATE_CARRIES: if (rd_narrow_addr_y_dly[1] == OP_ADDR_ZERO) dsp_opmode_y <= DSP48E1_OPMODE_Z0_YC_X0;
+                                              else                                         dsp_opmode_y <= DSP48E1_OPMODE_ZP17_YC_X0;
+                //
+            endcase
+        //
+    end
+    
+    always @(posedge clk) {dsp_ce_x_dly, dsp_ce_y_dly} <= {dsp_ce_x, dsp_ce_y};
+    
+    `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_x_x
+    (
+        .clk        (clk),
+        .ce_abc     (dsp_ce_x),
+        .ce_p       (dsp_ce_x_dly),
+        .ce_opmode  (dsp_ce_x),
+        .x          (dsp_x_x_x),
+        .y          (dsp_x_x_y),
+        .p          (dsp_x_x_p),
+        .opmode     (dsp_opmode_x),
+        .casc_p_in  (),
+        .casc_p_out ()
+    );
+    
+    `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_y_x
+    (
+        .clk        (clk),
+        .ce_abc     (dsp_ce_x),
+        .ce_p       (dsp_ce_x_dly),
+        .ce_opmode  (dsp_ce_x),
+        .x          (dsp_y_x_x),
+        .y          (dsp_y_x_y),
+        .p          (dsp_y_x_p),
+        .opmode     (dsp_opmode_x),
+        .casc_p_in  (),
+        .casc_p_out ()
+    );
+    
+    `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_x_y
+    (
+        .clk        (clk),
+        .ce_abc     (dsp_ce_y),
+        .ce_p       (dsp_ce_y_dly),
+        .ce_opmode  (dsp_ce_y),
+        .x          (dsp_x_y_x),
+        .y          (dsp_x_y_y),
+        .p          (dsp_x_y_p),
+        .opmode     (dsp_opmode_y),
+        .casc_p_in  (),
+        .casc_p_out ()
+    );
+    
+    `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_y_y
+    (
+        .clk        (clk),
+        .ce_abc     (dsp_ce_y),
+        .ce_p       (dsp_ce_y_dly),
+        .ce_opmode  (dsp_ce_y),
+        .x          (dsp_y_y_x),
+        .y          (dsp_y_y_y),
+        .p          (dsp_y_y_p),
+        .opmode     (dsp_opmode_y),
+        .casc_p_in  (),
+        .casc_p_out ()
+    );
+    
+    
     //
     // UOP_OPCODE_PROPAGATE_CARRIES
     //
@@ -1171,11 +1320,12 @@ module modexpng_general_worker
                 //
                 case (wrk_fsm_state)
                     //
-                    WRK_FSM_STATE_BUSY1,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST3:
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2,
+                    WRK_FSM_STATE_LATENCY_POST4:
                         //
-                        update_narrow_dout(propagate_carries_x_x_w_cry_reduced, propagate_carries_y_x_w_cry_reduced, propagate_carries_x_y_w_cry_reduced, propagate_carries_y_y_w_cry_reduced);
+                        //update_narrow_dout(propagate_carries_x_x_w_cry_reduced, propagate_carries_y_x_w_cry_reduced, propagate_carries_x_y_w_cry_reduced, propagate_carries_y_y_w_cry_reduced);
+                        update_narrow_dout(dsp_x_x_p_reduced, dsp_y_x_p_reduced, dsp_x_y_p_reduced, dsp_y_y_p_reduced);
                     // 
                 endcase
             //



More information about the Commits mailing list