[Cryptech-Commits] [user/shatov/modexpng] 03/21: Reworked modular subtraction micro-operation. Previously it used "two-pass" bank address space sweep, during the first pass (a-b) and (a-b+n) were computed, during the second pass either the former or the latter quantity was written to the output bank (depending on the very last borrow flag value). This is no longer possible, since the FSM now only generates one "interleaved" address space sweep. The solution is to split one complex modular subtraction operation into simpler sub-operations [...]

git at cryptech.is git at cryptech.is
Mon Jan 20 21:18:04 UTC 2020


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit e5f4454e3ac52fa761f301e7d11ad144cd23d590
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Jan 16 21:38:04 2020 +0300

    Reworked modular subtraction micro-operation. Previously it used a "two-pass"
    bank address space sweep: during the first pass (a-b) and (a-b+n) were
    computed, and during the second pass either the former or the latter quantity
    was written to the output bank (depending on the very last borrow flag value).
    This is no longer possible, since the FSM now only generates one "interleaved"
    address space sweep. The solution is to split one complex modular subtraction
    operation into simpler sub-operations. Currently modular subtraction is
    achieved by running a sequence of three micro-operations:
     * MODULAR_SUBTRACT_X computes (a-b) and latches the final borrow flag
     * MODULAR_SUBTRACT_Y computes (a-b+n)
     * MODULAR_SUBTRACT_Z writes either (a-b) or (a-b+n) into the output bank
                          depending on the latched value of the borrow flag
    Unfortunately we can't compute both (a-b) and (a-b+n) during one address space
    sweep, since a fully pipelined adder/subtractor DSP slice has a 2-cycle latency.
---
 rtl/modexpng_general_worker.v | 569 ++++++++++++++++++++++++------------------
 1 file changed, 332 insertions(+), 237 deletions(-)

diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v
index 684af5a..6652f14 100644
--- a/rtl/modexpng_general_worker.v
+++ b/rtl/modexpng_general_worker.v
@@ -245,8 +245,8 @@ module modexpng_general_worker
     reg [OP_ADDR_W -1:0] rd_narrow_addr_x_dly[0:4];
     reg [OP_ADDR_W -1:0] rd_narrow_addr_y_dly[0:4];
 
-    reg [OP_ADDR_W -1:0] rd_wide_addr_x_dly[0:3];
-    reg [OP_ADDR_W -1:0] rd_wide_addr_y_dly[0:3];
+    reg [OP_ADDR_W -1:0] rd_wide_addr_x_dly[0:4];
+    reg [OP_ADDR_W -1:0] rd_wide_addr_y_dly[0:4];
     
     reg [WORD_EXT_W -1:0] rd_wide_x_din_x_dly1;
     reg [WORD_EXT_W -1:0] rd_wide_y_din_x_dly1;
@@ -277,8 +277,8 @@ module modexpng_general_worker
         {rd_narrow_addr_x_dly[4], rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0]} <= {rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0], rd_narrow_addr_x};
         {rd_narrow_addr_y_dly[4], rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0]} <= {rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0], rd_narrow_addr_y};
         //
-        {rd_wide_addr_x_dly[3], rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0]} <= {rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0], rd_wide_addr_x};
-        {rd_wide_addr_y_dly[3], rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0]} <= {rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0], rd_wide_addr_y};
+        {rd_wide_addr_x_dly[4], rd_wide_addr_x_dly[3], rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0]} <= {rd_wide_addr_x_dly[3], rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0], rd_wide_addr_x};
+        {rd_wide_addr_y_dly[4], rd_wide_addr_y_dly[3], rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0]} <= {rd_wide_addr_y_dly[3], rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0], rd_wide_addr_y};
         //
         {rd_narrow_ena_x_dly2, rd_narrow_ena_x_dly1} <= {rd_narrow_ena_x_dly1, rd_narrow_ena_x};
         {rd_narrow_ena_y_dly2, rd_narrow_ena_y_dly1} <= {rd_narrow_ena_y_dly1, rd_narrow_ena_y};
@@ -386,15 +386,15 @@ module modexpng_general_worker
             //
             case (opcode)
                 //
-                UOP_OPCODE_PROPAGATE_CARRIES:
+                UOP_OPCODE_PROPAGATE_CARRIES,
+                UOP_OPCODE_MODULAR_SUBTRACT_X:
                     //
                     case (wrk_fsm_state)
                         WRK_FSM_STATE_BUSY2,
                         WRK_FSM_STATE_LATENCY_POST2,
                         WRK_FSM_STATE_LATENCY_POST4: enable_narrow_wr_en;
                     endcase
-                //
-                UOP_OPCODE_MODULAR_SUBTRACT_X,
+                //                
                 UOP_OPCODE_MERGE_LH,
                 UOP_OPCODE_REGULAR_ADD_UNEVEN:
                     //
@@ -415,8 +415,15 @@ module modexpng_general_worker
                         WRK_FSM_STATE_LATENCY_POST3: begin enable_wide_wr_en; enable_narrow_wr_en; end
                     endcase
                 //
-                UOP_OPCODE_MODULAR_REDUCE_INIT,
                 UOP_OPCODE_MODULAR_SUBTRACT_Y:
+                    //
+                    case (wrk_fsm_state)
+                        WRK_FSM_STATE_BUSY2,
+                        WRK_FSM_STATE_LATENCY_POST2,
+                        WRK_FSM_STATE_LATENCY_POST4: enable_wide_wr_en;
+                    endcase
+                //
+                UOP_OPCODE_MODULAR_REDUCE_INIT:
                     //
                     case (wrk_fsm_state)
                         WRK_FSM_STATE_BUSY1,
@@ -746,7 +753,8 @@ module modexpng_general_worker
         //
         case (opcode)
             //
-            UOP_OPCODE_PROPAGATE_CARRIES:
+            UOP_OPCODE_PROPAGATE_CARRIES,
+            UOP_OPCODE_MODULAR_SUBTRACT_X:
                 //
                 case (wrk_fsm_state)
                     WRK_FSM_STATE_BUSY2,
@@ -754,7 +762,6 @@ module modexpng_general_worker
                     WRK_FSM_STATE_LATENCY_POST4: update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_addr_x_dly[4], rd_narrow_addr_y_dly[4]);
                 endcase
             //
-            UOP_OPCODE_MODULAR_SUBTRACT_X,
             UOP_OPCODE_MERGE_LH,
             UOP_OPCODE_REGULAR_ADD_UNEVEN:
                 //
@@ -787,29 +794,28 @@ module modexpng_general_worker
             UOP_OPCODE_MODULAR_SUBTRACT_Y:
                 //
                 case (wrk_fsm_state)
-                    WRK_FSM_STATE_BUSY1,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST3: update_wr_wide_bank_addr(sel_wide_out, sel_wide_out, rd_wide_addr_x_dly[3], rd_wide_addr_y_dly[3]);
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2,
+                    WRK_FSM_STATE_LATENCY_POST4: update_wr_wide_bank_addr(sel_wide_out, sel_wide_out, rd_wide_addr_x_dly[4], rd_wide_addr_y_dly[4]);
                 endcase
             //
         endcase
         //
     end
- 
-    
-    
+
+
     //
     // DSP Slice Array
     //
-    wire [DSP48E1_C_W-1:0] dsp_x_x_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_x_din_x_dly1}; 
-    wire [DSP48E1_C_W-1:0] dsp_y_x_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_y_din_x_dly1};
-    wire [DSP48E1_C_W-1:0] dsp_x_y_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_x_din_y_dly1};
-    wire [DSP48E1_C_W-1:0] dsp_y_y_x = 'bX;//{{(DSP48E1_C_W-WORD_EXT_W){1'b0}}, rd_narrow_y_din_y_dly1};
+    reg [DSP48E1_C_W-1:0] dsp_x_x_x; 
+    reg [DSP48E1_C_W-1:0] dsp_y_x_x;
+    reg [DSP48E1_C_W-1:0] dsp_x_y_x;
+    reg [DSP48E1_C_W-1:0] dsp_y_y_x;
     
-    wire [DSP48E1_C_W-1:0] dsp_x_x_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_x_din_x_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_x_din_x_dly1[WORD_W-1:0]}; 
-    wire [DSP48E1_C_W-1:0] dsp_y_x_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_y_din_x_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_y_din_x_dly1[WORD_W-1:0]};
-    wire [DSP48E1_C_W-1:0] dsp_x_y_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_x_din_y_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_x_din_y_dly1[WORD_W-1:0]};
-    wire [DSP48E1_C_W-1:0] dsp_y_y_y = {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, rd_narrow_y_din_y_dly1[WORD_EXT_W-1:WORD_W], 1'b1, rd_narrow_y_din_y_dly1[WORD_W-1:0]};
+    reg [DSP48E1_C_W-1:0] dsp_x_x_y; 
+    reg [DSP48E1_C_W-1:0] dsp_y_x_y;
+    reg [DSP48E1_C_W-1:0] dsp_x_y_y;
+    reg [DSP48E1_C_W-1:0] dsp_y_y_y;
 
     wire [DSP48E1_P_W-1:0] dsp_x_x_p;
     wire [DSP48E1_P_W-1:0] dsp_y_x_p;
@@ -821,213 +827,314 @@ module modexpng_general_worker
     wire [WORD_EXT_W-1:0] dsp_x_y_p_reduced = {CARRY_ZERO, dsp_x_y_p[WORD_W-1:0]};
     wire [WORD_EXT_W-1:0] dsp_y_y_p_reduced = {CARRY_ZERO, dsp_y_y_p[WORD_W-1:0]};
     
-    reg                        dsp_ce_x = 1'b0;
-    reg                        dsp_ce_y = 1'b0;
-    reg                        dsp_ce_x_dly = 1'b0;
-    reg                        dsp_ce_y_dly = 1'b0;
-    reg [DSP48E1_OPMODE_W-1:0] dsp_opmode_x;
-    reg [DSP48E1_OPMODE_W-1:0] dsp_opmode_y;
+    reg                             dsp_ce_x = 1'b0;
+    reg                             dsp_ce_y = 1'b0;
+    reg                             dsp_ce_x_dly = 1'b0;
+    reg                             dsp_ce_y_dly = 1'b0;
+    reg [    DSP48E1_OPMODE_W -1:0] dsp_op_mode_x;
+    reg [    DSP48E1_OPMODE_W -1:0] dsp_op_mode_y;
+    reg [   DSP48E1_ALUMODE_W -1:0] dsp_alu_mode_x;
+    reg [   DSP48E1_ALUMODE_W -1:0] dsp_alu_mode_y;
+    reg [DSP48E1_CARRYINSEL_W -1:0] dsp_carry_in_sel_x;
+    reg [DSP48E1_CARRYINSEL_W -1:0] dsp_carry_in_sel_y;
+    wire                            dsp_carry_out_x;
+    wire                            dsp_carry_out_y;
+    
+
+    //
+    // DSP - CE
+    //
+    always @(posedge clk) {dsp_ce_x_dly, dsp_ce_y_dly} <= {dsp_ce_x, dsp_ce_y};
     
     always @(posedge clk or negedge rst_n)
         //
         if (!rst_n) {dsp_ce_x, dsp_ce_y} <= {1'b0, 1'b0};
         else case (opcode)
             //
-            UOP_OPCODE_PROPAGATE_CARRIES: {dsp_ce_x, dsp_ce_y} <= {rd_narrow_ena_x_dly2, rd_narrow_ena_y_dly2};
-            default:                      {dsp_ce_x, dsp_ce_y} <= {1'b0, 1'b0};
+            UOP_OPCODE_PROPAGATE_CARRIES,
+            UOP_OPCODE_MODULAR_SUBTRACT_X,
+            UOP_OPCODE_MODULAR_SUBTRACT_Y: {dsp_ce_x, dsp_ce_y} <= {rd_narrow_ena_x_dly2, rd_narrow_ena_y_dly2};
+            default:                       {dsp_ce_x, dsp_ce_y} <= {1'b0, 1'b0};
             //
         endcase
     
+    
+    //
+    // DSP - OPMODE, ALUMODE, CARRYINSEL
+    //
     always @(posedge clk) begin
         //
-        dsp_opmode_x <= {DSP48E1_OPMODE_W{1'bX}};
-        dsp_opmode_y <= {DSP48E1_OPMODE_W{1'bX}};
+        dsp_op_mode_x <= DSP48E1_OPMODE_DNC;
+        dsp_op_mode_y <= DSP48E1_OPMODE_DNC;
+        //
+        dsp_alu_mode_x <= DSP48E1_ALUMODE_DNC;
+        dsp_alu_mode_y <= DSP48E1_ALUMODE_DNC;
         //
-        if (rd_narrow_ena_x_dly2)
+        dsp_carry_in_sel_x <= DSP48E1_CARRYINSEL_DNC;
+        dsp_carry_in_sel_y <= DSP48E1_CARRYINSEL_DNC;
+        //
+        case (opcode)
             //
-            case (opcode)
+            UOP_OPCODE_PROPAGATE_CARRIES: begin
                 //
-                UOP_OPCODE_PROPAGATE_CARRIES: if (rd_narrow_addr_x_dly[1] == OP_ADDR_ZERO) dsp_opmode_x <= DSP48E1_OPMODE_Z0_YC_X0;
-                                              else                                         dsp_opmode_x <= DSP48E1_OPMODE_ZP17_YC_X0;
+                if (rd_narrow_ena_x_dly2) begin
+                    if (rd_narrow_addr_x_dly[1] == OP_ADDR_ZERO) dsp_op_mode_x      <= DSP48E1_OPMODE_Z0_YC_X0;
+                    else                                         dsp_op_mode_x      <= DSP48E1_OPMODE_ZP17_YC_X0;
+                                                                 dsp_alu_mode_x     <= DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN;
+                                                                 dsp_carry_in_sel_x <= DSP48E1_CARRYINSEL_CARRYIN;
+                end
                 //
-            endcase
-        //
-        if (rd_narrow_ena_y_dly2)
+                if (rd_narrow_ena_y_dly2) begin
+                    if (rd_narrow_addr_y_dly[1] == OP_ADDR_ZERO) dsp_op_mode_y      <= DSP48E1_OPMODE_Z0_YC_X0;
+                    else                                         dsp_op_mode_y      <= DSP48E1_OPMODE_ZP17_YC_X0;
+                                                                 dsp_alu_mode_y     <= DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN;
+                                                                 dsp_carry_in_sel_y <= DSP48E1_CARRYINSEL_CARRYIN;
+                end
+                //
+            end                
             //
-            case (opcode)
+            UOP_OPCODE_MODULAR_SUBTRACT_X: begin
                 //
-                UOP_OPCODE_PROPAGATE_CARRIES: if (rd_narrow_addr_y_dly[1] == OP_ADDR_ZERO) dsp_opmode_y <= DSP48E1_OPMODE_Z0_YC_X0;
-                                              else                                         dsp_opmode_y <= DSP48E1_OPMODE_ZP17_YC_X0;
+                if (rd_narrow_ena_x_dly2) begin
+                                                                 dsp_op_mode_x      <= DSP48E1_OPMODE_ZC_Y0_XAB;
+                                                                 dsp_alu_mode_x     <= DSP48E1_ALUMODE_Z_MINUS_X_AND_Y_AND_CIN;
+                    if (rd_narrow_addr_x_dly[1] == OP_ADDR_ZERO) dsp_carry_in_sel_x <= DSP48E1_CARRYINSEL_CARRYIN;
+                    else                                         dsp_carry_in_sel_x <= DSP48E1_CARRYINSEL_CARRYCASCOUT;
+                end
                 //
-            endcase
+                if (rd_narrow_ena_y_dly2) begin
+                                                                 dsp_op_mode_y      <= DSP48E1_OPMODE_ZC_Y0_XAB;
+                                                                 dsp_alu_mode_y     <= DSP48E1_ALUMODE_Z_MINUS_X_AND_Y_AND_CIN;
+                    if (rd_narrow_addr_y_dly[1] == OP_ADDR_ZERO) dsp_carry_in_sel_y <= DSP48E1_CARRYINSEL_CARRYIN;
+                    else                                         dsp_carry_in_sel_y <= DSP48E1_CARRYINSEL_CARRYCASCOUT;
+                end
+                //
+            end
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_Y: begin
+                //
+                if (rd_narrow_ena_x_dly2) begin
+                                                                 dsp_op_mode_x      <= DSP48E1_OPMODE_ZC_Y0_XAB;
+                                                                 dsp_alu_mode_x     <= DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN;
+                    if (rd_narrow_addr_x_dly[1] == OP_ADDR_ZERO) dsp_carry_in_sel_x <= DSP48E1_CARRYINSEL_CARRYIN;
+                    else                                         dsp_carry_in_sel_x <= DSP48E1_CARRYINSEL_CARRYCASCOUT;
+                end
+                //
+                if (rd_narrow_ena_y_dly2) begin
+                                                                 dsp_op_mode_y      <= DSP48E1_OPMODE_ZC_Y0_XAB;
+                                                                 dsp_alu_mode_y     <= DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN;
+                    if (rd_narrow_addr_y_dly[1] == OP_ADDR_ZERO) dsp_carry_in_sel_y <= DSP48E1_CARRYINSEL_CARRYIN;
+                    else                                         dsp_carry_in_sel_y <= DSP48E1_CARRYINSEL_CARRYCASCOUT;
+                end
+                //
+            end
+            //
+        endcase
         //
     end
     
-    always @(posedge clk) {dsp_ce_x_dly, dsp_ce_y_dly} <= {dsp_ce_x, dsp_ce_y};
     
+    //
+    // DSP Feed Logic
+    //
+    always @(posedge clk) begin
+        //
+        dsp_x_x_x <= {DSP48E1_C_W{1'bX}};
+        dsp_x_x_y <= {DSP48E1_C_W{1'bX}};
+        dsp_y_x_x <= {DSP48E1_C_W{1'bX}};
+        dsp_y_x_y <= {DSP48E1_C_W{1'bX}};
+        dsp_x_y_x <= {DSP48E1_C_W{1'bX}};
+        dsp_x_y_y <= {DSP48E1_C_W{1'bX}};
+        dsp_y_y_x <= {DSP48E1_C_W{1'bX}};
+        dsp_y_y_y <= {DSP48E1_C_W{1'bX}};
+        //
+        case (opcode)
+            //
+            UOP_OPCODE_PROPAGATE_CARRIES: begin
+                //
+                if (rd_narrow_ena_x_dly2) begin
+                    dsp_x_x_y <= {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, wrk_rd_narrow_x_din_x[WORD_EXT_W-1:WORD_W], 1'b1, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
+                    dsp_y_x_y <= {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, wrk_rd_narrow_y_din_x[WORD_EXT_W-1:WORD_W], 1'b1, wrk_rd_narrow_y_din_x[WORD_W-1:0]};
+                end
+                //
+                if (rd_narrow_ena_y_dly2) begin
+                    dsp_x_y_y <= {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, wrk_rd_narrow_x_din_y[WORD_EXT_W-1:WORD_W], 1'b1, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
+                    dsp_y_y_y <= {{(DSP48E1_C_W-(WORD_EXT_W+1)){1'b0}}, wrk_rd_narrow_y_din_y[WORD_EXT_W-1:WORD_W], 1'b1, wrk_rd_narrow_y_din_y[WORD_W-1:0]};
+                end
+                //
+            end
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_X: begin
+                //
+                if (rd_narrow_ena_x_dly2) begin
+                    dsp_x_x_y <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
+                    dsp_x_x_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_y_din_x[WORD_W-1:0]};
+                    dsp_y_x_y <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
+                    dsp_y_x_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_y_din_x[WORD_W-1:0]};
+                end
+                //
+                if (rd_narrow_ena_y_dly2) begin
+                    dsp_x_y_y <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
+                    dsp_x_y_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_y_din_y[WORD_W-1:0]};
+                    dsp_y_y_y <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
+                    dsp_y_y_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_narrow_y_din_y[WORD_W-1:0]};
+                end
+                //
+            end
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_Y: begin
+                //
+                if (rd_narrow_ena_x_dly2) begin
+                    dsp_x_x_y <= {{(DSP48E1_C_W-WORD_W){1'b1}}, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
+                    dsp_x_x_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_wide_x_din_x[WORD_W-1:0]};
+                    dsp_y_x_y <= {{(DSP48E1_C_W-WORD_W){1'b1}}, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
+                    dsp_y_x_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_wide_y_din_x[WORD_W-1:0]};
+                end
+                //
+                if (rd_narrow_ena_y_dly2) begin
+                    dsp_x_y_y <= {{(DSP48E1_C_W-WORD_W){1'b1}}, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
+                    dsp_x_y_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_wide_x_din_y[WORD_W-1:0]};
+                    dsp_y_y_y <= {{(DSP48E1_C_W-WORD_W){1'b1}}, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
+                    dsp_y_y_x <= {{(DSP48E1_C_W-WORD_W){1'b0}}, wrk_rd_wide_y_din_y[WORD_W-1:0]};
+                end
+                //
+            end
+            //
+        endcase
+        //
+    end
+    
+     
+    //
+    // DSP Slices
+    //   
     `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_x_x
     (
-        .clk        (clk),
-        .ce_abc     (dsp_ce_x),
-        .ce_p       (dsp_ce_x_dly),
-        .ce_opmode  (dsp_ce_x),
-        .x          (dsp_x_x_x),
-        .y          (dsp_x_x_y),
-        .p          (dsp_x_x_p),
-        .opmode     (dsp_opmode_x),
-        .casc_p_in  (),
-        .casc_p_out ()
+        .clk            (clk),
+        .ce_abc         (dsp_ce_x),
+        .ce_p           (dsp_ce_x_dly),
+        .ce_ctrl        (dsp_ce_x),
+        .x              (dsp_x_x_x),
+        .y              (dsp_x_x_y),
+        .p              (dsp_x_x_p),
+        .op_mode        (dsp_op_mode_x),
+        .alu_mode       (dsp_alu_mode_x),
+        .carry_in_sel   (dsp_carry_in_sel_x),
+        .casc_p_in      (),
+        .casc_p_out     (),
+        .carryout       (dsp_carry_out_x)
     );
     
     `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_y_x
     (
-        .clk        (clk),
-        .ce_abc     (dsp_ce_x),
-        .ce_p       (dsp_ce_x_dly),
-        .ce_opmode  (dsp_ce_x),
-        .x          (dsp_y_x_x),
-        .y          (dsp_y_x_y),
-        .p          (dsp_y_x_p),
-        .opmode     (dsp_opmode_x),
-        .casc_p_in  (),
-        .casc_p_out ()
+        .clk            (clk),
+        .ce_abc         (dsp_ce_x),
+        .ce_p           (dsp_ce_x_dly),
+        .ce_ctrl        (dsp_ce_x),
+        .x              (dsp_y_x_x),
+        .y              (dsp_y_x_y),
+        .p              (dsp_y_x_p),
+        .op_mode        (dsp_op_mode_x),
+        .alu_mode       (dsp_alu_mode_x),
+        .carry_in_sel   (dsp_carry_in_sel_x),
+        .casc_p_in      (),
+        .casc_p_out     (),
+        .carryout       ()
     );
     
     `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_x_y
     (
-        .clk        (clk),
-        .ce_abc     (dsp_ce_y),
-        .ce_p       (dsp_ce_y_dly),
-        .ce_opmode  (dsp_ce_y),
-        .x          (dsp_x_y_x),
-        .y          (dsp_x_y_y),
-        .p          (dsp_x_y_p),
-        .opmode     (dsp_opmode_y),
-        .casc_p_in  (),
-        .casc_p_out ()
+        .clk            (clk),
+        .ce_abc         (dsp_ce_y),
+        .ce_p           (dsp_ce_y_dly),
+        .ce_ctrl        (dsp_ce_y),
+        .x              (dsp_x_y_x),
+        .y              (dsp_x_y_y),
+        .p              (dsp_x_y_p),
+        .op_mode        (dsp_op_mode_y),
+        .alu_mode       (dsp_alu_mode_y),        
+        .carry_in_sel   (dsp_carry_in_sel_y),
+        .casc_p_in      (),
+        .casc_p_out     (),
+        .carryout       (dsp_carry_out_y)
     );
     
     `MODEXPNG_DSP_SLICE_ADDSUB dst_inst_y_y
     (
-        .clk        (clk),
-        .ce_abc     (dsp_ce_y),
-        .ce_p       (dsp_ce_y_dly),
-        .ce_opmode  (dsp_ce_y),
-        .x          (dsp_y_y_x),
-        .y          (dsp_y_y_y),
-        .p          (dsp_y_y_p),
-        .opmode     (dsp_opmode_y),
-        .casc_p_in  (),
-        .casc_p_out ()
+        .clk            (clk),
+        .ce_abc         (dsp_ce_y),
+        .ce_p           (dsp_ce_y_dly),
+        .ce_ctrl        (dsp_ce_y),
+        .x              (dsp_y_y_x),
+        .y              (dsp_y_y_y),
+        .p              (dsp_y_y_p),
+        .op_mode        (dsp_op_mode_y),
+        .alu_mode       (dsp_alu_mode_y),
+        .carry_in_sel   (dsp_carry_in_sel_y),
+        .casc_p_in      (),
+        .casc_p_out     (),
+        .carryout       ()
     );
     
     
     //
-    // UOP_OPCODE_PROPAGATE_CARRIES
-    //
-    reg [CARRY_W -1:0] propagate_carries_x_x_cry_r;
-    reg [CARRY_W -1:0] propagate_carries_y_x_cry_r;
-    reg [CARRY_W -1:0] propagate_carries_x_y_cry_r;
-    reg [CARRY_W -1:0] propagate_carries_y_y_cry_r;
-    
-    wire [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry = rd_narrow_x_din_x_dly1 + {{WORD_W{1'b0}}, propagate_carries_x_x_cry_r};
-    wire [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry = rd_narrow_y_din_x_dly1 + {{WORD_W{1'b0}}, propagate_carries_y_x_cry_r};
-    wire [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry = rd_narrow_x_din_y_dly1 + {{WORD_W{1'b0}}, propagate_carries_x_y_cry_r};
-    wire [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry = rd_narrow_y_din_y_dly1 + {{WORD_W{1'b0}}, propagate_carries_y_y_cry_r};
-    
-    reg [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry_r;
-    reg [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry_r;
-    reg [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry_r;
-    reg [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry_r;
-    
-    wire [CARRY_W -1:0] propagate_carries_x_x_w_cry_msb = propagate_carries_x_x_w_cry_r[WORD_EXT_W -1:WORD_W];
-    wire [CARRY_W -1:0] propagate_carries_y_x_w_cry_msb = propagate_carries_y_x_w_cry_r[WORD_EXT_W -1:WORD_W];
-    wire [CARRY_W -1:0] propagate_carries_x_y_w_cry_msb = propagate_carries_x_y_w_cry_r[WORD_EXT_W -1:WORD_W];
-    wire [CARRY_W -1:0] propagate_carries_y_y_w_cry_msb = propagate_carries_y_y_w_cry_r[WORD_EXT_W -1:WORD_W];
-    
-    wire [WORD_W -1:0] propagate_carries_x_x_w_cry_lsb = propagate_carries_x_x_w_cry_r[WORD_W -1:0];
-    wire [WORD_W -1:0] propagate_carries_y_x_w_cry_lsb = propagate_carries_y_x_w_cry_r[WORD_W -1:0];
-    wire [WORD_W -1:0] propagate_carries_x_y_w_cry_lsb = propagate_carries_x_y_w_cry_r[WORD_W -1:0];
-    wire [WORD_W -1:0] propagate_carries_y_y_w_cry_lsb = propagate_carries_y_y_w_cry_r[WORD_W -1:0];
-    
-    wire [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_x_x_w_cry_lsb};
-    wire [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_y_x_w_cry_lsb};
-    wire [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_x_y_w_cry_lsb};
-    wire [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_y_y_w_cry_lsb};
-    
-    task _propagate_carries_update_cry;
-        input [CARRY_W-1:0]     x_x_cry,                     y_x_cry,                     x_y_cry,                     y_y_cry;
-        {   propagate_carries_x_x_cry_r, propagate_carries_y_x_cry_r, propagate_carries_x_y_cry_r, propagate_carries_y_y_cry_r} <=
-        {                       x_x_cry,                     y_x_cry,                     x_y_cry,                     y_y_cry};
-    endtask
-    
-    task propagate_carries_clear_cry; _propagate_carries_update_cry(                     CARRY_ZERO,                      CARRY_ZERO,                      CARRY_ZERO,                      CARRY_ZERO); endtask
-    task propagate_carries_store_cry; _propagate_carries_update_cry(propagate_carries_x_x_w_cry_msb, propagate_carries_y_x_w_cry_msb, propagate_carries_x_y_w_cry_msb, propagate_carries_y_y_w_cry_msb); endtask
-        
-    task _propagate_carries_update_sum_w_cry;
-        input [WORD_EXT_W-1:0] x_x_sum_w_cry,                 y_x_sum_w_cry,                 x_y_sum_w_cry,                 y_y_sum_w_cry;
-        {      propagate_carries_x_x_w_cry_r, propagate_carries_y_x_w_cry_r, propagate_carries_x_y_w_cry_r, propagate_carries_y_y_w_cry_r} <=
-        {                      x_x_sum_w_cry,                 y_x_sum_w_cry,                 x_y_sum_w_cry,                 y_y_sum_w_cry};
-    endtask
-    
-    task propagate_carries_store_sum_w_cry; _propagate_carries_update_sum_w_cry(propagate_carries_x_x_w_cry, propagate_carries_y_x_w_cry, propagate_carries_x_y_w_cry, propagate_carries_y_y_w_cry); endtask
+    // UOP_OPCODE_MODULAR_SUBTRACT_X
+    //    
+    reg modular_subtract_x_brw_flag;
+    reg modular_subtract_y_brw_flag;
 
+    //
+    // IMPORTANT: DSP48E1 turns out to have a very non-obvious feature: when doing _subtraction_,
+    //            the CARRYOUT[3] is _NOT_ equivalent to the borrow flag! See "CARRYOUT/CARRYCASCOUT"
+    //            section of Appendix A on pp. 55-56 of UG479 for more details.
+    //
     always @(posedge clk)
         //
-        if (opcode == UOP_OPCODE_PROPAGATE_CARRIES)
-            //
-            case (wrk_fsm_state)
-                //
-                WRK_FSM_STATE_LATENCY_PRE3:  propagate_carries_clear_cry;
-                WRK_FSM_STATE_BUSY1,
-                WRK_FSM_STATE_LATENCY_POST1: propagate_carries_store_cry;
-                //
-                WRK_FSM_STATE_LATENCY_PRE4,
-                WRK_FSM_STATE_BUSY2,
-                WRK_FSM_STATE_LATENCY_POST2: propagate_carries_store_sum_w_cry;
-                //
+        case (opcode)
+            UOP_OPCODE_MODULAR_SUBTRACT_X:
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_LATENCY_POST4:
+                    //{modular_subtract_x_brw_flag, modular_subtract_y_brw_flag} <= {1'bX, 1'bZ};
+                    {modular_subtract_x_brw_flag, modular_subtract_y_brw_flag} <= {~dsp_carry_out_x, ~dsp_carry_out_y};
+                endcase
             endcase
+
     
+    //reg modular_subtract_x_brw_r;
+    //reg modular_subtract_y_brw_r;
     
-    //
-    // UOP_OPCODE_MODULAR_SUBTRACT_X
-    // UOP_OPCODE_MODULAR_SUBTRACT_Y
-    //
-    reg modular_subtract_x_brw_r;
-    reg modular_subtract_y_brw_r;
-    
-    reg modular_subtract_x_cry_r;
-    reg modular_subtract_y_cry_r;
+    //reg modular_subtract_x_cry_r;
+    //reg modular_subtract_y_cry_r;
 
-    wire [WORD_W:0] modular_subtract_x_w_brw = rd_narrow_x_din_x_dly1[WORD_W:0] - rd_narrow_y_din_x_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_x_brw_r};
-    wire [WORD_W:0] modular_subtract_y_w_brw = rd_narrow_x_din_y_dly1[WORD_W:0] - rd_narrow_y_din_y_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_y_brw_r};
+    //wire [WORD_W:0] modular_subtract_x_w_brw = rd_narrow_x_din_x_dly1[WORD_W:0] - rd_narrow_y_din_x_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_x_brw_r};
+    //wire [WORD_W:0] modular_subtract_y_w_brw = rd_narrow_x_din_y_dly1[WORD_W:0] - rd_narrow_y_din_y_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_y_brw_r};
 
-    wire [WORD_W:0] modular_subtract_x_w_cry = rd_narrow_x_din_x_dly1[WORD_W:0] + rd_wide_x_din_x_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_x_cry_r};
-    wire [WORD_W:0] modular_subtract_y_w_cry = rd_narrow_x_din_y_dly1[WORD_W:0] + rd_wide_x_din_y_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_y_brw_r};
+    //wire [WORD_W:0] modular_subtract_x_w_cry = rd_narrow_x_din_x_dly1[WORD_W:0] + rd_wide_x_din_x_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_x_cry_r};
+    //wire [WORD_W:0] modular_subtract_y_w_cry = rd_narrow_x_din_y_dly1[WORD_W:0] + rd_wide_x_din_y_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_y_cry_r};
 
-    reg [WORD_W:0] modular_subtract_x_w_brw_r;
-    reg [WORD_W:0] modular_subtract_y_w_brw_r;
+    //reg [WORD_W:0] modular_subtract_x_w_brw_r;
+    //reg [WORD_W:0] modular_subtract_y_w_brw_r;
 
-    reg [WORD_W:0] modular_subtract_x_w_cry_r;
-    reg [WORD_W:0] modular_subtract_y_w_cry_r;
+    //reg [WORD_W:0] modular_subtract_x_w_cry_r;
+    //reg [WORD_W:0] modular_subtract_y_w_cry_r;
     
-    wire modular_subtract_x_w_brw_msb = modular_subtract_x_w_brw_r[WORD_W];
-    wire modular_subtract_y_w_brw_msb = modular_subtract_y_w_brw_r[WORD_W];
+    //wire modular_subtract_x_w_brw_msb = modular_subtract_x_w_brw_r[WORD_W];
+    //wire modular_subtract_y_w_brw_msb = modular_subtract_y_w_brw_r[WORD_W];
 
-    wire modular_subtract_x_w_cry_msb = modular_subtract_x_w_cry_r[WORD_W];
-    wire modular_subtract_y_w_cry_msb = modular_subtract_y_w_cry_r[WORD_W];
+    //wire modular_subtract_x_w_cry_msb = modular_subtract_x_w_cry_r[WORD_W];
+    //wire modular_subtract_y_w_cry_msb = modular_subtract_y_w_cry_r[WORD_W];
     
-    wire [WORD_W -1:0] modular_subtract_x_w_brw_lsb = modular_subtract_x_w_brw_r[WORD_W -1:0];
-    wire [WORD_W -1:0] modular_subtract_y_w_brw_lsb = modular_subtract_y_w_brw_r[WORD_W -1:0];
+    //wire [WORD_W -1:0] modular_subtract_x_w_brw_lsb = modular_subtract_x_w_brw_r[WORD_W -1:0];
+    //wire [WORD_W -1:0] modular_subtract_y_w_brw_lsb = modular_subtract_y_w_brw_r[WORD_W -1:0];
 
-    wire [WORD_W -1:0] modular_subtract_x_w_cry_lsb = modular_subtract_x_w_cry_r[WORD_W -1:0];
-    wire [WORD_W -1:0] modular_subtract_y_w_cry_lsb = modular_subtract_y_w_cry_r[WORD_W -1:0];
+    //wire [WORD_W -1:0] modular_subtract_x_w_cry_lsb = modular_subtract_x_w_cry_r[WORD_W -1:0];
+    //wire [WORD_W -1:0] modular_subtract_y_w_cry_lsb = modular_subtract_y_w_cry_r[WORD_W -1:0];
 
-    wire [WORD_EXT_W -1:0] modular_subtract_x_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_brw_lsb};
-    wire [WORD_EXT_W -1:0] modular_subtract_y_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_brw_lsb};
+    //wire [WORD_EXT_W -1:0] modular_subtract_x_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_brw_lsb};
+    //wire [WORD_EXT_W -1:0] modular_subtract_y_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_brw_lsb};
 
-    wire [WORD_EXT_W -1:0] modular_subtract_x_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_cry_lsb};
-    wire [WORD_EXT_W -1:0] modular_subtract_y_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_cry_lsb};
+    //wire [WORD_EXT_W -1:0] modular_subtract_x_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_cry_lsb};
+    //wire [WORD_EXT_W -1:0] modular_subtract_y_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_cry_lsb};
     
     reg  [WORD_EXT_W -1:0] modular_subtract_x_mux;
     reg  [WORD_EXT_W -1:0] modular_subtract_y_mux;
@@ -1035,68 +1142,68 @@ module modexpng_general_worker
     wire [WORD_EXT_W -1:0] modular_subtract_x_mux_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_mux[WORD_W-1:0]};
     wire [WORD_EXT_W -1:0] modular_subtract_y_mux_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_mux[WORD_W-1:0]};
     
-    task _modular_subtract_update_brw;
-        input x_brw, y_brw;
-        {modular_subtract_x_brw_r, modular_subtract_y_brw_r} <= {x_brw, y_brw};
-    endtask
+    //task _modular_subtract_update_brw;
+        //input x_brw, y_brw;
+        //{modular_subtract_x_brw_r, modular_subtract_y_brw_r} <= {x_brw, y_brw};
+    //endtask
     
-    task _modular_subtract_update_cry;
-        input x_cry, y_cry;
-        {modular_subtract_x_cry_r, modular_subtract_y_cry_r} <= {x_cry, y_cry};
-    endtask
+    //task _modular_subtract_update_cry;
+        //input x_cry, y_cry;
+        //{modular_subtract_x_cry_r, modular_subtract_y_cry_r} <= {x_cry, y_cry};
+    //endtask
     
-    task modular_subtract_clear_brw; _modular_subtract_update_brw(                        1'b0,                         1'b0); endtask
-    task modular_subtract_store_brw; _modular_subtract_update_brw(modular_subtract_x_w_brw_msb, modular_subtract_y_w_brw_msb); endtask
+    //task modular_subtract_clear_brw; _modular_subtract_update_brw(                        1'b0,                         1'b0); endtask
+    //task modular_subtract_store_brw; _modular_subtract_update_brw(modular_subtract_x_w_brw_msb, modular_subtract_y_w_brw_msb); endtask
 
-    task modular_subtract_clear_cry; _modular_subtract_update_cry(                        1'b0,                         1'b0); endtask
-    task modular_subtract_store_cry; _modular_subtract_update_cry(modular_subtract_x_w_cry_msb, modular_subtract_y_w_cry_msb); endtask
+    //task modular_subtract_clear_cry; _modular_subtract_update_cry(                        1'b0,                         1'b0); endtask
+    //task modular_subtract_store_cry; _modular_subtract_update_cry(modular_subtract_x_w_cry_msb, modular_subtract_y_w_cry_msb); endtask
     
-    task _modular_subtract_update_diff_w_brw;
-        input [WORD_W:0] x_diff_w_brw, y_diff_w_brw;
-        {modular_subtract_x_w_brw_r, modular_subtract_y_w_brw_r} <= {x_diff_w_brw, y_diff_w_brw};
-    endtask
+    //task _modular_subtract_update_diff_w_brw;
+        //input [WORD_W:0] x_diff_w_brw, y_diff_w_brw;
+        //{modular_subtract_x_w_brw_r, modular_subtract_y_w_brw_r} <= {x_diff_w_brw, y_diff_w_brw};
+    //endtask
 
-    task _modular_subtract_update_sum_w_cry;
-        input [WORD_W:0] x_sum_w_cry, y_sum_w_cry;
-        {modular_subtract_x_w_cry_r, modular_subtract_y_w_cry_r} <= {x_sum_w_cry, y_sum_w_cry};
-    endtask
+    //task _modular_subtract_update_sum_w_cry;
+        //input [WORD_W:0] x_sum_w_cry, y_sum_w_cry;
+        //{modular_subtract_x_w_cry_r, modular_subtract_y_w_cry_r} <= {x_sum_w_cry, y_sum_w_cry};
+    //endtask
     
-    task modular_subtract_store_diff_w_brw; _modular_subtract_update_diff_w_brw(modular_subtract_x_w_brw, modular_subtract_y_w_brw); endtask
+    //task modular_subtract_store_diff_w_brw; _modular_subtract_update_diff_w_brw(modular_subtract_x_w_brw, modular_subtract_y_w_brw); endtask
 
-    task modular_subtract_store_sum_w_cry; _modular_subtract_update_sum_w_cry(modular_subtract_x_w_cry, modular_subtract_y_w_cry); endtask
+    //task modular_subtract_store_sum_w_cry; _modular_subtract_update_sum_w_cry(modular_subtract_x_w_cry, modular_subtract_y_w_cry); endtask
     
     always @(posedge clk)
         //
         case (opcode)
             //
-            UOP_OPCODE_MODULAR_SUBTRACT_X:
+            //UOP_OPCODE_MODULAR_SUBTRACT_X:
                 //
-                case (wrk_fsm_state)
+                //case (wrk_fsm_state)
                     //
-                    WRK_FSM_STATE_LATENCY_PRE3:  modular_subtract_clear_brw;
-                    WRK_FSM_STATE_BUSY1,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST3: modular_subtract_store_brw; // we need the very last borrow here too!
+                    //WRK_FSM_STATE_LATENCY_PRE3:  modular_subtract_clear_brw;
+                    //WRK_FSM_STATE_BUSY1,
+                    //WRK_FSM_STATE_LATENCY_POST1,
+                    //WRK_FSM_STATE_LATENCY_POST3: modular_subtract_store_brw; // we need the very last borrow here too!
                     //
-                    WRK_FSM_STATE_LATENCY_PRE4,
-                    WRK_FSM_STATE_BUSY2,
-                    WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_diff_w_brw;
+                    //WRK_FSM_STATE_LATENCY_PRE4,
+                    //WRK_FSM_STATE_BUSY2,
+                    //WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_diff_w_brw;
                     //
-                endcase
+                //endcase
             //
-            UOP_OPCODE_MODULAR_SUBTRACT_Y:
+            //UOP_OPCODE_MODULAR_SUBTRACT_Y:
                 //
-                case (wrk_fsm_state)
+                //case (wrk_fsm_state)
                     //
-                    WRK_FSM_STATE_LATENCY_PRE3:  modular_subtract_clear_cry;
-                    WRK_FSM_STATE_BUSY1,
-                    WRK_FSM_STATE_LATENCY_POST1: modular_subtract_store_cry;
+                    //WRK_FSM_STATE_LATENCY_PRE3:  modular_subtract_clear_cry;
+                    //WRK_FSM_STATE_BUSY1,
+                    //WRK_FSM_STATE_LATENCY_POST1: modular_subtract_store_cry;
                     //
-                    WRK_FSM_STATE_LATENCY_PRE4,
-                    WRK_FSM_STATE_BUSY2,
-                    WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_sum_w_cry;
+                    //WRK_FSM_STATE_LATENCY_PRE4,
+                    //WRK_FSM_STATE_BUSY2,
+                    //WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_sum_w_cry;
                     //
-                endcase
+                //endcase
             //
             UOP_OPCODE_MODULAR_SUBTRACT_Z:
                 //
@@ -1106,8 +1213,8 @@ module modexpng_general_worker
                     WRK_FSM_STATE_BUSY2,
                     WRK_FSM_STATE_LATENCY_POST2:
                         //
-                        begin modular_subtract_x_mux <= !modular_subtract_x_brw_r ? rd_narrow_x_din_x_dly1 : rd_wide_x_din_x_dly1;
-                              modular_subtract_y_mux <= !modular_subtract_y_brw_r ? rd_narrow_x_din_y_dly1 : rd_wide_x_din_y_dly1; end
+                        begin modular_subtract_x_mux <= !modular_subtract_x_brw_flag ? rd_narrow_x_din_x_dly1 : rd_wide_x_din_x_dly1;
+                              modular_subtract_y_mux <= !modular_subtract_y_brw_flag ? rd_narrow_x_din_y_dly1 : rd_wide_x_din_y_dly1; end
                     //
                 endcase            
             //
@@ -1316,7 +1423,8 @@ module modexpng_general_worker
         //
         case (opcode)
             //
-            UOP_OPCODE_PROPAGATE_CARRIES:
+            UOP_OPCODE_PROPAGATE_CARRIES,
+            UOP_OPCODE_MODULAR_SUBTRACT_X:
                 //
                 case (wrk_fsm_state)
                     //
@@ -1324,7 +1432,6 @@ module modexpng_general_worker
                     WRK_FSM_STATE_LATENCY_POST2,
                     WRK_FSM_STATE_LATENCY_POST4:
                         //
-                        //update_narrow_dout(propagate_carries_x_x_w_cry_reduced, propagate_carries_y_x_w_cry_reduced, propagate_carries_x_y_w_cry_reduced, propagate_carries_y_y_w_cry_reduced);
                         update_narrow_dout(dsp_x_x_p_reduced, dsp_y_x_p_reduced, dsp_x_y_p_reduced, dsp_y_y_p_reduced);
                     // 
                 endcase
@@ -1380,27 +1487,15 @@ module modexpng_general_worker
                   //                    
                 endcase
             //
-            UOP_OPCODE_MODULAR_SUBTRACT_X:
-                //
-                case (wrk_fsm_state)
-                    //
-                    WRK_FSM_STATE_BUSY1,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST3:
-                        //
-                        update_narrow_dout(modular_subtract_x_w_brw_reduced, modular_subtract_x_w_brw_reduced, modular_subtract_y_w_brw_reduced, modular_subtract_y_w_brw_reduced);
-                    //
-                endcase
-            //
             UOP_OPCODE_MODULAR_SUBTRACT_Y:
                 //
                 case (wrk_fsm_state)
                     //
-                    WRK_FSM_STATE_BUSY1,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST3:
-                        //
-                        update_wide_dout(modular_subtract_x_w_cry_reduced, modular_subtract_x_w_cry_reduced, modular_subtract_y_w_cry_reduced, modular_subtract_y_w_cry_reduced);
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2,
+                    WRK_FSM_STATE_LATENCY_POST4:
+                        // 
+                        update_wide_dout(dsp_x_x_p_reduced, dsp_y_x_p_reduced, dsp_x_y_p_reduced, dsp_y_y_p_reduced);
                     //
                 endcase                
             //



More information about the Commits mailing list