[Cryptech-Commits] [user/shatov/modexpng] 01/21: Had to rework the general worker module to reach 180 MHz core clock. The module is responsible for doing certain supporting operations (mostly moving operands between banks and doing some simple math operations, such as modular subtraction and regular addition). Depending on the particular operation, one of three bank address space sweep patterns was used: * one-pass (for things like carry propagation) * two-pass (for things like modular subtraction that produce intermediate values in [...]

git at cryptech.is git at cryptech.is
Mon Jan 20 21:18:02 UTC 2020


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit 83f8779a661202183f5866a4e80ef36f24b9e1ea
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Jan 16 14:45:26 2020 +0300

    Had to rework the general worker module to reach 180 MHz core clock. The module
    is responsible for doing certain supporting operations (mostly moving operands
    between banks and doing some simple math operations, such as modular
    subtraction and regular addition). Depending on the particular operation, one
    of three bank address space sweep patterns was used:
     * one-pass (for things like carry propagation)
     * two-pass (for things like modular subtraction that produce intermediate
       values in the process)
     * one-pass interleaved (for copying when only one of CRT_?.X or CRT_?.Y is
       rewritten: we can only write to X and Y simultaneously, so we have to
       interleave reads from the source bank with reads from the destination bank
       and overwrite the destination with its just-read value, otherwise the second
       destination operand is lost)
    I initially coded three FSMs, one for each of the address space sweeps and
    triggered one of them depending on the opcode, but that turned out too
    complicated. There's now only one FSM that always does the "one-pass
    interleaved" pattern, whereas the second read (from the destination bank) is
    inhibited when not needed by the opcode.
---
 rtl/modexpng_general_worker.v | 1898 ++++++++++++++++++-----------------------
 1 file changed, 839 insertions(+), 1059 deletions(-)

diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v
index eadd284..0620bd6 100644
--- a/rtl/modexpng_general_worker.v
+++ b/rtl/modexpng_general_worker.v
@@ -127,67 +127,46 @@ module modexpng_general_worker
     //
     // FSM Declaration
     //
-    localparam [5:0] WRK_FSM_STATE_IDLE             = 6'h00;
-    
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1     = 6'h01;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2     = 6'h02;
-    localparam [5:0] WRK_FSM_STATE_BUSY             = 6'h03;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1    = 6'h05;    // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug!
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2    = 6'h06;
-    
-    localparam [5:0] WRK_FSM_STATE_STOP             = 6'h07;
-    
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M1  = 6'h10;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M2  = 6'h11;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M1  = 6'h12;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M2  = 6'h13;    
-    localparam [5:0] WRK_FSM_STATE_BUSY_M1          = 6'h14;
-    localparam [5:0] WRK_FSM_STATE_BUSY_M2          = 6'h15;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 6'h16;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 6'h17;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 6'h18;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 6'h19;
-
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_TP  = 6'h20;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_TP  = 6'h21;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE3_TP  = 6'h22;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE4_TP  = 6'h23;
-    localparam [5:0] WRK_FSM_STATE_BUSY_TP          = 6'h24;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_TP = 6'h25;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_TP = 6'h26;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST3_TP = 6'h27;
-    localparam [5:0] WRK_FSM_STATE_LATENCY_POST4_TP = 6'h28;
-    localparam [5:0] WRK_FSM_STATE_HOLDOFF_TP       = 6'h29;
-    
-    reg [5:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;
-    reg [5:0] wrk_fsm_state_next_one_pass;         // single address space sweep
-    reg [5:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y)
-    reg [5:0] wrk_fsm_state_next_two_pass;         // two address space sweeps
-    reg       wrk_fsm_two_pass_pass;               // 0=first pass, 1=second pass
-    reg       wrk_fsm_two_pass_pass_dly;           // 0=first pass, 1=second pass
-
-
-    // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps)
+
+    localparam [3:0] WRK_FSM_STATE_IDLE          = 4'h0;
+    
+    localparam [3:0] WRK_FSM_STATE_LATENCY_PRE1  = 4'h1;
+    localparam [3:0] WRK_FSM_STATE_LATENCY_PRE2  = 4'h2;
+    localparam [3:0] WRK_FSM_STATE_LATENCY_PRE3  = 4'h3;
+    localparam [3:0] WRK_FSM_STATE_LATENCY_PRE4  = 4'h4;
+    
+    localparam [3:0] WRK_FSM_STATE_BUSY1         = 4'hA;
+    localparam [3:0] WRK_FSM_STATE_BUSY2         = 4'hB;
+    
+    localparam [3:0] WRK_FSM_STATE_LATENCY_POST1 = 4'h5;
+    localparam [3:0] WRK_FSM_STATE_LATENCY_POST2 = 4'h6;
+    localparam [3:0] WRK_FSM_STATE_LATENCY_POST3 = 4'h7;
+    localparam [3:0] WRK_FSM_STATE_LATENCY_POST4 = 4'h8;
     
+    localparam [3:0] WRK_FSM_STATE_STOP          = 4'hF;
+
+    reg [3:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;
+    reg [3:0] wrk_fsm_state_next;
+
 
     //
     // Control Signals
     //
-    reg                    rd_wide_xy_ena_x = 1'b0;
-    reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_x;
-    reg [  OP_ADDR_W -1:0] rd_wide_xy_addr_x; 
+    reg                    rd_wide_ena_x = 1'b0;
+    reg [BANK_ADDR_W -1:0] rd_wide_bank_x;
+    reg [  OP_ADDR_W -1:0] rd_wide_addr_x; 
 
-    reg                    rd_narrow_xy_ena_x = 1'b0;
-    reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_x;
-    reg [  OP_ADDR_W -1:0] rd_narrow_xy_addr_x; 
+    reg                    rd_narrow_ena_x = 1'b0;
+    reg [BANK_ADDR_W -1:0] rd_narrow_bank_x;
+    reg [  OP_ADDR_W -1:0] rd_narrow_addr_x; 
 
-    reg                    rd_wide_xy_ena_y = 1'b0;
-    reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_y;
-    reg [  OP_ADDR_W -1:0] rd_wide_xy_addr_y; 
+    reg                    rd_wide_ena_y = 1'b0;
+    reg [BANK_ADDR_W -1:0] rd_wide_bank_y;
+    reg [  OP_ADDR_W -1:0] rd_wide_addr_y; 
 
-    reg                    rd_narrow_xy_ena_y = 1'b0;
-    reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_y;
-    reg [  OP_ADDR_W -1:0] rd_narrow_xy_addr_y; 
+    reg                    rd_narrow_ena_y = 1'b0;
+    reg [BANK_ADDR_W -1:0] rd_narrow_bank_y;
+    reg [  OP_ADDR_W -1:0] rd_narrow_addr_y; 
     
     reg                    wr_wide_xy_ena_x = 1'b0;
     reg [BANK_ADDR_W -1:0] wr_wide_xy_bank_x;
@@ -217,21 +196,21 @@ module modexpng_general_worker
     //
     // Mapping
     //
-    assign wrk_rd_wide_xy_ena_x     = rd_wide_xy_ena_x;
-    assign wrk_rd_wide_xy_bank_x    = rd_wide_xy_bank_x;
-    assign wrk_rd_wide_xy_addr_x    = rd_wide_xy_addr_x;
+    assign wrk_rd_wide_xy_ena_x     = rd_wide_ena_x;
+    assign wrk_rd_wide_xy_bank_x    = rd_wide_bank_x;
+    assign wrk_rd_wide_xy_addr_x    = rd_wide_addr_x;
 
-    assign wrk_rd_narrow_xy_ena_x   = rd_narrow_xy_ena_x;
-    assign wrk_rd_narrow_xy_bank_x  = rd_narrow_xy_bank_x;
-    assign wrk_rd_narrow_xy_addr_x  = rd_narrow_xy_addr_x;
+    assign wrk_rd_narrow_xy_ena_x   = rd_narrow_ena_x;
+    assign wrk_rd_narrow_xy_bank_x  = rd_narrow_bank_x;
+    assign wrk_rd_narrow_xy_addr_x  = rd_narrow_addr_x;
     
-    assign wrk_rd_wide_xy_ena_y     = rd_wide_xy_ena_y;
-    assign wrk_rd_wide_xy_bank_y    = rd_wide_xy_bank_y;
-    assign wrk_rd_wide_xy_addr_y    = rd_wide_xy_addr_y;
+    assign wrk_rd_wide_xy_ena_y     = rd_wide_ena_y;
+    assign wrk_rd_wide_xy_bank_y    = rd_wide_bank_y;
+    assign wrk_rd_wide_xy_addr_y    = rd_wide_addr_y;
 
-    assign wrk_rd_narrow_xy_ena_y   = rd_narrow_xy_ena_y;
-    assign wrk_rd_narrow_xy_bank_y  = rd_narrow_xy_bank_y;
-    assign wrk_rd_narrow_xy_addr_y  = rd_narrow_xy_addr_y;
+    assign wrk_rd_narrow_xy_ena_y   = rd_narrow_ena_y;
+    assign wrk_rd_narrow_xy_bank_y  = rd_narrow_bank_y;
+    assign wrk_rd_narrow_xy_addr_y  = rd_narrow_addr_y;
 
     assign wrk_wr_wide_xy_ena_x     = wr_wide_xy_ena_x;
     assign wrk_wr_wide_xy_bank_x    = wr_wide_xy_bank_x;
@@ -260,172 +239,111 @@ module modexpng_general_worker
    
     //
     // Delays
-    //    
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly1;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly2;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly3;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly4;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly1;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly2;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly3;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly4;
-
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly1;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly2;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly3;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly4;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly1;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly2;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly3;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly4;
-    
-    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly1;
-    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly2;
-    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly3;
-    //reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly4;
-    
-    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly1;
-    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly2;
-    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly3;
-    //reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly4;
-    
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly1;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly2;
-    
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly1;
-    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly2;
+    //
+    reg [OP_ADDR_W -1:0] rd_narrow_addr_x_dly[0:3];
+    reg [OP_ADDR_W -1:0] rd_narrow_addr_y_dly[0:3];
+
+    reg [OP_ADDR_W -1:0] rd_wide_addr_x_dly[0:3];
+    reg [OP_ADDR_W -1:0] rd_wide_addr_y_dly[0:3];
+    
+    reg [WORD_EXT_W -1:0] rd_wide_x_din_x_dly1;
+    reg [WORD_EXT_W -1:0] rd_wide_y_din_x_dly1;
+    reg [WORD_EXT_W -1:0] rd_wide_x_din_y_dly1;
+    reg [WORD_EXT_W -1:0] rd_wide_y_din_y_dly1;    
+    reg [WORD_EXT_W -1:0] rd_narrow_x_din_x_dly1;
+    reg [WORD_EXT_W -1:0] rd_narrow_y_din_x_dly1;
+    reg [WORD_EXT_W -1:0] rd_narrow_x_din_y_dly1;
+    reg [WORD_EXT_W -1:0] rd_narrow_y_din_y_dly1;
     
     always @(posedge clk) begin
         //
-        {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x};
-        {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y};        
+        {rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x};
+        {rd_wide_y_din_x_dly1} <= {wrk_rd_wide_y_din_x};
+        {rd_wide_x_din_y_dly1} <= {wrk_rd_wide_x_din_y};
+        {rd_wide_y_din_y_dly1} <= {wrk_rd_wide_y_din_y};
         //
-        {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x};
-        {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y};
+        {rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x};
+        {rd_narrow_y_din_x_dly1} <= {wrk_rd_narrow_y_din_x};
+        {rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y};
+        {rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y};
         //
-        {/*wrk_rd_wide_x_din_x_dly4,*/ wrk_rd_wide_x_din_x_dly3, wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1} <= {/*wrk_rd_wide_x_din_x_dly3,*/ wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1, wrk_rd_wide_x_din_x};
-        {/*wrk_rd_wide_x_din_y_dly4,*/ wrk_rd_wide_x_din_y_dly3, wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1} <= {/*wrk_rd_wide_x_din_y_dly3,*/ wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1, wrk_rd_wide_x_din_y};
+        {rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0]} <= {rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0], rd_narrow_addr_x};
+        {rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0]} <= {rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0], rd_narrow_addr_y};
         //
-        {wrk_rd_narrow_x_din_x_dly3, wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1, wrk_rd_narrow_x_din_x};
-        {wrk_rd_narrow_y_din_x_dly2, wrk_rd_narrow_y_din_x_dly1} <= {wrk_rd_narrow_y_din_x_dly1, wrk_rd_narrow_y_din_x};
-        {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y};
-        {wrk_rd_narrow_y_din_y_dly2, wrk_rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y_dly1, wrk_rd_narrow_y_din_y};
+        {rd_wide_addr_x_dly[3], rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0]} <= {rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0], rd_wide_addr_x};
+        {rd_wide_addr_y_dly[3], rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0]} <= {rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0], rd_wide_addr_y};
         //
     end
-        
-
+    
+  
     //
     // Source Read Enable Logic
     //
+    task _update_wide_rd_en;   input _en; {rd_wide_ena_x,   rd_wide_ena_y  } <= {2{_en}}; endtask
+    task _update_narrow_rd_en; input _en; {rd_narrow_ena_x, rd_narrow_ena_y} <= {2{_en}}; endtask
     
-    task _update_wide_xy_rd_en;   input _en; {rd_wide_xy_ena_x,   rd_wide_xy_ena_y  } <= {2{_en}}; endtask
-    task _update_narrow_xy_rd_en; input _en; {rd_narrow_xy_ena_x, rd_narrow_xy_ena_y} <= {2{_en}}; endtask
+    task enable_wide_rd_en;  _update_wide_rd_en(1'b1); endtask
+    task disable_wide_rd_en; _update_wide_rd_en(1'b0); endtask
     
-    task enable_wide_xy_rd_en;  _update_wide_xy_rd_en(1'b1); endtask
-    task disable_wide_xy_rd_en; _update_wide_xy_rd_en(1'b0); endtask
-    
-    task enable_narrow_xy_rd_en;  _update_narrow_xy_rd_en(1'b1); endtask
-    task disable_narrow_xy_rd_en; _update_narrow_xy_rd_en(1'b0); endtask
+    task enable_narrow_rd_en;  _update_narrow_rd_en(1'b1); endtask
+    task disable_narrow_rd_en; _update_narrow_rd_en(1'b0); endtask
     
     always @(posedge clk or negedge rst_n)
         //
         if (!rst_n) begin
             //
-            disable_wide_xy_rd_en;
-            disable_narrow_xy_rd_en;
+            disable_wide_rd_en;
+            disable_narrow_rd_en;
             //
         end else begin
             //
-            disable_wide_xy_rd_en;
-            disable_narrow_xy_rd_en;
-            //
-            // one_pass
+            disable_wide_rd_en;
+            disable_narrow_rd_en;
             //
-            case (wrk_fsm_state_next_one_pass)
+            case (opcode)
                 //
-                WRK_FSM_STATE_LATENCY_PRE1,
-                WRK_FSM_STATE_LATENCY_PRE2,
-                WRK_FSM_STATE_BUSY:
+                UOP_OPCODE_PROPAGATE_CARRIES,
+                UOP_OPCODE_OUTPUT_FROM_NARROW,
+                UOP_OPCODE_MODULAR_REDUCE_INIT,
+                UOP_OPCODE_MODULAR_SUBTRACT_X:
                     //
-                    case (opcode)
-                        //
-                        UOP_OPCODE_PROPAGATE_CARRIES,
-                        UOP_OPCODE_OUTPUT_FROM_NARROW,
-                        UOP_OPCODE_MODULAR_REDUCE_INIT:
-                            //
-                            enable_narrow_xy_rd_en;
-                            //
-                        UOP_OPCODE_COPY_CRT_Y2X: begin
-                            //
-                            enable_wide_xy_rd_en;
-                            enable_narrow_xy_rd_en;                            
-                            //
-                        end
-                        //
-                        UOP_OPCODE_MERGE_LH:
-                            //
-                            enable_wide_xy_rd_en;
-                            //
+                    case (wrk_fsm_state_next)
+                        WRK_FSM_STATE_LATENCY_PRE1,
+                        WRK_FSM_STATE_LATENCY_PRE3,
+                        WRK_FSM_STATE_BUSY1: enable_narrow_rd_en;
                     endcase
                 //
-            endcase
-            //
-            // one_pass_meander
-            //
-            case (wrk_fsm_state_next_one_pass_meander)
-                //
-                WRK_FSM_STATE_LATENCY_PRE1_M1,
-                WRK_FSM_STATE_LATENCY_PRE1_M2,
-                WRK_FSM_STATE_LATENCY_PRE2_M1,
-                WRK_FSM_STATE_LATENCY_PRE2_M2,
-                WRK_FSM_STATE_BUSY_M1,
-                WRK_FSM_STATE_BUSY_M2:
+                UOP_OPCODE_COPY_CRT_Y2X,
+                UOP_OPCODE_MODULAR_SUBTRACT_Y,
+                UOP_OPCODE_MODULAR_SUBTRACT_Z,
+                UOP_OPCODE_REGULAR_ADD_UNEVEN:
                     //
-                    case (opcode)
-                        //
-                        UOP_OPCODE_COPY_LADDERS_X2Y,
-                        UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                            //
-                            enable_wide_xy_rd_en;
-                            enable_narrow_xy_rd_en;
-                            //
-                        end
-                        //
-                        UOP_OPCODE_REGULAR_ADD_UNEVEN:
-                            //
-                            enable_narrow_xy_rd_en;
-                        //
+                    case (wrk_fsm_state_next)
+                        WRK_FSM_STATE_LATENCY_PRE1,
+                        WRK_FSM_STATE_LATENCY_PRE3,
+                        WRK_FSM_STATE_BUSY1: begin enable_wide_rd_en; enable_narrow_rd_en;  end
                     endcase
                 //
-            endcase
-            //
-            // two_pass
-            //
-            case (wrk_fsm_state_next_two_pass)
-                //
-                WRK_FSM_STATE_LATENCY_PRE1_TP,
-                WRK_FSM_STATE_LATENCY_PRE2_TP,
-                WRK_FSM_STATE_LATENCY_PRE3_TP,
-                WRK_FSM_STATE_LATENCY_PRE4_TP,
-                WRK_FSM_STATE_BUSY_TP:
+                UOP_OPCODE_COPY_LADDERS_X2Y,
+                UOP_OPCODE_CROSS_LADDERS_X2Y:
                     //
-                    case (opcode)
-                        UOP_OPCODE_MODULAR_SUBTRACT:
-                            //
-                            if (!wrk_fsm_two_pass_pass) begin
-                                enable_wide_xy_rd_en;
-                                enable_narrow_xy_rd_en;
-                            end else
-                                enable_narrow_xy_rd_en;
-                        //
+                    case (wrk_fsm_state_next)
+                        WRK_FSM_STATE_LATENCY_PRE1,
+                        WRK_FSM_STATE_LATENCY_PRE2,
+                        WRK_FSM_STATE_LATENCY_PRE3,
+                        WRK_FSM_STATE_LATENCY_PRE4,
+                        WRK_FSM_STATE_BUSY1,
+                        WRK_FSM_STATE_BUSY2: begin enable_wide_rd_en; enable_narrow_rd_en;  end
                     endcase
                 //
+                UOP_OPCODE_MERGE_LH:
+                    //
+                    case (wrk_fsm_state_next)
+                        WRK_FSM_STATE_LATENCY_PRE1,
+                        WRK_FSM_STATE_LATENCY_PRE3,
+                        WRK_FSM_STATE_BUSY1: enable_wide_rd_en;
+                    endcase                
+                //
             endcase
             //
         end
@@ -435,490 +353,330 @@ module modexpng_general_worker
     // Destination Write Enable Logic
     //
     
-    task _update_wide_xy_wr_en;   input _en; {wr_wide_xy_ena_x,   wr_wide_xy_ena_y  } <= {2{_en}}; endtask
-    task _update_narrow_xy_wr_en; input _en; {wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{_en}}; endtask
+    task _update_wide_wr_en;   input _en; {wr_wide_xy_ena_x,   wr_wide_xy_ena_y  } <= {2{_en}}; endtask
+    task _update_narrow_wr_en; input _en; {wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{_en}}; endtask
     
-    task enable_wide_xy_wr_en;  _update_wide_xy_wr_en(1'b1); endtask
-    task disable_wide_xy_wr_en; _update_wide_xy_wr_en(1'b0); endtask
+    task enable_wide_wr_en;  _update_wide_wr_en(1'b1); endtask
+    task disable_wide_wr_en; _update_wide_wr_en(1'b0); endtask
     
-    task enable_narrow_xy_wr_en;  _update_narrow_xy_wr_en(1'b1); endtask
-    task disable_narrow_xy_wr_en; _update_narrow_xy_wr_en(1'b0); endtask
+    task enable_narrow_wr_en;  _update_narrow_wr_en(1'b1); endtask
+    task disable_narrow_wr_en; _update_narrow_wr_en(1'b0); endtask
     
     always @(posedge clk or negedge rst_n)
         //
         if (!rst_n) begin
             //
-            disable_wide_xy_wr_en;
-            disable_narrow_xy_wr_en;
+            disable_wide_wr_en;
+            disable_narrow_wr_en;
             //
         end else begin
             //
-            disable_wide_xy_wr_en;
-            disable_narrow_xy_wr_en;
+            disable_wide_wr_en;
+            disable_narrow_wr_en;
             //
-            // one_pass
-            //
-            case (wrk_fsm_state)
+            case (opcode)
                 //
-                WRK_FSM_STATE_BUSY,
-                WRK_FSM_STATE_LATENCY_POST1,
-                WRK_FSM_STATE_LATENCY_POST2:
+                UOP_OPCODE_PROPAGATE_CARRIES,
+                UOP_OPCODE_MODULAR_SUBTRACT_X,
+                UOP_OPCODE_MERGE_LH,
+                UOP_OPCODE_REGULAR_ADD_UNEVEN:
                     //
-                    case (opcode)
-                        //
-                        UOP_OPCODE_PROPAGATE_CARRIES,
-                        UOP_OPCODE_MERGE_LH:
-                            //
-                            enable_narrow_xy_wr_en;
-                            //
-                        UOP_OPCODE_COPY_CRT_Y2X: begin
-                            //
-                            enable_wide_xy_wr_en;
-                            enable_narrow_xy_wr_en;                            
-                            //
-                        end
-                        //
-                        UOP_OPCODE_MODULAR_REDUCE_INIT:
-                            //
-                            enable_wide_xy_wr_en;
-                        //
+                    case (wrk_fsm_state)
+                        WRK_FSM_STATE_BUSY1,
+                        WRK_FSM_STATE_LATENCY_POST1,
+                        WRK_FSM_STATE_LATENCY_POST3: enable_narrow_wr_en;
                     endcase
                 //
-            endcase
-            //
-            // one_pass_meander
-            //
-            case (wrk_fsm_state)
-                //
-                WRK_FSM_STATE_BUSY_M2,
-                WRK_FSM_STATE_LATENCY_POST1_M2,
-                WRK_FSM_STATE_LATENCY_POST2_M2:
+                UOP_OPCODE_COPY_CRT_Y2X,
+                UOP_OPCODE_COPY_LADDERS_X2Y,
+                UOP_OPCODE_CROSS_LADDERS_X2Y,
+                UOP_OPCODE_MODULAR_SUBTRACT_Z:
                     //
-                    case (opcode)
-                        //
-                        UOP_OPCODE_COPY_LADDERS_X2Y,
-                        UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                            //
-                            enable_wide_xy_wr_en;
-                            enable_narrow_xy_wr_en;
-                            //
-                        end
-                        //
-                        UOP_OPCODE_REGULAR_ADD_UNEVEN:
-                            //
-                            enable_narrow_xy_wr_en;
-                        //
+                    case (wrk_fsm_state)
+                        WRK_FSM_STATE_BUSY1,
+                        WRK_FSM_STATE_LATENCY_POST1,
+                        WRK_FSM_STATE_LATENCY_POST3: begin enable_wide_wr_en; enable_narrow_wr_en; end
                     endcase
                 //
-            endcase
-            //
-            // two_pass
-            //
-            case (wrk_fsm_state)
-                //
-                WRK_FSM_STATE_BUSY_TP,
-                WRK_FSM_STATE_LATENCY_POST1_TP,
-                WRK_FSM_STATE_LATENCY_POST2_TP,
-                WRK_FSM_STATE_LATENCY_POST3_TP,
-                WRK_FSM_STATE_LATENCY_POST4_TP:
+                UOP_OPCODE_MODULAR_REDUCE_INIT,
+                UOP_OPCODE_MODULAR_SUBTRACT_Y:
                     //
-                    case (opcode)
-                        //
-                        UOP_OPCODE_MODULAR_SUBTRACT:
-                            //
-                            if (!wrk_fsm_two_pass_pass)
-                                enable_narrow_xy_wr_en;
-                            else begin
-                                enable_wide_xy_wr_en;
-                                enable_narrow_xy_wr_en;
-                            end
-                        //
+                    case (wrk_fsm_state)
+                        WRK_FSM_STATE_BUSY1,
+                        WRK_FSM_STATE_LATENCY_POST1,
+                        WRK_FSM_STATE_LATENCY_POST3: enable_wide_wr_en;
                     endcase
-                //
+                //  
             endcase
             //
         end
-
+   
 
     //
-    // Source to Destination Data Logic
+    // Source Read Address Logic
     //
+    reg [OP_ADDR_W -1:0] rd_wide_addr_next;
+    reg [OP_ADDR_W -1:0] rd_narrow_addr_next;
+
+    reg rd_wide_addr_is_last = 1'b0;
+    reg rd_narrow_addr_is_last = 1'b0;
+    
+    reg rd_wide_addr_is_last_half = 1'b0;
+    reg rd_narrow_addr_is_last_half = 1'b0;
+
+    reg rd_wide_addr_next_is_last = 1'b0;
+    reg rd_narrow_addr_next_is_last = 1'b0;
     
+    reg rd_wide_addr_next_is_last_half = 1'b0;
+    reg rd_narrow_addr_next_is_last_half = 1'b0;
+    
+    reg [3:0] rd_wide_addr_is_last_half_dly = 4'h0;
+    reg [3:0] rd_narrow_addr_is_last_half_dly = 4'h0;
+
     always @(posedge clk) begin
         //
-        update_wide_dout  (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
-        update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
-        //
-        // one_pass
-        //
-        case (wrk_fsm_state)
-            //
-            WRK_FSM_STATE_BUSY,
-            WRK_FSM_STATE_LATENCY_POST1,
-            WRK_FSM_STATE_LATENCY_POST2:
-                //
-                case (opcode)
-                    //
-                    UOP_OPCODE_PROPAGATE_CARRIES:
-                        //
-                        update_narrow_dout(rd_narrow_x_din_x_w_cry_reduced,
-                                           rd_narrow_y_din_x_w_cry_reduced,
-                                           rd_narrow_x_din_y_w_cry_reduced,
-                                           rd_narrow_y_din_y_w_cry_reduced);
-                    //
-                    UOP_OPCODE_COPY_CRT_Y2X: begin
-                        //
-                        update_wide_dout(wrk_rd_wide_x_din_y,
-                                         wrk_rd_wide_y_din_y,
-                                         wrk_rd_wide_x_din_y,
-                                         wrk_rd_wide_y_din_y);
-                        //
-                        update_narrow_dout(wrk_rd_narrow_x_din_y,
-                                           wrk_rd_narrow_y_din_y,
-                                           wrk_rd_narrow_x_din_y,
-                                           wrk_rd_narrow_y_din_y);        
-                        //
-                    end
-                    //
-                    UOP_OPCODE_MODULAR_REDUCE_INIT:
-                        //
-                        update_wide_dout(wrk_rd_narrow_x_din_x,
-                                         wrk_rd_narrow_y_din_x,
-                                         wrk_rd_narrow_x_din_y,
-                                         wrk_rd_narrow_y_din_y);
-                    //
-                    UOP_OPCODE_MERGE_LH:
-                        //
-                        update_narrow_dout(wrk_rd_wide_x_din_x,
-                                           wrk_rd_wide_y_din_x,
-                                           wrk_rd_wide_x_din_y,
-                                           wrk_rd_wide_y_din_y);
-                    //
-                endcase
-            //
-        endcase
-        //
-        // one_pass_meander
-        //
-        case (wrk_fsm_state)
-            //
-            WRK_FSM_STATE_BUSY_M2,
-            WRK_FSM_STATE_LATENCY_POST1_M2,
-            WRK_FSM_STATE_LATENCY_POST2_M2:
-                //
-                case (opcode)
-                    //
-                    UOP_OPCODE_COPY_LADDERS_X2Y: begin
-                        //
-                        update_wide_dout(wrk_rd_wide_x_din_x_dly3,
-                                         wrk_rd_wide_x_din_x_dly2,
-                                         wrk_rd_wide_x_din_y_dly3,
-                                         wrk_rd_wide_x_din_y_dly2);
-                        //
-                        update_narrow_dout(wrk_rd_narrow_x_din_x_dly3,
-                                           wrk_rd_narrow_x_din_x_dly2,
-                                           wrk_rd_narrow_x_din_y_dly3,
-                                           wrk_rd_narrow_x_din_y_dly2);
-                        //
-                    end
-                    //
-                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                        //
-                        update_wide_dout(wrk_rd_wide_x_din_x_dly3,
-                                         wrk_rd_wide_x_din_y_dly2,
-                                         wrk_rd_wide_x_din_y_dly3,
-                                         wrk_rd_wide_x_din_x_dly2);
-                        //
-                        update_narrow_dout(wrk_rd_narrow_x_din_x_dly3,
-                                           wrk_rd_narrow_x_din_y_dly2,
-                                           wrk_rd_narrow_x_din_y_dly3,
-                                           wrk_rd_narrow_x_din_x_dly2);
-                        //
-                    end
-                    //
-                    UOP_OPCODE_REGULAR_ADD_UNEVEN: begin
-                        //
-                        update_narrow_dout(regadd_x_x_trunc,
-                                           regadd_y_x_trunc,
-                                           regadd_x_y_trunc,
-                                           regadd_y_y_trunc);
-                        //
-                    end
-                    //
-                endcase
-            //
-        endcase
-        //
-        // two_pass
-        //
-        case (wrk_fsm_state)
-            //
-            WRK_FSM_STATE_BUSY_TP,
-            WRK_FSM_STATE_LATENCY_POST1_TP,
-            WRK_FSM_STATE_LATENCY_POST2_TP,
-            WRK_FSM_STATE_LATENCY_POST3_TP,
-            WRK_FSM_STATE_LATENCY_POST4_TP:
-                //
-                case (opcode)
-                    //
-                    UOP_OPCODE_MODULAR_SUBTRACT:
-                        //
-                        if (!wrk_fsm_two_pass_pass)
-                            update_narrow_dout(modsub_x_ab_dly_trunc, modsub_x_abn_trunc, modsub_y_ab_dly_trunc, modsub_y_abn_trunc);
-                        else begin
-                            update_wide_dout  (modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux);
-                            update_narrow_dout(modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux);
-                        end
-                        //
-                endcase
-            //
-        endcase
+        rd_wide_addr_is_last_half_dly   <= {rd_wide_addr_is_last_half_dly[2:0], rd_wide_addr_is_last_half};
+        rd_narrow_addr_is_last_half_dly <= {rd_narrow_addr_is_last_half_dly[2:0], rd_narrow_addr_is_last_half};
         //
     end
 
-
-    //
-    // Source Read Address Logic
-    //
-    
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_xy_next;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_xy_next;
-
-    reg rd_wide_xy_addr_xy_next_last_seen;
-    reg rd_wide_xy_addr_xy_next_last_seen_dly1;
-    reg rd_wide_xy_addr_xy_next_last_seen_dly2;
-
-    wire rd_wide_xy_addr_xy_next_is_last = rd_wide_xy_addr_xy_next == word_index_last_half;
-    wire rd_narrow_xy_addr_xy_next_is_last = rd_narrow_xy_addr_xy_next == word_index_last;
+    task preset_rd_wide_bank_addr;  // load one {bank, addr} pair into both wide read bank/address register pairs (x and y) and clear the wide "last" flags
+        input [BANK_ADDR_W -1:0] bank;  // value written to rd_wide_bank_x and rd_wide_bank_y
+        input [  OP_ADDR_W -1:0] addr;  // value written to rd_wide_addr_x and rd_wide_addr_y
+        begin
+            {rd_wide_bank_x, rd_wide_addr_x} <= {bank, addr};
+            {rd_wide_bank_y, rd_wide_addr_y} <= {bank, addr};
+            rd_wide_addr_is_last      <= 1'b0;  // flags are cleared unconditionally; addr is not compared against word_index_last here
+            rd_wide_addr_is_last_half <= 1'b0;
+        end
+    endtask
     
-    task update_rd_wide_bank_addr;
+    task preset_rd_narrow_bank_addr;  // load one {bank, addr} pair into both narrow read bank/address register pairs (x and y) and clear the narrow "last" flags
         input [BANK_ADDR_W -1:0] bank;
         input [  OP_ADDR_W -1:0] addr;
         begin
-            {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, addr};
-            {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, addr};
+            {rd_narrow_bank_x, rd_narrow_addr_x} <= {bank, addr};
+            {rd_narrow_bank_y, rd_narrow_addr_y} <= {bank, addr};
+            rd_narrow_addr_is_last      <= 1'b0;  // flags are cleared unconditionally; addr is not compared against word_index_last here
+            rd_narrow_addr_is_last_half <= 1'b0;
+        end
+    endtask
+      
+    task preset_rd_wide_addr_next;  // load the queued ("next") wide read address and clear its registered flags
+        input [OP_ADDR_W -1:0] addr;  // value written to rd_wide_addr_next
+        begin
+            rd_wide_addr_next              <= addr;
+            rd_wide_addr_next_is_last      <= 1'b0;  // cleared without comparing addr to word_index_last
+            rd_wide_addr_next_is_last_half <= 1'b0;  // cleared without comparing addr to word_index_last_half
         end
     endtask
 
-    task update_rd_wide_bank;
-        input [BANK_ADDR_W -1:0] bank;
+    task preset_rd_narrow_addr_next;  // load the queued ("next") narrow read address and clear its registered flags
+        input [OP_ADDR_W -1:0] addr;  // value written to rd_narrow_addr_next
+        begin
+            rd_narrow_addr_next              <= addr;
+            rd_narrow_addr_next_is_last      <= 1'b0;  // cleared without comparing addr to word_index_last
+            rd_narrow_addr_next_is_last_half <= 1'b0;  // cleared without comparing addr to word_index_last_half
+        end
+    endtask
+    
+    task keep_rd_wide_bank;  // re-assign both wide read banks to their current values
         begin
-            {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, rd_wide_xy_addr_x};
-            {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, rd_wide_xy_addr_y};
+            {rd_wide_bank_x} <= {rd_wide_bank_x};  // a later non-blocking assignment wins, so this overrides a BANK_DNC default issued earlier in the calling always block
+            {rd_wide_bank_y} <= {rd_wide_bank_y};
         end
     endtask
     
-    task update_rd_narrow_bank_addr;
+    task switch_rd_wide_bank;  // point both wide read banks (x and y) at the given bank; the wide addresses are not assigned here
         input [BANK_ADDR_W -1:0] bank;
-        input [  OP_ADDR_W -1:0] addr;
         begin
-            {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, addr};
-            {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, addr};
+            {rd_wide_bank_x} <= {bank};
+            {rd_wide_bank_y} <= {bank};
+        end
+    endtask
+    
+    task keep_rd_wide_addr;  // re-assign both wide read addresses to their current values; the "last" flags are not assigned here
+        begin
+            {rd_wide_addr_x} <= {rd_wide_addr_x};  // a later non-blocking assignment wins, so this overrides an OP_ADDR_DNC default issued earlier in the calling always block
+            {rd_wide_addr_y} <= {rd_wide_addr_y};
+        end
+    endtask
+    
+    task advance_rd_wide_addr;  // move the queued address (rd_wide_addr_next) into both wide read addresses and register its "last" flags
+        begin
+            {rd_wide_addr_x} <= {rd_wide_addr_next};
+            {rd_wide_addr_y} <= {rd_wide_addr_next};
+            rd_wide_addr_is_last      <= rd_wide_addr_next == word_index_last;       // flag describes the address being loaded in this same clock edge
+            rd_wide_addr_is_last_half <= rd_wide_addr_next == word_index_last_half;  // same, against the "half" index
+        end
+    endtask
+    
+    task keep_rd_narrow_bank;  // re-assign both narrow read banks to their current values
+        begin
+            {rd_narrow_bank_x} <= {rd_narrow_bank_x};  // a later non-blocking assignment wins, so this overrides a BANK_DNC default issued earlier in the calling always block
+            {rd_narrow_bank_y} <= {rd_narrow_bank_y};
         end
     endtask
     
-    task update_rd_narrow_bank;
+    task switch_rd_narrow_bank;  // point both narrow read banks (x and y) at the given bank; the narrow addresses are not assigned here
         input [BANK_ADDR_W -1:0] bank;
         begin
-            {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, rd_narrow_xy_addr_x};
-            {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, rd_narrow_xy_addr_y};
+            {rd_narrow_bank_x} <= {bank};
+            {rd_narrow_bank_y} <= {bank};
         end
     endtask
     
-    task update_rd_wide_addr_next;
-        input [OP_ADDR_W -1:0] addr;
-        rd_wide_xy_addr_xy_next <= addr;
+    task keep_rd_narrow_addr;  // re-assign both narrow read addresses to their current values; the "last" flags are not assigned here
+        begin
+            {rd_narrow_addr_x} <= {rd_narrow_addr_x};  // a later non-blocking assignment wins, so this overrides an OP_ADDR_DNC default issued earlier in the calling always block
+            {rd_narrow_addr_y} <= {rd_narrow_addr_y};
+        end
+    endtask
+    
+    task advance_rd_narrow_addr;  // move the queued address (rd_narrow_addr_next) into both narrow read addresses and register its "last" flags
+        begin
+            {rd_narrow_addr_x} <= {rd_narrow_addr_next};
+            {rd_narrow_addr_y} <= {rd_narrow_addr_next};
+            rd_narrow_addr_is_last      <= rd_narrow_addr_next == word_index_last;       // flag describes the address being loaded in this same clock edge
+            rd_narrow_addr_is_last_half <= rd_narrow_addr_next == word_index_last_half;  // same, against the "half" index
+        end
+    endtask
+    
+    task update_rd_wide_addr_flags;  // register the "is last" / "is last half" flags that accompany rd_wide_addr_next
+        begin
+            rd_wide_addr_next_is_last      <= rd_wide_addr_next == (word_index_last      - 1'b1);  // compares the pre-update value; the callers in this file increment rd_wide_addr_next in the same clock edge, hence the "- 1"
+            rd_wide_addr_next_is_last_half <= rd_wide_addr_next == (word_index_last_half - 1'b1);  // same, against the "half" index
+        end
     endtask
 
-    task update_rd_narrow_addr_next;
-        input [OP_ADDR_W -1:0] addr;
-        rd_narrow_xy_addr_xy_next <= addr;
+    task update_rd_narrow_addr_flags;  // register the "is last" / "is last half" flags that accompany rd_narrow_addr_next
+        begin
+            rd_narrow_addr_next_is_last      <= rd_narrow_addr_next == (word_index_last      - 1'b1);  // compares the pre-update value; the callers in this file increment rd_narrow_addr_next in the same clock edge, hence the "- 1"
+            rd_narrow_addr_next_is_last_half <= rd_narrow_addr_next == (word_index_last_half - 1'b1);  // same, against the "half" index
+        end
     endtask
     
     task advance_rd_wide_addr_next;
-        rd_wide_xy_addr_xy_next <= !rd_wide_xy_addr_xy_next_is_last ? rd_wide_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO;
+        begin  // step the queued wide address by one, wrapping to OP_ADDR_ZERO when the registered "is last" flag is set
+            rd_wide_addr_next <= !rd_wide_addr_next_is_last ? rd_wide_addr_next + 1'b1 : OP_ADDR_ZERO;
+            update_rd_wide_addr_flags;  // refresh both registered flags in the same clock edge
+        end
     endtask
 
     task advance_rd_narrow_addr_next;
-        rd_narrow_xy_addr_xy_next <= !rd_narrow_xy_addr_xy_next_is_last ? rd_narrow_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO;
+        begin  // step the queued narrow address by one, wrapping to OP_ADDR_ZERO when the registered "is last" flag is set
+            rd_narrow_addr_next <= !rd_narrow_addr_next_is_last ? rd_narrow_addr_next + 1'b1 : OP_ADDR_ZERO;
+            update_rd_narrow_addr_flags;  // refresh both registered flags in the same clock edge
+        end
+    endtask 
+
+    task advance_rd_wide_addr_next_half;  // like advance_rd_wide_addr_next, but the wrap to OP_ADDR_ZERO is driven by the "is last half" flag
+        begin
+            rd_wide_addr_next <= !rd_wide_addr_next_is_last_half ? rd_wide_addr_next + 1'b1 : OP_ADDR_ZERO;
+            update_rd_wide_addr_flags;  // refresh both registered flags in the same clock edge
+        end
+    endtask
+
+    task advance_rd_narrow_addr_next_half;  // like advance_rd_narrow_addr_next, but the wrap to OP_ADDR_ZERO is driven by the "is last half" flag
+        begin
+            rd_narrow_addr_next <= !rd_narrow_addr_next_is_last_half ? rd_narrow_addr_next + 1'b1 : OP_ADDR_ZERO;
+            update_rd_narrow_addr_flags;  // refresh both registered flags in the same clock edge
+        end
     endtask 
-    
-    always @(posedge clk)
-        //
-        case (opcode)
-            UOP_OPCODE_MERGE_LH:
-                case (wrk_fsm_state_next_one_pass)
-                    WRK_FSM_STATE_LATENCY_PRE1:
-                        rd_wide_xy_addr_xy_next_last_seen <= 1'b0;
-                    WRK_FSM_STATE_BUSY:
-                        if (!rd_wide_xy_addr_xy_next_last_seen && rd_wide_xy_addr_xy_next_is_last)
-                            rd_wide_xy_addr_xy_next_last_seen <= 1'b1;
-                endcase
-            UOP_OPCODE_REGULAR_ADD_UNEVEN:
-                case (wrk_fsm_state_next_one_pass_meander)
-                    WRK_FSM_STATE_LATENCY_PRE1_M1: begin
-                        rd_wide_xy_addr_xy_next_last_seen      <= 1'b0;
-                        rd_wide_xy_addr_xy_next_last_seen_dly1 <= 1'b0;
-                        rd_wide_xy_addr_xy_next_last_seen_dly2 <= 1'b0;
-                    end
-                    WRK_FSM_STATE_BUSY_M1: begin
-                        if (!rd_wide_xy_addr_xy_next_last_seen && rd_wide_xy_addr_xy_next_is_last)
-                            rd_wide_xy_addr_xy_next_last_seen <= 1'b1;
-                        rd_wide_xy_addr_xy_next_last_seen_dly1 <= rd_wide_xy_addr_xy_next_last_seen;
-                        rd_wide_xy_addr_xy_next_last_seen_dly2 <= rd_wide_xy_addr_xy_next_last_seen_dly1;
-                    end
-                endcase
-        endcase
 
     always @(posedge clk) begin
         //
-        update_rd_wide_bank_addr  (BANK_DNC, OP_ADDR_DNC);
-        update_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC);
+        preset_rd_wide_bank_addr  (BANK_DNC, OP_ADDR_DNC);  // per-cycle defaults (banks, addresses, cleared "last" flags); any later non-blocking assignment in this block overrides them
+        preset_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC);
         //
-        // one_pass
-        //
-        case (wrk_fsm_state_next_one_pass)
+        case (opcode)  // read address sequencing per opcode; each inner case is keyed on wrk_fsm_state_next
             //
-            WRK_FSM_STATE_LATENCY_PRE1:
-                //
-                case (opcode)
-                    //
-                    UOP_OPCODE_PROPAGATE_CARRIES,
-                    UOP_OPCODE_OUTPUT_FROM_NARROW,
-                    UOP_OPCODE_COPY_CRT_Y2X,
-                    UOP_OPCODE_MODULAR_REDUCE_INIT: begin
-                        //
-                        update_rd_wide_bank_addr  (sel_wide_in,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
-                        update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
-                        //
-                    end
-                    //
-                    UOP_OPCODE_MERGE_LH: begin
-                        update_rd_wide_bank_addr  (BANK_WIDE_L,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
-                        update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
-                    end
-                    //
-                endcase
-                //
-            WRK_FSM_STATE_LATENCY_PRE2,
-            WRK_FSM_STATE_BUSY:
+            UOP_OPCODE_PROPAGATE_CARRIES,
+            UOP_OPCODE_OUTPUT_FROM_NARROW,
+            UOP_OPCODE_MODULAR_SUBTRACT_X:  // these opcodes drive only the narrow read bank/address; the wide side keeps the defaults
                 //
-                case (opcode)
-                    //
-                    UOP_OPCODE_PROPAGATE_CARRIES,
-                    UOP_OPCODE_OUTPUT_FROM_NARROW,
-                    UOP_OPCODE_COPY_CRT_Y2X: begin
-                        //
-                        update_rd_wide_bank_addr  (sel_wide_in,   rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
-                        update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                        //
-                    end
-                    //
-                    UOP_OPCODE_MODULAR_REDUCE_INIT: begin
-                        //
-                        update_rd_wide_bank_addr  (sel_wide_in,   rd_wide_xy_addr_xy_next  ); advance_rd_wide_addr_next  ;
-                        update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                        //
-                    end
-                    //
-                    UOP_OPCODE_MERGE_LH: begin
-                        //
-                        if (!rd_wide_xy_addr_xy_next_last_seen) update_rd_wide_bank_addr  (BANK_WIDE_L,   rd_wide_xy_addr_xy_next  ); 
-                        else                                    update_rd_wide_bank_addr  (BANK_WIDE_H,   rd_wide_xy_addr_xy_next  );
-                                                                                                                                      advance_rd_wide_addr_next  ;
-                                                                update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                        //
-                    end
-                    //
+                case (wrk_fsm_state_next)
+                    WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end  // start at word 0 and queue word 1
+                    WRK_FSM_STATE_LATENCY_PRE3,
+                    WRK_FSM_STATE_BUSY1:        begin keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end  // load the queued address, then step the queue
+                    WRK_FSM_STATE_LATENCY_PRE2,
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2:              keep_rd_narrow_bank;  // bank is held; address and flags take the defaults assigned at the top of this block
                 endcase
             //
-        endcase
-        //
-        // one_pass_meander
-        //
-        case (wrk_fsm_state_next_one_pass_meander)
-            //
-            WRK_FSM_STATE_LATENCY_PRE1_M1:
-                case (opcode)
-                    UOP_OPCODE_COPY_LADDERS_X2Y,
-                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                        update_rd_wide_bank_addr  (sel_wide_out,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
-                        update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
-                    end
-                    UOP_OPCODE_REGULAR_ADD_UNEVEN: begin
-                        update_rd_wide_bank_addr  (sel_wide_in, OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
-                        update_rd_narrow_bank_addr(sel_wide_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);                    
-                    end
+            UOP_OPCODE_COPY_CRT_Y2X,
+            UOP_OPCODE_MODULAR_SUBTRACT_Z,
+            UOP_OPCODE_REGULAR_ADD_UNEVEN:  // these opcodes step the wide and narrow read addresses together, from sel_wide_in / sel_narrow_in
+                //
+                case (wrk_fsm_state_next)
+                    WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr  (sel_wide_in,   OP_ADDR_ZERO); preset_rd_wide_addr_next  (OP_ADDR_ONE);
+                                                      preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end
+                    WRK_FSM_STATE_LATENCY_PRE3,
+                    WRK_FSM_STATE_BUSY1:        begin keep_rd_wide_bank;   advance_rd_wide_addr;   advance_rd_wide_addr_next;
+                                                      keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end
+                    WRK_FSM_STATE_LATENCY_PRE2,
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2:        begin keep_rd_wide_bank; keep_rd_narrow_bank; end  // banks held; addresses and flags take the defaults
                 endcase
             //
-            WRK_FSM_STATE_LATENCY_PRE2_M1,
-            WRK_FSM_STATE_BUSY_M1:
-                case (opcode)
-                    UOP_OPCODE_COPY_LADDERS_X2Y,
-                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                        update_rd_wide_bank_addr  (sel_wide_out,   rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
-                        update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                    end
-                    UOP_OPCODE_REGULAR_ADD_UNEVEN: begin
-                        update_rd_wide_bank_addr  (sel_wide_in, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
-                        update_rd_narrow_bank_addr(sel_wide_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                    end
+            UOP_OPCODE_MODULAR_REDUCE_INIT:  // wide bank stays at BANK_DNC (never kept or switched); the wide address queue wraps on the "last half" flag
+                //
+                case (wrk_fsm_state_next)
+                    WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr  (BANK_DNC,      OP_ADDR_ZERO); preset_rd_wide_addr_next  (OP_ADDR_ONE);
+                                                      preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end
+                    WRK_FSM_STATE_LATENCY_PRE3,
+                    WRK_FSM_STATE_BUSY1:        begin                      advance_rd_wide_addr;   advance_rd_wide_addr_next_half;
+                                                      keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end
+                    WRK_FSM_STATE_LATENCY_PRE2,
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2:              keep_rd_narrow_bank;
                 endcase
             //
-            WRK_FSM_STATE_LATENCY_PRE1_M2,
-            WRK_FSM_STATE_LATENCY_PRE2_M2,
-            WRK_FSM_STATE_BUSY_M2:
-                case (opcode)
-                    UOP_OPCODE_COPY_LADDERS_X2Y,
-                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                        update_rd_wide_bank  (sel_wide_in  );
-                        update_rd_narrow_bank(sel_narrow_in);
-                    end
-                    UOP_OPCODE_REGULAR_ADD_UNEVEN: begin
-                        update_rd_wide_bank  (sel_narrow_in);
-                        update_rd_narrow_bank(sel_narrow_in);
-                    end
+            UOP_OPCODE_COPY_LADDERS_X2Y,
+            UOP_OPCODE_CROSS_LADDERS_X2Y:  // each address is read twice: first from the sel_*_in banks, then from the sel_*_out banks
+                //
+                case (wrk_fsm_state_next)
+                    WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr  (sel_wide_in,   OP_ADDR_ZERO); preset_rd_wide_addr_next  (OP_ADDR_ONE);
+                                                      preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end
+                    WRK_FSM_STATE_LATENCY_PRE2: begin switch_rd_wide_bank  (sel_wide_out);   keep_rd_wide_addr;  // same address, now from the *_out banks
+                                                      switch_rd_narrow_bank(sel_narrow_out); keep_rd_narrow_addr; end                                                      
+                    WRK_FSM_STATE_LATENCY_PRE3,
+                    WRK_FSM_STATE_BUSY1:        begin advance_rd_wide_addr;   advance_rd_wide_addr_next;   switch_rd_wide_bank(sel_wide_in);  // next address, back to the *_in banks
+                                                      advance_rd_narrow_addr; advance_rd_narrow_addr_next; switch_rd_narrow_bank(sel_narrow_in); end
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2:        begin keep_rd_wide_addr;   switch_rd_wide_bank  (sel_wide_out);  // same address, now from the *_out banks
+                                                      keep_rd_narrow_addr; switch_rd_narrow_bank(sel_narrow_out); end                                                      
                 endcase
             //
-        endcase
-        //
-        // two_pass
-        //
-        case (wrk_fsm_state_next_two_pass)
-            //
-            WRK_FSM_STATE_LATENCY_PRE1_TP:
+            UOP_OPCODE_MODULAR_SUBTRACT_Y:  // wide reads come from BANK_WIDE_N, narrow reads from sel_narrow_in
                 //
-                case (opcode)
-                    //
-                    UOP_OPCODE_MODULAR_SUBTRACT:
-                        //
-                        if (!wrk_fsm_two_pass_pass) begin
-                            update_rd_wide_bank_addr  (BANK_WIDE_N,    OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
-                            update_rd_narrow_bank_addr(sel_narrow_in,  OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
-                        end else begin
-                            update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
-                        end
-                    //
+                case (wrk_fsm_state_next)
+                    WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr  (BANK_WIDE_N,   OP_ADDR_ZERO); preset_rd_wide_addr_next  (OP_ADDR_ONE);
+                                                      preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end
+                    WRK_FSM_STATE_LATENCY_PRE3,
+                    WRK_FSM_STATE_BUSY1:        begin keep_rd_wide_bank;   advance_rd_wide_addr;   advance_rd_wide_addr_next;
+                                                      keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end
+                    WRK_FSM_STATE_LATENCY_PRE2,
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2:        begin keep_rd_wide_bank; keep_rd_narrow_bank; end
                 endcase
+            //
+            UOP_OPCODE_MERGE_LH:  // wide reads start in BANK_WIDE_L and later switch to BANK_WIDE_H; the narrow bank stays at BANK_DNC
                 //
-            WRK_FSM_STATE_LATENCY_PRE2_TP,
-            WRK_FSM_STATE_LATENCY_PRE3_TP,
-            WRK_FSM_STATE_LATENCY_PRE4_TP,
-            WRK_FSM_STATE_BUSY_TP:
-                //
-                case (opcode)
-                    //
-                    UOP_OPCODE_MODULAR_SUBTRACT:
-                        //
-                        if (!wrk_fsm_two_pass_pass) begin
-                            update_rd_wide_bank_addr  (BANK_WIDE_N,    rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
-                            update_rd_narrow_bank_addr(sel_narrow_in,  rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                        end else begin
-                            update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
-                        end
-                    //
+                case (wrk_fsm_state_next)
+                    WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr  (BANK_WIDE_L, OP_ADDR_ZERO); preset_rd_wide_addr_next  (OP_ADDR_ONE);
+                                                      preset_rd_narrow_bank_addr(BANK_DNC,    OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end
+                    WRK_FSM_STATE_LATENCY_PRE3: begin keep_rd_wide_bank; advance_rd_wide_addr;   advance_rd_wide_addr_next_half;
+                                                                         advance_rd_narrow_addr; advance_rd_narrow_addr_next; end
+                    WRK_FSM_STATE_BUSY1:        begin if (!rd_wide_addr_is_last_half_dly[0]) keep_rd_wide_bank;  // bank switch is triggered by bit 0 of the delayed "last half" flag
+                                                      else                                   switch_rd_wide_bank(BANK_WIDE_H);
+                                                      advance_rd_wide_addr;   advance_rd_wide_addr_next_half;
+                                                      advance_rd_narrow_addr; advance_rd_narrow_addr_next; end
+                    WRK_FSM_STATE_LATENCY_PRE2,
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2: keep_rd_wide_bank;
                 endcase
-                //
+            //
         endcase
         //
     end
@@ -927,13 +685,21 @@ module modexpng_general_worker
     //
     // Destination Write Address Logic
     //
-    
-    wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half;
-    wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half;
-
-    wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_x = uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H;
-    wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_y = uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H;
+    reg                    modular_reduce_init_first_half_x;    // registered: rd_narrow_addr_x_dly[1] is less than or equal to word_index_last_half
+    reg                    modular_reduce_init_first_half_y;    // registered: rd_narrow_addr_y_dly[1] is less than or equal to word_index_last_half
+    reg [BANK_ADDR_W -1:0] modular_reduce_init_sel_wide_out_x;  // registered wide write bank for x: BANK_WIDE_L or BANK_WIDE_H
+    reg [BANK_ADDR_W -1:0] modular_reduce_init_sel_wide_out_y;  // registered wide write bank for y: BANK_WIDE_L or BANK_WIDE_H
 
+    always @(posedge clk) begin
+        // stage 1: the first "<=" is the non-blocking assignment, the second is the less-or-equal comparison
+        modular_reduce_init_first_half_x <= rd_narrow_addr_x_dly[1] <= word_index_last_half;
+        modular_reduce_init_first_half_y <= rd_narrow_addr_y_dly[1] <= word_index_last_half;
+        // stage 2: the bank is picked from the flag value registered on the previous clock edge
+        modular_reduce_init_sel_wide_out_x <= modular_reduce_init_first_half_x ? BANK_WIDE_L : BANK_WIDE_H;
+        modular_reduce_init_sel_wide_out_y <= modular_reduce_init_first_half_y ? BANK_WIDE_L : BANK_WIDE_H;
+        //
+    end
+    
     task update_wr_wide_bank_addr;
         input [BANK_ADDR_W -1:0] x_bank;
         input [BANK_ADDR_W -1:0] y_bank;
@@ -955,120 +721,351 @@ module modexpng_general_worker
             {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {y_bank, y_addr};
         end
     endtask
-    
+
     always @(posedge clk) begin
         //
         update_wr_wide_bank_addr  (BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC);
         update_wr_narrow_bank_addr(BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC);
         //
-        // one_pass
-        //
-        case (wrk_fsm_state)
+        case (opcode)  // write bank/address selection per opcode; every group below assigns only in BUSY1, LATENCY_POST1 and LATENCY_POST3
             //
-            WRK_FSM_STATE_BUSY,
-            WRK_FSM_STATE_LATENCY_POST1,
-            WRK_FSM_STATE_LATENCY_POST2:
+            UOP_OPCODE_PROPAGATE_CARRIES,
+            UOP_OPCODE_MODULAR_SUBTRACT_X,
+            UOP_OPCODE_MERGE_LH,
+            UOP_OPCODE_REGULAR_ADD_UNEVEN:  // narrow-only writers: the wide write bank/address keep the DNC defaults assigned above
                 //
-                case (opcode)
-                    //
-                    UOP_OPCODE_PROPAGATE_CARRIES,
-                    UOP_OPCODE_COPY_CRT_Y2X: begin
-                        update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
-                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
-                    end
-                    //
-                    UOP_OPCODE_MODULAR_REDUCE_INIT:
-                        update_wr_wide_bank_addr(uop_modular_reduce_init_bank_x, uop_modular_reduce_init_bank_y, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_y_dly2);                    
-                    //
-                    UOP_OPCODE_MERGE_LH:
-                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
-                    //
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3: update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_addr_x_dly[3], rd_narrow_addr_y_dly[3]);  // NOTE(review): *_dly[3] is presumably the read address delayed to match the datapath latency - confirm against its declaration
                 endcase
-                //
-        endcase
-        //
-        // one_pass_meander
-        //
-        case (wrk_fsm_state)
             //
-            WRK_FSM_STATE_BUSY_M2,
-            WRK_FSM_STATE_LATENCY_POST1_M2,
-            WRK_FSM_STATE_LATENCY_POST2_M2:
-                //        
-                case (opcode)
-                    UOP_OPCODE_COPY_LADDERS_X2Y,
-                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
-                        update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
-                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
-                    end
-                    UOP_OPCODE_REGULAR_ADD_UNEVEN:
-                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
+            UOP_OPCODE_COPY_CRT_Y2X,
+            UOP_OPCODE_COPY_LADDERS_X2Y,
+            UOP_OPCODE_CROSS_LADDERS_X2Y,
+            UOP_OPCODE_MODULAR_SUBTRACT_Z:  // writers of both the narrow and the wide banks (sel_narrow_out / sel_wide_out)
+                //
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3: begin update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_addr_x_dly[3], rd_narrow_addr_y_dly[3]);
+                                                       update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_wide_addr_x_dly[3],   rd_wide_addr_y_dly[3]  ); end
                 endcase
+            //
+            UOP_OPCODE_MODULAR_REDUCE_INIT:  // wide-only writer; x and y banks come from the registered modular_reduce_init_sel_wide_out_* selectors
+                //
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3: update_wr_wide_bank_addr(modular_reduce_init_sel_wide_out_x, modular_reduce_init_sel_wide_out_y, rd_wide_addr_x_dly[3], rd_wide_addr_y_dly[3]);
+                endcase
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_Y:  // wide-only writer into sel_wide_out
                 //
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3: update_wr_wide_bank_addr(sel_wide_out, sel_wide_out, rd_wide_addr_x_dly[3], rd_wide_addr_y_dly[3]);
+                endcase
+            //
         endcase
         //
-        // two_pass
+    end
+ 
+    
+    //
+    // UOP_OPCODE_PROPAGATE_CARRIES
+    //
+    reg [CARRY_W -1:0] propagate_carries_x_x_cry_r;  // stored carries, one per operand lane (x_x, y_x, x_y, y_y)
+    reg [CARRY_W -1:0] propagate_carries_y_x_cry_r;
+    reg [CARRY_W -1:0] propagate_carries_x_y_cry_r;
+    reg [CARRY_W -1:0] propagate_carries_y_y_cry_r;
+    
+    wire [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry = rd_narrow_x_din_x_dly1 + {{WORD_W{1'b0}}, propagate_carries_x_x_cry_r};  // delayed narrow read word plus the zero-extended stored carry
+    wire [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry = rd_narrow_y_din_x_dly1 + {{WORD_W{1'b0}}, propagate_carries_y_x_cry_r};
+    wire [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry = rd_narrow_x_din_y_dly1 + {{WORD_W{1'b0}}, propagate_carries_x_y_cry_r};
+    wire [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry = rd_narrow_y_din_y_dly1 + {{WORD_W{1'b0}}, propagate_carries_y_y_cry_r};
+    
+    reg [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry_r;  // registered copies of the sums above
+    reg [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry_r;
+    reg [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry_r;
+    reg [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry_r;
+    
+    wire [CARRY_W -1:0] propagate_carries_x_x_w_cry_msb = propagate_carries_x_x_w_cry_r[WORD_EXT_W -1:WORD_W];  // bits above WORD_W of the registered sum (the outgoing carry)
+    wire [CARRY_W -1:0] propagate_carries_y_x_w_cry_msb = propagate_carries_y_x_w_cry_r[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] propagate_carries_x_y_w_cry_msb = propagate_carries_x_y_w_cry_r[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] propagate_carries_y_y_w_cry_msb = propagate_carries_y_y_w_cry_r[WORD_EXT_W -1:WORD_W];
+    
+    wire [WORD_W -1:0] propagate_carries_x_x_w_cry_lsb = propagate_carries_x_x_w_cry_r[WORD_W -1:0];  // low WORD_W bits of the registered sum
+    wire [WORD_W -1:0] propagate_carries_y_x_w_cry_lsb = propagate_carries_y_x_w_cry_r[WORD_W -1:0];
+    wire [WORD_W -1:0] propagate_carries_x_y_w_cry_lsb = propagate_carries_x_y_w_cry_r[WORD_W -1:0];
+    wire [WORD_W -1:0] propagate_carries_y_y_w_cry_lsb = propagate_carries_y_y_w_cry_r[WORD_W -1:0];
+    
+    wire [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_x_x_w_cry_lsb};  // low word zero-extended back to WORD_EXT_W (carry bits removed)
+    wire [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_y_x_w_cry_lsb};
+    wire [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_x_y_w_cry_lsb};
+    wire [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_y_y_w_cry_lsb};
+    
+    task _propagate_carries_update_cry;  // helper: load all four stored carry registers at once
+        input [CARRY_W-1:0]     x_x_cry,                     y_x_cry,                     x_y_cry,                     y_y_cry;  // new carry values, in the same lane order as the registers below
+        {   propagate_carries_x_x_cry_r, propagate_carries_y_x_cry_r, propagate_carries_x_y_cry_r, propagate_carries_y_y_cry_r} <=
+        {                       x_x_cry,                     y_x_cry,                     x_y_cry,                     y_y_cry};
+    endtask
+    
+    task propagate_carries_clear_cry; _propagate_carries_update_cry(                     CARRY_ZERO,                      CARRY_ZERO,                      CARRY_ZERO,                      CARRY_ZERO); endtask  // set all four stored carries to CARRY_ZERO
+    task propagate_carries_store_cry; _propagate_carries_update_cry(propagate_carries_x_x_w_cry_msb, propagate_carries_y_x_w_cry_msb, propagate_carries_x_y_w_cry_msb, propagate_carries_y_y_w_cry_msb); endtask  // capture the upper bits of the registered sums as the new stored carries
+        
+    task _propagate_carries_update_sum_w_cry; // loads the four registered sums together in one non-blocking assignment
+        input [WORD_EXT_W-1:0] x_x_sum_w_cry,                 y_x_sum_w_cry,                 x_y_sum_w_cry,                 y_y_sum_w_cry;
+        {      propagate_carries_x_x_w_cry_r, propagate_carries_y_x_w_cry_r, propagate_carries_x_y_w_cry_r, propagate_carries_y_y_w_cry_r} <=
+        {                      x_x_sum_w_cry,                 y_x_sum_w_cry,                 x_y_sum_w_cry,                 y_y_sum_w_cry}; // inputs map positionally onto the registers listed above
+    endtask
+    
+    task propagate_carries_store_sum_w_cry; _propagate_carries_update_sum_w_cry(propagate_carries_x_x_w_cry, propagate_carries_y_x_w_cry, propagate_carries_x_y_w_cry, propagate_carries_y_y_w_cry); endtask // registers the propagate_carries_*_w_cry values (defined earlier in the file)
+
+    always @(posedge clk) // PROPAGATE_CARRIES register updates, keyed on the worker FSM state
         //
-        case (wrk_fsm_state)
+        if (opcode == UOP_OPCODE_PROPAGATE_CARRIES)
+            // registers hold their values for any other opcode or FSM state
+            case (wrk_fsm_state)
+                // carry registers: cleared in PRE3, reloaded in BUSY1 and POST1
+                WRK_FSM_STATE_LATENCY_PRE3:  propagate_carries_clear_cry;
+                WRK_FSM_STATE_BUSY1,
+                WRK_FSM_STATE_LATENCY_POST1: propagate_carries_store_cry;
+                // sum registers: captured in PRE4, BUSY2 and POST2
+                WRK_FSM_STATE_LATENCY_PRE4,
+                WRK_FSM_STATE_BUSY2,
+                WRK_FSM_STATE_LATENCY_POST2: propagate_carries_store_sum_w_cry;
+                //
+            endcase
+    
+    
+    //
+    // UOP_OPCODE_MODULAR_SUBTRACT_X
+    // UOP_OPCODE_MODULAR_SUBTRACT_Y, UOP_OPCODE_MODULAR_SUBTRACT_Z
+    //
+    reg modular_subtract_x_brw_r; // stored borrow bit for the *_din_x operands
+    reg modular_subtract_y_brw_r; // stored borrow bit for the *_din_y operands
+    
+    reg modular_subtract_x_cry_r; // stored carry bit for the *_din_x operands
+    reg modular_subtract_y_cry_r; // stored carry bit for the *_din_y operands
+
+    wire [WORD_W:0] modular_subtract_x_w_brw = rd_narrow_x_din_x_dly1[WORD_W:0] - rd_narrow_y_din_x_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_x_brw_r}; // (WORD_W+1)-bit difference: narrow_x - narrow_y - stored borrow
+    wire [WORD_W:0] modular_subtract_y_w_brw = rd_narrow_x_din_y_dly1[WORD_W:0] - rd_narrow_y_din_y_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_y_brw_r};
+
+    wire [WORD_W:0] modular_subtract_x_w_cry = rd_narrow_x_din_x_dly1[WORD_W:0] + rd_wide_x_din_x_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_x_cry_r}; // (WORD_W+1)-bit sum: narrow_x + wide_x + stored carry
+    wire [WORD_W:0] modular_subtract_y_w_cry = rd_narrow_x_din_y_dly1[WORD_W:0] + rd_wide_x_din_y_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_y_cry_r}; // carry-in must be the Y carry register (was the Y borrow register), mirroring the X lane
+
+    reg [WORD_W:0] modular_subtract_x_w_brw_r; // registered differences (bit WORD_W is the borrow-out)
+    reg [WORD_W:0] modular_subtract_y_w_brw_r;
+
+    reg [WORD_W:0] modular_subtract_x_w_cry_r; // registered sums (bit WORD_W is the carry-out)
+    reg [WORD_W:0] modular_subtract_y_w_cry_r;
+    
+    wire modular_subtract_x_w_brw_msb = modular_subtract_x_w_brw_r[WORD_W]; // borrow-out, latched by modular_subtract_store_brw
+    wire modular_subtract_y_w_brw_msb = modular_subtract_y_w_brw_r[WORD_W];
+
+    wire modular_subtract_x_w_cry_msb = modular_subtract_x_w_cry_r[WORD_W]; // carry-out, latched by modular_subtract_store_cry
+    wire modular_subtract_y_w_cry_msb = modular_subtract_y_w_cry_r[WORD_W];
+    
+    wire [WORD_W -1:0] modular_subtract_x_w_brw_lsb = modular_subtract_x_w_brw_r[WORD_W -1:0]; // lower WORD_W bits of the registered differences
+    wire [WORD_W -1:0] modular_subtract_y_w_brw_lsb = modular_subtract_y_w_brw_r[WORD_W -1:0];
+
+    wire [WORD_W -1:0] modular_subtract_x_w_cry_lsb = modular_subtract_x_w_cry_r[WORD_W -1:0]; // lower WORD_W bits of the registered sums
+    wire [WORD_W -1:0] modular_subtract_y_w_cry_lsb = modular_subtract_y_w_cry_r[WORD_W -1:0];
+
+    wire [WORD_EXT_W -1:0] modular_subtract_x_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_brw_lsb}; // difference word zero-padded to WORD_EXT_W bits
+    wire [WORD_EXT_W -1:0] modular_subtract_y_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_brw_lsb};
+
+    wire [WORD_EXT_W -1:0] modular_subtract_x_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_cry_lsb}; // sum word zero-padded to WORD_EXT_W bits
+    wire [WORD_EXT_W -1:0] modular_subtract_y_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_cry_lsb};
+    
+    reg  [WORD_EXT_W -1:0] modular_subtract_x_mux; // selected operand word, written in the MODULAR_SUBTRACT_Z branch
+    reg  [WORD_EXT_W -1:0] modular_subtract_y_mux;
+    
+    wire [WORD_EXT_W -1:0] modular_subtract_x_mux_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_mux[WORD_W-1:0]}; // selected word with the upper CARRY_W bits forced to zero
+    wire [WORD_EXT_W -1:0] modular_subtract_y_mux_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_mux[WORD_W-1:0]};
+    
+    task _modular_subtract_update_brw; // writes both stored borrow bits on the same clock edge
+        input x_brw; input y_brw;
+        begin modular_subtract_x_brw_r <= x_brw; modular_subtract_y_brw_r <= y_brw; end
+    endtask
+    
+    task _modular_subtract_update_cry; // writes both stored carry bits on the same clock edge
+        input x_cry; input y_cry;
+        begin modular_subtract_x_cry_r <= x_cry; modular_subtract_y_cry_r <= y_cry; end
+    endtask
+    
+    task modular_subtract_clear_brw; _modular_subtract_update_brw(                        1'b0,                         1'b0); endtask // both borrows := 0
+    task modular_subtract_store_brw; _modular_subtract_update_brw(modular_subtract_x_w_brw_msb, modular_subtract_y_w_brw_msb); endtask // both borrows := borrow-out of the registered differences
+
+    task modular_subtract_clear_cry; _modular_subtract_update_cry(                        1'b0,                         1'b0); endtask // both carries := 0
+    task modular_subtract_store_cry; _modular_subtract_update_cry(modular_subtract_x_w_cry_msb, modular_subtract_y_w_cry_msb); endtask // both carries := carry-out of the registered sums
+    
+    task _modular_subtract_update_diff_w_brw; // writes both registered differences on the same clock edge
+        input [WORD_W:0] x_diff_w_brw; input [WORD_W:0] y_diff_w_brw;
+        begin modular_subtract_x_w_brw_r <= x_diff_w_brw; modular_subtract_y_w_brw_r <= y_diff_w_brw; end
+    endtask
+
+    task _modular_subtract_update_sum_w_cry; // writes both registered sums on the same clock edge
+        input [WORD_W:0] x_sum_w_cry; input [WORD_W:0] y_sum_w_cry;
+        begin modular_subtract_x_w_cry_r <= x_sum_w_cry; modular_subtract_y_w_cry_r <= y_sum_w_cry; end
+    endtask
+    
+    task modular_subtract_store_diff_w_brw; _modular_subtract_update_diff_w_brw(modular_subtract_x_w_brw, modular_subtract_y_w_brw); endtask // registers the combinational differences
+
+    task modular_subtract_store_sum_w_cry; _modular_subtract_update_sum_w_cry(modular_subtract_x_w_cry, modular_subtract_y_w_cry); endtask // registers the combinational sums
+    
+    always @(posedge clk) // MODULAR_SUBTRACT_X/_Y/_Z register updates, keyed on opcode and then on FSM state
+        // no default branch: the registers hold for any other opcode or state
+        case (opcode)
             //
-            WRK_FSM_STATE_BUSY_TP,
-            WRK_FSM_STATE_LATENCY_POST1_TP,
-            WRK_FSM_STATE_LATENCY_POST2_TP,
-            WRK_FSM_STATE_LATENCY_POST3_TP,
-            WRK_FSM_STATE_LATENCY_POST4_TP:
+            UOP_OPCODE_MODULAR_SUBTRACT_X: // borrow chain (narrow_x - narrow_y)
                 //
-                case (opcode)
+                case (wrk_fsm_state)
                     //
-                    UOP_OPCODE_MODULAR_SUBTRACT:
-                        //
-                        if (!wrk_fsm_two_pass_pass) begin
-                            update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);                     
-                        end else begin
-                            update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
-                            update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
-                        end 
+                    WRK_FSM_STATE_LATENCY_PRE3:  modular_subtract_clear_brw;
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3: modular_subtract_store_brw; // we need the very last borrow here too!
+                    // differences are registered in PRE4, BUSY2 and POST2
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_diff_w_brw;
+                    //
+                endcase
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_Y: // carry chain (narrow_x + wide_x)
+                //
+                case (wrk_fsm_state)
+                    // carries: cleared in PRE3, reloaded in BUSY1 and POST1
+                    WRK_FSM_STATE_LATENCY_PRE3:  modular_subtract_clear_cry;
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1: modular_subtract_store_cry;
+                    // sums are registered in PRE4, BUSY2 and POST2
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_sum_w_cry;
+                    //
+                endcase
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_Z: // per-lane selection driven by the stored borrow bits
+                //
+                case (wrk_fsm_state)
+                    // borrow clear: take the rd_narrow_x word; borrow set: take the rd_wide_x word
+                    WRK_FSM_STATE_LATENCY_PRE4,
+                    WRK_FSM_STATE_BUSY2,
+                    WRK_FSM_STATE_LATENCY_POST2:
                         //
-                    endcase
+                        begin modular_subtract_x_mux <= !modular_subtract_x_brw_r ? rd_narrow_x_din_x_dly1 : rd_wide_x_din_x_dly1;
+                              modular_subtract_y_mux <= !modular_subtract_y_brw_r ? rd_narrow_x_din_y_dly1 : rd_wide_x_din_y_dly1; end
+                    //
+                endcase            
+            //
+        endcase
+
+
+    //
+    // UOP_OPCODE_REGULAR_ADD_UNEVEN
+    //
+    reg [CARRY_W -1:0] regular_add_uneven_x_x_cry_r; // stored carries, one per operand lane
+    reg [CARRY_W -1:0] regular_add_uneven_y_x_cry_r;
+    reg [CARRY_W -1:0] regular_add_uneven_x_y_cry_r;
+    reg [CARRY_W -1:0] regular_add_uneven_y_y_cry_r;
+    
+    wire [WORD_EXT_W -1:0] regular_add_uneven_x_x_msb_w_cry = rd_narrow_x_din_x_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_x_x_cry_r}; // "msb" variant: narrow word + stored carry only
+    wire [WORD_EXT_W -1:0] regular_add_uneven_y_x_msb_w_cry = rd_narrow_y_din_x_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_y_x_cry_r};
+    wire [WORD_EXT_W -1:0] regular_add_uneven_x_y_msb_w_cry = rd_narrow_x_din_y_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_x_y_cry_r};
+    wire [WORD_EXT_W -1:0] regular_add_uneven_y_y_msb_w_cry = rd_narrow_y_din_y_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_y_y_cry_r};
+    
+    wire [WORD_EXT_W -1:0] regular_add_uneven_x_x_lsb_w_cry = regular_add_uneven_x_x_msb_w_cry + rd_wide_x_din_x_dly1; // "lsb" variant: narrow word + stored carry + wide word
+    wire [WORD_EXT_W -1:0] regular_add_uneven_y_x_lsb_w_cry = regular_add_uneven_y_x_msb_w_cry + rd_wide_y_din_x_dly1;
+    wire [WORD_EXT_W -1:0] regular_add_uneven_x_y_lsb_w_cry = regular_add_uneven_x_y_msb_w_cry + rd_wide_x_din_y_dly1;
+    wire [WORD_EXT_W -1:0] regular_add_uneven_y_y_lsb_w_cry = regular_add_uneven_y_y_msb_w_cry + rd_wide_y_din_y_dly1;
+    
+    reg [WORD_EXT_W -1:0] regular_add_uneven_x_x_w_cry_r; // registered sums (either variant, chosen by the clocked process)
+    reg [WORD_EXT_W -1:0] regular_add_uneven_y_x_w_cry_r;
+    reg [WORD_EXT_W -1:0] regular_add_uneven_x_y_w_cry_r;
+    reg [WORD_EXT_W -1:0] regular_add_uneven_y_y_w_cry_r;
+    
+    wire [CARRY_W -1:0] regular_add_uneven_x_x_w_cry_msb = regular_add_uneven_x_x_w_cry_r[WORD_EXT_W -1:WORD_W]; // upper CARRY_W bits of each registered sum; latched by regular_add_uneven_store_cry
+    wire [CARRY_W -1:0] regular_add_uneven_y_x_w_cry_msb = regular_add_uneven_y_x_w_cry_r[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] regular_add_uneven_x_y_w_cry_msb = regular_add_uneven_x_y_w_cry_r[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] regular_add_uneven_y_y_w_cry_msb = regular_add_uneven_y_y_w_cry_r[WORD_EXT_W -1:WORD_W];
+    
+    wire [WORD_W -1:0] regular_add_uneven_x_x_w_cry_lsb = regular_add_uneven_x_x_w_cry_r[WORD_W -1:0]; // lower WORD_W bits of each registered sum
+    wire [WORD_W -1:0] regular_add_uneven_y_x_w_cry_lsb = regular_add_uneven_y_x_w_cry_r[WORD_W -1:0];
+    wire [WORD_W -1:0] regular_add_uneven_x_y_w_cry_lsb = regular_add_uneven_x_y_w_cry_r[WORD_W -1:0];
+    wire [WORD_W -1:0] regular_add_uneven_y_y_w_cry_lsb = regular_add_uneven_y_y_w_cry_r[WORD_W -1:0];
+    
+    wire [WORD_EXT_W -1:0] regular_add_uneven_x_x_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_x_x_w_cry_lsb}; // lower word zero-padded back to WORD_EXT_W bits
+    wire [WORD_EXT_W -1:0] regular_add_uneven_y_x_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_y_x_w_cry_lsb};
+    wire [WORD_EXT_W -1:0] regular_add_uneven_x_y_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_x_y_w_cry_lsb};
+    wire [WORD_EXT_W -1:0] regular_add_uneven_y_y_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_y_y_w_cry_lsb};
+    
+    reg regular_add_uneven_store_lsb_now; // 1: BUSY2 registers the "lsb" sums, 0: BUSY2 registers the "msb" sums
+    
+    task _regular_add_uneven_update_cry; // writes all four stored carries on the same clock edge
+        input [CARRY_W-1:0] x_x_cry; input [CARRY_W-1:0] y_x_cry; input [CARRY_W-1:0] x_y_cry; input [CARRY_W-1:0] y_y_cry;
+        begin regular_add_uneven_x_x_cry_r <= x_x_cry; regular_add_uneven_y_x_cry_r <= y_x_cry;
+              regular_add_uneven_x_y_cry_r <= x_y_cry; regular_add_uneven_y_y_cry_r <= y_y_cry; end
+    endtask
+    
+    task regular_add_uneven_clear_cry; _regular_add_uneven_update_cry(                      CARRY_ZERO,                       CARRY_ZERO,                       CARRY_ZERO,                       CARRY_ZERO); endtask // all four carries := CARRY_ZERO
+    task regular_add_uneven_store_cry; _regular_add_uneven_update_cry(regular_add_uneven_x_x_w_cry_msb, regular_add_uneven_y_x_w_cry_msb, regular_add_uneven_x_y_w_cry_msb, regular_add_uneven_y_y_w_cry_msb); endtask // all four carries := carry bits of the registered sums
+        
+    task _regular_add_uneven_update_sum_w_cry; // writes all four registered sums on the same clock edge
+        input [WORD_EXT_W-1:0] x_x_sum_w_cry; input [WORD_EXT_W-1:0] y_x_sum_w_cry; input [WORD_EXT_W-1:0] x_y_sum_w_cry; input [WORD_EXT_W-1:0] y_y_sum_w_cry;
+        begin regular_add_uneven_x_x_w_cry_r <= x_x_sum_w_cry; regular_add_uneven_y_x_w_cry_r <= y_x_sum_w_cry;
+              regular_add_uneven_x_y_w_cry_r <= x_y_sum_w_cry; regular_add_uneven_y_y_w_cry_r <= y_y_sum_w_cry; end
+    endtask
+    
+    task regular_add_uneven_store_sum_lsb_w_cry; _regular_add_uneven_update_sum_w_cry(regular_add_uneven_x_x_lsb_w_cry, regular_add_uneven_y_x_lsb_w_cry, regular_add_uneven_x_y_lsb_w_cry, regular_add_uneven_y_y_lsb_w_cry); endtask // registers narrow + carry + wide
+    
+    task regular_add_uneven_store_sum_msb_w_cry; _regular_add_uneven_update_sum_w_cry(regular_add_uneven_x_x_msb_w_cry, regular_add_uneven_y_x_msb_w_cry, regular_add_uneven_x_y_msb_w_cry, regular_add_uneven_y_y_msb_w_cry); endtask // registers narrow + carry only
+
+    always @(posedge clk) // tracks which sum variant BUSY2 should register
+        // NOTE(review): unlike the PROPAGATE_CARRIES process this one is not qualified by opcode - confirm that is intended
+           case (wrk_fsm_state)
+                // set in PRE3, dropped in BUSY1 once rd_wide_addr_is_last_half_dly[3] is seen
+                WRK_FSM_STATE_LATENCY_PRE3: regular_add_uneven_store_lsb_now <= 1'b1;
+                WRK_FSM_STATE_BUSY1: if (rd_wide_addr_is_last_half_dly[3]) regular_add_uneven_store_lsb_now <= 1'b0;         
                 //
             endcase
+            
+    always @(posedge clk) // REGULAR_ADD_UNEVEN carry and sum register updates, keyed on the worker FSM state
         //
-    end
+           case (wrk_fsm_state)
+                // carries: cleared in PRE3, reloaded in BUSY1 and POST1 (NOTE(review): no opcode qualifier here - confirm intended)
+                WRK_FSM_STATE_LATENCY_PRE3:  regular_add_uneven_clear_cry;
+                WRK_FSM_STATE_BUSY1,
+                WRK_FSM_STATE_LATENCY_POST1: regular_add_uneven_store_cry;
+                // sums: "lsb" variant in PRE4, flag-selected variant in BUSY2, "msb" variant in POST2
+                WRK_FSM_STATE_LATENCY_PRE4:                                        regular_add_uneven_store_sum_lsb_w_cry;
+                WRK_FSM_STATE_BUSY2:         if (regular_add_uneven_store_lsb_now) regular_add_uneven_store_sum_lsb_w_cry;
+                                             else                                  regular_add_uneven_store_sum_msb_w_cry;
+                WRK_FSM_STATE_LATENCY_POST2:                                       regular_add_uneven_store_sum_msb_w_cry;
+                //
+            endcase
 
 
     //
     // FSM Process
     //
-
     always @(posedge clk or negedge rst_n)
         //
         if (!rst_n) wrk_fsm_state <= WRK_FSM_STATE_IDLE;
-        else case (opcode)
-            UOP_OPCODE_PROPAGATE_CARRIES,
-            UOP_OPCODE_OUTPUT_FROM_NARROW,
-            UOP_OPCODE_COPY_CRT_Y2X,
-            UOP_OPCODE_MODULAR_REDUCE_INIT,
-            UOP_OPCODE_MERGE_LH:            wrk_fsm_state <= wrk_fsm_state_next_one_pass;
-            UOP_OPCODE_COPY_LADDERS_X2Y,
-            UOP_OPCODE_CROSS_LADDERS_X2Y,
-            UOP_OPCODE_REGULAR_ADD_UNEVEN:   wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander;
-            UOP_OPCODE_MODULAR_SUBTRACT:  wrk_fsm_state <= wrk_fsm_state_next_two_pass;
-            default:                        wrk_fsm_state <= WRK_FSM_STATE_IDLE;
-        endcase
-    
-  
+        else        wrk_fsm_state <= wrk_fsm_state_next; // single next-state function for every opcode (asynchronous active-low reset above)
+
+
     //
     // Busy Exit Logic
-    //
-    
-    reg wrk_fsm_done_one_pass         = 1'b0;
-    reg wrk_fsm_done_one_pass_meander = 1'b0;
-    reg wrk_fsm_done_two_pass         = 1'b0;
+    //    
+    reg wrk_fsm_done = 1'b0; // exit flag read by the next-state logic in BUSY2
     
     always @(posedge clk) begin
         //
-        wrk_fsm_done_one_pass         <= 1'b0;
-        wrk_fsm_done_one_pass_meander <= 1'b0;
-        wrk_fsm_done_two_pass         <= 1'b0;
+        wrk_fsm_done <= 1'b0; // default: deasserted every cycle unless overridden below
         //
         case (opcode)
             //
@@ -1076,47 +1073,22 @@ module modexpng_general_worker
             UOP_OPCODE_OUTPUT_FROM_NARROW,
             UOP_OPCODE_COPY_CRT_Y2X,
             UOP_OPCODE_MODULAR_REDUCE_INIT,
-            UOP_OPCODE_MERGE_LH:
-                //
-                case (wrk_fsm_state)
-                    WRK_FSM_STATE_BUSY:
-                        if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass <= 1'b1;
-                endcase
-                //
             UOP_OPCODE_COPY_LADDERS_X2Y,
             UOP_OPCODE_CROSS_LADDERS_X2Y,
+            UOP_OPCODE_MODULAR_SUBTRACT_X,
+            UOP_OPCODE_MODULAR_SUBTRACT_Y,
+            UOP_OPCODE_MODULAR_SUBTRACT_Z,
+            UOP_OPCODE_MERGE_LH,
             UOP_OPCODE_REGULAR_ADD_UNEVEN:
                 //
                 case (wrk_fsm_state)
-                    WRK_FSM_STATE_BUSY_M2:
-                        if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1;
-                    WRK_FSM_STATE_BUSY_M1:
-                        wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander;
+                    WRK_FSM_STATE_BUSY1: // the flag is only ever raised from BUSY1
+                        if (rd_narrow_addr_is_last) wrk_fsm_done <= 1'b1; // asserted for the following cycle when the last-address flag is set
                 endcase
-                //
-            UOP_OPCODE_MODULAR_SUBTRACT:
-                //
-                case (wrk_fsm_state)
-                    WRK_FSM_STATE_BUSY_TP:
-                        if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_two_pass <= 1'b1;
-                endcase
-                //
             //
         endcase
         //
     end
-    
-    
-    //
-    // FSM Helper Logic
-    //    
-    always @(posedge clk)
-        //
-        case (wrk_fsm_state)
-            WRK_FSM_STATE_IDLE: if (ena)    {wrk_fsm_two_pass_pass, wrk_fsm_two_pass_pass_dly} <= {1'b0, 1'b0};
-            WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_two_pass_pass <= 1'b1;
-            WRK_FSM_STATE_HOLDOFF_TP:       wrk_fsm_two_pass_pass_dly <= 1'b1;
-        endcase
 
 
     //
@@ -1125,64 +1097,26 @@ module modexpng_general_worker
     always @* begin
         //
         case (wrk_fsm_state)
-            WRK_FSM_STATE_IDLE:          wrk_fsm_state_next_one_pass = ena                   ? WRK_FSM_STATE_LATENCY_PRE1  : WRK_FSM_STATE_IDLE ;
-            WRK_FSM_STATE_LATENCY_PRE1:  wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_LATENCY_PRE2  ;
-            WRK_FSM_STATE_LATENCY_PRE2:  wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_BUSY          ;
-            WRK_FSM_STATE_BUSY:          wrk_fsm_state_next_one_pass = wrk_fsm_done_one_pass ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY ;
-            WRK_FSM_STATE_LATENCY_POST1: wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_LATENCY_POST2 ;
-            WRK_FSM_STATE_LATENCY_POST2: wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_STOP          ;
-            WRK_FSM_STATE_STOP:          wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_IDLE          ;
-            default:                     wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_IDLE          ;
-        endcase
-        //
-    end
-    
-    always @* begin
-        //
-        case (wrk_fsm_state)
-            WRK_FSM_STATE_IDLE:             wrk_fsm_state_next_one_pass_meander = ena                           ? WRK_FSM_STATE_LATENCY_PRE1_M1  : WRK_FSM_STATE_IDLE    ;           
-            //
-            WRK_FSM_STATE_LATENCY_PRE1_M1:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_PRE1_M2  ;
-            WRK_FSM_STATE_LATENCY_PRE1_M2:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_PRE2_M1  ;
-            WRK_FSM_STATE_LATENCY_PRE2_M1:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_PRE2_M2  ;
-            WRK_FSM_STATE_LATENCY_PRE2_M2:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_BUSY_M1          ;
-            WRK_FSM_STATE_BUSY_M1:          wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_BUSY_M2          ;
-            WRK_FSM_STATE_BUSY_M2:          wrk_fsm_state_next_one_pass_meander = wrk_fsm_done_one_pass_meander ? WRK_FSM_STATE_LATENCY_POST1_M1 : WRK_FSM_STATE_BUSY_M1 ;
-            WRK_FSM_STATE_LATENCY_POST1_M1: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_POST1_M2 ;
-            WRK_FSM_STATE_LATENCY_POST1_M2: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_POST2_M1 ;
-            WRK_FSM_STATE_LATENCY_POST2_M1: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_POST2_M2 ;
-            WRK_FSM_STATE_LATENCY_POST2_M2: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_STOP             ;
-            //
-            WRK_FSM_STATE_STOP:             wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_IDLE             ;
-            //
-            default:                        wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_IDLE             ;
-        endcase
-        //
-    end
-    
-    always @* begin
-        //
-        case (wrk_fsm_state)
-            WRK_FSM_STATE_IDLE:             wrk_fsm_state_next_two_pass = ena                       ? WRK_FSM_STATE_LATENCY_PRE1_TP  : WRK_FSM_STATE_IDLE;
-            WRK_FSM_STATE_LATENCY_PRE1_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_PRE2_TP  ;
-            WRK_FSM_STATE_LATENCY_PRE2_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_PRE3_TP  ;
-            WRK_FSM_STATE_LATENCY_PRE3_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_PRE4_TP  ;
-            WRK_FSM_STATE_LATENCY_PRE4_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_BUSY_TP          ;
-            WRK_FSM_STATE_BUSY_TP:          wrk_fsm_state_next_two_pass = wrk_fsm_done_two_pass ?     WRK_FSM_STATE_LATENCY_POST1_TP : WRK_FSM_STATE_BUSY_TP;
-            WRK_FSM_STATE_LATENCY_POST1_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_POST2_TP ;
-            WRK_FSM_STATE_LATENCY_POST2_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_POST3_TP ;
-            WRK_FSM_STATE_LATENCY_POST3_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_POST4_TP ;
-            WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_HOLDOFF_TP       ;
-            WRK_FSM_STATE_HOLDOFF_TP:       wrk_fsm_state_next_two_pass = wrk_fsm_two_pass_pass_dly ? WRK_FSM_STATE_STOP             : WRK_FSM_STATE_LATENCY_PRE1_TP; 
-            WRK_FSM_STATE_STOP:             wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_IDLE             ;
-            default:                        wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_IDLE             ;
+            WRK_FSM_STATE_IDLE:          wrk_fsm_state_next = ena          ? WRK_FSM_STATE_LATENCY_PRE1  : WRK_FSM_STATE_IDLE  ; // stay idle until ena is high
+            WRK_FSM_STATE_LATENCY_PRE1:  wrk_fsm_state_next =                WRK_FSM_STATE_LATENCY_PRE2  ; // PRE1..PRE4 advance unconditionally
+            WRK_FSM_STATE_LATENCY_PRE2:  wrk_fsm_state_next =                WRK_FSM_STATE_LATENCY_PRE3  ;
+            WRK_FSM_STATE_LATENCY_PRE3:  wrk_fsm_state_next =                WRK_FSM_STATE_LATENCY_PRE4  ;
+            WRK_FSM_STATE_LATENCY_PRE4:  wrk_fsm_state_next =                WRK_FSM_STATE_BUSY1         ;
+            WRK_FSM_STATE_BUSY1:         wrk_fsm_state_next =                WRK_FSM_STATE_BUSY2         ; // BUSY1 is always followed by BUSY2
+            WRK_FSM_STATE_BUSY2:         wrk_fsm_state_next = wrk_fsm_done ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY1 ; // loop back to BUSY1 until wrk_fsm_done is set
+            WRK_FSM_STATE_LATENCY_POST1: wrk_fsm_state_next =                WRK_FSM_STATE_LATENCY_POST2 ; // POST1..POST4 advance unconditionally
+            WRK_FSM_STATE_LATENCY_POST2: wrk_fsm_state_next =                WRK_FSM_STATE_LATENCY_POST3 ;
+            WRK_FSM_STATE_LATENCY_POST3: wrk_fsm_state_next =                WRK_FSM_STATE_LATENCY_POST4 ;
+            WRK_FSM_STATE_LATENCY_POST4: wrk_fsm_state_next =                WRK_FSM_STATE_STOP          ;
+            WRK_FSM_STATE_STOP:          wrk_fsm_state_next =                WRK_FSM_STATE_IDLE          ;
+            default:                     wrk_fsm_state_next =                WRK_FSM_STATE_IDLE          ; // any unlisted state encoding returns to idle
         endcase
         //
     end
-    
-    
+
+
     //
-    // Ready Logic
+    // Ready Flag Logic
     //
     reg rdy_reg = 1'b1;
     
@@ -1198,321 +1132,167 @@ module modexpng_general_worker
 
 
     //
-    // UOP_OPCODE_PROPAGATE_CARRIES
+    // Source to Destination Data Logic
     //
-    reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r;
-    reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r;
-    reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r;
-    reg [CARRY_W -1:0] rd_narrow_y_din_y_cry_r;
-    
-    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r};
-    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry = wrk_rd_narrow_y_din_x + {{WORD_W{1'b0}}, rd_narrow_y_din_x_cry_r};
-    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r};
-    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r};
-    
-    wire [CARRY_W -1:0] rd_narrow_x_din_x_w_cry_msb = rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W];
-    wire [CARRY_W -1:0] rd_narrow_y_din_x_w_cry_msb = rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W];
-    wire [CARRY_W -1:0] rd_narrow_x_din_y_w_cry_msb = rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W];
-    wire [CARRY_W -1:0] rd_narrow_y_din_y_w_cry_msb = rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W];
-    
-    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]};
-    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]};
-    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]};
-    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]};
-    
+    reg [WORD_EXT_W -1:0] rd_wide_x_din_x_dly2; // one-clock-delayed copies of the rd_wide_*_dly1 words
+    reg [WORD_EXT_W -1:0] rd_wide_y_din_x_dly2;
+    reg [WORD_EXT_W -1:0] rd_wide_x_din_y_dly2;
+    reg [WORD_EXT_W -1:0] rd_wide_y_din_y_dly2;
+    reg [WORD_EXT_W -1:0] rd_narrow_x_din_x_dly2; // one-clock-delayed copies of the rd_narrow_*_dly1 words
+    reg [WORD_EXT_W -1:0] rd_narrow_y_din_x_dly2;
+    reg [WORD_EXT_W -1:0] rd_narrow_x_din_y_dly2;
+    reg [WORD_EXT_W -1:0] rd_narrow_y_din_y_dly2;
+
+    always @(posedge clk) begin // unconditional: updated every clock, no enable or reset
+        {rd_wide_x_din_x_dly2,   rd_wide_y_din_x_dly2,   rd_wide_x_din_y_dly2,   rd_wide_y_din_y_dly2  } <= {rd_wide_x_din_x_dly1,   rd_wide_y_din_x_dly1,   rd_wide_x_din_y_dly1,   rd_wide_y_din_y_dly1  };
+        {rd_narrow_x_din_x_dly2, rd_narrow_y_din_x_dly2, rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2} <= {rd_narrow_x_din_x_dly1, rd_narrow_y_din_x_dly1, rd_narrow_x_din_y_dly1, rd_narrow_y_din_y_dly1}; 
+    end
+
     task update_wide_dout;
         input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y;
         {wr_wide_x_dout_x, wr_wide_y_dout_x, wr_wide_x_dout_y, wr_wide_y_dout_y} <=
-        {        x_x,              y_x,              x_y,              y_y     };
+        {             x_x,              y_x,              x_y,              y_y}; // inputs map positionally onto the four wr_wide_*_dout_* targets (non-blocking)
     endtask
     
     task update_narrow_dout;
         input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y;
         {wr_narrow_x_dout_x, wr_narrow_y_dout_x, wr_narrow_x_dout_y, wr_narrow_y_dout_y} <=
-        {          x_x,                y_x,                x_y,                y_y     };
-    endtask
-    
-    task update_narrow_carries;
-        input [CARRY_W-1:0] x_x_cry, y_x_cry, x_y_cry, y_y_cry;
-        {rd_narrow_x_din_x_cry_r, rd_narrow_y_din_x_cry_r, rd_narrow_x_din_y_cry_r, rd_narrow_y_din_y_cry_r} <=
-        {          x_x_cry,                 y_x_cry,                 x_y_cry,                 y_y_cry      };
+        {               x_x,                y_x,                x_y,                y_y}; // inputs map positionally onto the four wr_narrow_*_dout_* targets (non-blocking)
     endtask
-    
-    always @(posedge clk)
-        //
-        if (opcode == UOP_OPCODE_PROPAGATE_CARRIES)
-            //
-            case (wrk_fsm_state)
-                //
-                WRK_FSM_STATE_LATENCY_PRE2:
-                    //
-                    update_narrow_carries(CARRY_ZERO, CARRY_ZERO, CARRY_ZERO, CARRY_ZERO);
-                //
-                WRK_FSM_STATE_BUSY,
-                WRK_FSM_STATE_LATENCY_POST1:
-                    //
-                    update_narrow_carries(rd_narrow_x_din_x_w_cry_msb,
-                                          rd_narrow_y_din_x_w_cry_msb,
-                                          rd_narrow_x_din_y_w_cry_msb,
-                                          rd_narrow_y_din_y_w_cry_msb);
-                //
-            endcase
-
-
-    //
-    // UOP_OPCODE_MODULAR_SUBTRACT
-    //
-    
-    reg [WORD_W:0] modsub_x_ab; 
-    reg [WORD_W:0] modsub_y_ab;
 
-    reg [WORD_W:0] modsub_x_ab_dly; 
-    reg [WORD_W:0] modsub_y_ab_dly;
-
-    reg [WORD_W:0] modsub_x_abn; 
-    reg [WORD_W:0] modsub_y_abn;    
-    
-    reg            modsub_x_ab_mask_now;
-    reg            modsub_y_ab_mask_now;
-
-    reg            modsub_x_abn_mask_now;
-    reg            modsub_y_abn_mask_now;
-
-    reg            modsub_x_borrow_r;
-    reg            modsub_y_borrow_r;
-    
-    wire           modsub_x_ab_masked = modsub_x_ab_mask_now ? 1'b0 : modsub_x_ab[WORD_W];
-    wire           modsub_y_ab_masked = modsub_y_ab_mask_now ? 1'b0 : modsub_y_ab[WORD_W];
-
-    wire           modsub_x_abn_masked = modsub_x_abn_mask_now ? 1'b0 : modsub_x_abn[WORD_W];  
-    wire           modsub_y_abn_masked = modsub_y_abn_mask_now ? 1'b0 : modsub_y_abn[WORD_W];
-
-    wire [WORD_W:0] modsub_x_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
-    wire [WORD_W:0] modsub_y_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_y_din_x[WORD_W-1:0]};
-    wire [WORD_W:0] modsub_x_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
-    wire [WORD_W:0] modsub_y_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_y_din_y[WORD_W-1:0]};
-    
-    wire [WORD_W:0] modsub_x_wide_x_lsb_pad = {1'b0, wrk_rd_wide_x_din_x_dly1[WORD_W-1:0]};
-    wire [WORD_W:0] modsub_x_wide_y_lsb_pad = {1'b0, wrk_rd_wide_x_din_y_dly1[WORD_W-1:0]};
-    
-    wire [WORD_EXT_W -1:0] modsub_x_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_x_ab_dly[WORD_W-1:0]};  
-    wire [WORD_EXT_W -1:0] modsub_y_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_y_ab_dly[WORD_W-1:0]};
-
-    wire [WORD_EXT_W -1:0] modsub_x_abn_trunc = {{CARRY_W{1'b0}}, modsub_x_abn[WORD_W-1:0]};  
-    wire [WORD_EXT_W -1:0] modsub_y_abn_trunc = {{CARRY_W{1'b0}}, modsub_y_abn[WORD_W-1:0]};
-    
-    wire [WORD_EXT_W -1:0] modsub_x_mux = !modsub_x_borrow_r ? wrk_rd_narrow_x_din_x_dly2 : wrk_rd_narrow_y_din_x_dly2;
-    wire [WORD_EXT_W -1:0] modsub_y_mux = !modsub_y_borrow_r ? wrk_rd_narrow_x_din_y_dly2 : wrk_rd_narrow_y_din_y_dly2;
-
-    wire [WORD_W:0] modsub_x_ab_lsb_pad = {1'b0, modsub_x_ab[WORD_W-1:0]};
-    wire [WORD_W:0] modsub_y_ab_lsb_pad = {1'b0, modsub_y_ab[WORD_W-1:0]};
-    
-    task update_modsub_ab;
-        begin
-            modsub_x_ab <= modsub_x_narrow_x_lsb_pad - modsub_y_narrow_x_lsb_pad - modsub_x_ab_masked;
-            modsub_y_ab <= modsub_x_narrow_y_lsb_pad - modsub_y_narrow_y_lsb_pad - modsub_y_ab_masked;
-        end
-    endtask
-
-    task update_modsub_abn;
-        begin
-            modsub_x_abn <= modsub_x_ab_lsb_pad + modsub_x_wide_x_lsb_pad + modsub_x_abn_masked;
-            modsub_y_abn <= modsub_y_ab_lsb_pad + modsub_x_wide_y_lsb_pad + modsub_y_abn_masked;
-        end
-    endtask
-    
-    always @(posedge clk)
-        //
-        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT)
-            //
-            case (wrk_fsm_state)
-                WRK_FSM_STATE_LATENCY_POST4_TP:
-                    if (!wrk_fsm_two_pass_pass)
-                        {modsub_x_borrow_r, modsub_y_borrow_r} <= {modsub_x_ab_dly[WORD_W], modsub_y_ab_dly[WORD_W]};
-            endcase
-    
-    always @(posedge clk) begin
-        modsub_x_ab_dly <= modsub_x_ab;  
-        modsub_y_ab_dly <= modsub_y_ab;
-    end
-    
     always @(posedge clk) begin
         //
-        modsub_x_ab <= {1'bX, WORD_DNC};
-        modsub_y_ab <= {1'bX, WORD_DNC};
-        //
-        modsub_x_abn <= {1'bX, WORD_DNC};
-        modsub_y_abn <= {1'bX, WORD_DNC};
+        update_wide_dout  (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
+        update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
         //
-        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT)
+        case (opcode)
             //
-            case (wrk_fsm_state)
-                //
-                WRK_FSM_STATE_LATENCY_PRE3_TP:
-                    update_modsub_ab;
-                    
-                WRK_FSM_STATE_LATENCY_PRE4_TP,
-                WRK_FSM_STATE_BUSY_TP,
-                WRK_FSM_STATE_LATENCY_POST1_TP,
-                WRK_FSM_STATE_LATENCY_POST2_TP: begin
-                    update_modsub_ab;
-                    update_modsub_abn;
-                end
+            UOP_OPCODE_PROPAGATE_CARRIES:
                 //
-                WRK_FSM_STATE_LATENCY_POST3_TP:
+                case (wrk_fsm_state)
                     //
-                    update_modsub_abn;
-                //
-            endcase
-        //
-    end
-
-    always @(posedge clk) begin
-        //
-        modsub_x_ab_mask_now <= 1'b0;
-        modsub_y_ab_mask_now <= 1'b0;
-        //
-        modsub_x_abn_mask_now <= 1'b0;
-        modsub_y_abn_mask_now <= 1'b0;
-        //
-        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT)
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        update_narrow_dout(propagate_carries_x_x_w_cry_reduced, propagate_carries_y_x_w_cry_reduced, propagate_carries_x_y_w_cry_reduced, propagate_carries_y_y_w_cry_reduced);
+                    // 
+                endcase
             //
-            case (wrk_fsm_state)
-                //
-                WRK_FSM_STATE_LATENCY_PRE2_TP: begin
-                    modsub_x_ab_mask_now <= 1'b1;
-                    modsub_y_ab_mask_now <= 1'b1;
-                end
+            UOP_OPCODE_COPY_CRT_Y2X:
                 //
-                WRK_FSM_STATE_LATENCY_PRE3_TP: begin
-                    modsub_x_abn_mask_now <= 1'b1;
-                    modsub_y_abn_mask_now <= 1'b1;
-                end
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        begin update_narrow_dout(rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2, rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2);        
+                              update_wide_dout  (rd_wide_x_din_y_dly2,   rd_wide_y_din_y_dly2,   rd_wide_x_din_y_dly2,   rd_wide_y_din_y_dly2); end
+                    //
+                endcase
+            //    
+            UOP_OPCODE_MODULAR_REDUCE_INIT:
                 //
-            endcase
-        //     
-    end
-    
-  
-    //
-    // UOP_OPCODE_ADD_UNEVEN
-    //
-    reg [WORD_W:0] regadd_x_x;
-    reg [WORD_W:0] regadd_y_x;
-    reg [WORD_W:0] regadd_x_y;
-    reg [WORD_W:0] regadd_y_y;
-    
-    reg            regadd_x_x_cry;
-    reg            regadd_y_x_cry;
-    reg            regadd_x_y_cry;
-    reg            regadd_y_y_cry;
-    
-    wire [WORD_EXT_W-1:0] regadd_x_x_trunc = {{CARRY_W{1'b0}}, regadd_x_x[WORD_W-1:0]};
-    wire [WORD_EXT_W-1:0] regadd_y_x_trunc = {{CARRY_W{1'b0}}, regadd_y_x[WORD_W-1:0]};
-    wire [WORD_EXT_W-1:0] regadd_x_y_trunc = {{CARRY_W{1'b0}}, regadd_x_y[WORD_W-1:0]};
-    wire [WORD_EXT_W-1:0] regadd_y_y_trunc = {{CARRY_W{1'b0}}, regadd_y_y[WORD_W-1:0]};
-    
-    //wire           regadd_x_x_masked = regadd_xy_ab_x_mask_now ? 1'b0 : regadd_x_x[WORD_W];
-    //wire           regadd_y_x_masked = regadd_xy_ab_x_mask_now ? 1'b0 : regadd_y_x[WORD_W];
-    //wire           regadd_x_y_masked = regadd_xy_ab_y_mask_now ? 1'b0 : regadd_x_y[WORD_W];  
-    //wire           regadd_y_y_masked = regadd_xy_ab_y_mask_now ? 1'b0 : regadd_y_y[WORD_W];
-    /**/
-    reg [WORD_W:0] regadd_x_x_a_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_x_dly2[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_x_x_b_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_x_dly1[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_y_x_a_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_x_dly2[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_y_x_b_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_x_dly1[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_x_y_a_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_y_dly2[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_x_y_b_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_y_dly1[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_y_y_a_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_y_dly2[WORD_W-1:0]};
-    reg [WORD_W:0] regadd_y_y_b_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_y_dly1[WORD_W-1:0]};
-        /**/
-    //WRK_FSM_STATE_BUSY_M1,
-    //WRK_FSM_STATE_LATENCY_POST1_M1,
-    //WRK_FSM_STATE_LATENCY_POST2_M1:
-
-    always @(posedge clk) begin
-        //
-        regadd_x_x_a_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_x_x_b_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_y_x_a_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_y_x_b_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_x_y_a_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_x_y_b_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_y_y_a_lsb_pad <= {1'bX, WORD_DNC};
-        regadd_y_y_b_lsb_pad <= {1'bX, WORD_DNC};
-        //
-        if (opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN)
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        update_wide_dout(rd_narrow_x_din_x_dly2, rd_narrow_y_din_x_dly2, rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2);
+                    // 
+                endcase
             //
-            case (wrk_fsm_state)
+            UOP_OPCODE_COPY_LADDERS_X2Y:
                 //
-                WRK_FSM_STATE_LATENCY_PRE2_M2,
-                WRK_FSM_STATE_BUSY_M2,
-                WRK_FSM_STATE_LATENCY_POST1_M2: begin
-                    regadd_x_x_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_x_din_x_dly1[WORD_W-1:0] : WORD_ZERO};
-                    regadd_x_x_b_lsb_pad <= {1'b0,                                           wrk_rd_narrow_x_din_x     [WORD_W-1:0]            };
-                    regadd_y_x_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_y_din_x_dly1[WORD_W-1:0] : WORD_ZERO};
-                    regadd_y_x_b_lsb_pad <= {1'b0,                                           wrk_rd_narrow_y_din_x     [WORD_W-1:0]            };
-                    regadd_x_y_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_x_din_y_dly1[WORD_W-1:0] : WORD_ZERO};
-                    regadd_x_y_b_lsb_pad <= {1'b0,                                           wrk_rd_narrow_x_din_y     [WORD_W-1:0]            };
-                    regadd_y_y_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_y_din_y_dly1[WORD_W-1:0] : WORD_ZERO};
-                    regadd_y_y_b_lsb_pad <= {1'b0,                                           wrk_rd_narrow_y_din_y     [WORD_W-1:0]            };
-                end
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        begin update_wide_dout  (rd_wide_x_din_x_dly1,   rd_wide_x_din_x_dly2,   rd_wide_x_din_y_dly1,   rd_wide_x_din_y_dly2);
+                              update_narrow_dout(rd_narrow_x_din_x_dly1, rd_narrow_x_din_x_dly2, rd_narrow_x_din_y_dly1, rd_narrow_x_din_y_dly2); end
+                   //
+                endcase
+            //
+            UOP_OPCODE_CROSS_LADDERS_X2Y:
                 //
-            endcase
-    end
-    
-    always @(posedge clk) begin
-        //
-        regadd_x_x <= {1'bX, WORD_DNC};
-        regadd_y_x <= {1'bX, WORD_DNC};
-        regadd_x_y <= {1'bX, WORD_DNC};
-        regadd_y_y <= {1'bX, WORD_DNC};
-        //
-        if (opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN)
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        begin update_wide_dout  (rd_wide_x_din_x_dly1,   rd_wide_x_din_y_dly2,   rd_wide_x_din_y_dly1,   rd_wide_x_din_x_dly2);
+                              update_narrow_dout(rd_narrow_x_din_x_dly1, rd_narrow_x_din_y_dly2, rd_narrow_x_din_y_dly1, rd_narrow_x_din_x_dly2); end
+                  //                    
+                endcase
             //
-            case (wrk_fsm_state)
+            UOP_OPCODE_MODULAR_SUBTRACT_X:
                 //
-                WRK_FSM_STATE_BUSY_M1,
-                WRK_FSM_STATE_LATENCY_POST1_M1,
-                WRK_FSM_STATE_LATENCY_POST2_M1: begin
-                    regadd_x_x <= regadd_x_x_a_lsb_pad + regadd_x_x_b_lsb_pad + regadd_x_x_cry;
-                    regadd_y_x <= regadd_y_x_a_lsb_pad + regadd_y_x_b_lsb_pad + regadd_y_x_cry;
-                    regadd_x_y <= regadd_x_y_a_lsb_pad + regadd_x_y_b_lsb_pad + regadd_x_y_cry;
-                    regadd_y_y <= regadd_y_y_a_lsb_pad + regadd_y_y_b_lsb_pad + regadd_y_y_cry;
-                end
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        update_narrow_dout(modular_subtract_x_w_brw_reduced, modular_subtract_x_w_brw_reduced, modular_subtract_y_w_brw_reduced, modular_subtract_y_w_brw_reduced);
+                    //
+                endcase
+            //
+            UOP_OPCODE_MODULAR_SUBTRACT_Y:
                 //
-            endcase
-        //
-    end
-    
-    always @(posedge clk) begin
-        //
-        regadd_x_x_cry <= 1'bX;
-        regadd_y_x_cry <= 1'bX;
-        regadd_x_y_cry <= 1'bX;
-        regadd_y_y_cry <= 1'bX;
-        //
-        if (opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN)
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        update_wide_dout(modular_subtract_x_w_cry_reduced, modular_subtract_x_w_cry_reduced, modular_subtract_y_w_cry_reduced, modular_subtract_y_w_cry_reduced);
+                    //
+                endcase                
             //
-            case (wrk_fsm_state)
+            UOP_OPCODE_MODULAR_SUBTRACT_Z:
                 //
-                WRK_FSM_STATE_LATENCY_PRE2_M2: begin
-                    regadd_x_x_cry <= 1'b0;
-                    regadd_y_x_cry <= 1'b0;
-                    regadd_x_y_cry <= 1'b0;
-                    regadd_y_y_cry <= 1'b0;
-                end
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        begin update_wide_dout  (modular_subtract_x_mux_reduced, modular_subtract_x_mux_reduced, modular_subtract_y_mux_reduced, modular_subtract_y_mux_reduced);
+                              update_narrow_dout(modular_subtract_x_mux_reduced, modular_subtract_x_mux_reduced, modular_subtract_y_mux_reduced, modular_subtract_y_mux_reduced); end
+                    // 
+                endcase
+            //
+            UOP_OPCODE_MERGE_LH:
                 //
-                WRK_FSM_STATE_BUSY_M2,
-                WRK_FSM_STATE_LATENCY_POST1_M2: begin
-                    regadd_x_x_cry <= regadd_x_x[WORD_W];
-                    regadd_y_x_cry <= regadd_y_x[WORD_W];
-                    regadd_x_y_cry <= regadd_x_y[WORD_W];
-                    regadd_y_y_cry <= regadd_y_y[WORD_W];
-                end
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        update_narrow_dout(rd_wide_x_din_x_dly2, rd_wide_y_din_x_dly2, rd_wide_x_din_y_dly2, rd_wide_y_din_y_dly2);
+                        // 
+                endcase
+            //
+            UOP_OPCODE_REGULAR_ADD_UNEVEN:
                 //
-            endcase
-        //     
+                case (wrk_fsm_state)
+                    //
+                    WRK_FSM_STATE_BUSY1,
+                    WRK_FSM_STATE_LATENCY_POST1,
+                    WRK_FSM_STATE_LATENCY_POST3:
+                        //
+                        update_narrow_dout(regular_add_uneven_x_x_w_cry_reduced, regular_add_uneven_y_x_w_cry_reduced, regular_add_uneven_x_y_w_cry_reduced, regular_add_uneven_y_y_w_cry_reduced);
+                    // 
+                endcase
+        endcase
+        //
     end
 
+
 endmodule



More information about the Commits mailing list