[Cryptech-Commits] [user/shatov/modexpng] 01/13: Refactored general worker module; added modular subtraction micro-operation

git at cryptech.is git at cryptech.is
Wed Oct 23 16:22:00 UTC 2019


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit 1e3303286bdb0d400d78d9d8b0aa90b29949c4a3
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Oct 21 12:44:33 2019 +0300

    Refactored general worker module
    Added modular subtraction micro-operation
---
 bench/tb_core_full.v          |    2 +-
 bench/tb_square.v             | 1379 -----------------------------------------
 rtl/modexpng_core_top.v       |   20 +-
 rtl/modexpng_general_worker.v | 1180 +++++++++++++++++++++--------------
 rtl/modexpng_microcode.vh     |   12 +-
 rtl/modexpng_uop_rom.v        |   26 +-
 6 files changed, 768 insertions(+), 1851 deletions(-)

diff --git a/bench/tb_core_full.v b/bench/tb_core_full.v
index 248634e..e592ac5 100644
--- a/bench/tb_core_full.v
+++ b/bench/tb_core_full.v
@@ -274,7 +274,7 @@ module tb_core_full;
         word_index_last_pq = CORE_NUM_WORDS_PQ - 1;
         
         bit_index_last_n  = TB_MODULUS_LENGTH_N - 1;
-        bit_index_last_pq = TB_MODULUS_LENGTH_N / 2 - 1;
+        bit_index_last_pq = 9;//TB_MODULUS_LENGTH_N / 2 - 1;
         
         core_crt_mode      = 1'b1;
         
diff --git a/bench/tb_square.v b/bench/tb_square.v
deleted file mode 100644
index 733e741..0000000
--- a/bench/tb_square.v
+++ /dev/null
@@ -1,1379 +0,0 @@
-`timescale 1ns / 1ps
-
-module tb_square;
-
-
-    //
-    // Headers
-    //
-    `include "../rtl/modexpng_parameters.vh"
-    `include "../rtl/modexpng_parameters_x8.vh"
-    `include "../rtl/modexpng_mmm_fsm.vh"
-
-
-    //
-    // Clock
-    //
-    `define CLK_FREQUENCY_MHZ   100.0
-    `define CLK_PERIOD_NS       (1000.0 / `CLK_FREQUENCY_MHZ)
-    `define CLK_PERIOD_HALF_NS  (0.5 * `CLK_PERIOD_NS)
-    
-	reg clk = 1'b0;
-
-    always begin
-        #`CLK_PERIOD_HALF_NS clk = 1'b1;
-        #`CLK_PERIOD_HALF_NS clk = 1'b0;
-    end
-    
-    
-    //
-    // Reset
-    //
-    reg rst = 1'b1;
-    
-    
-
-    //
-    // T1, T2
-    //
-    reg [17:0] T1[0:31];
-    reg [17:0] T2[0:31];
-    reg [17:0] AB[0:63];
-    reg [17:0] N_COEFF[0:32];
-    reg [17:0] Q[0:32];
-    reg [17:0] N[0:31];
-    reg [17:0] M[0:64];
-
-
-    //
-    // Init
-    //
-    initial begin
-        //
-        T1[ 0] = 18'h191c5; T1[ 1] = 18'h1a118; T1[ 2] = 18'h06e06; T1[ 3] = 18'h0ea69;
-        T1[ 4] = 18'h12944; T1[ 5] = 18'h0c242; T1[ 6] = 18'h0fc64; T1[ 7] = 18'h14efe;
-        T1[ 8] = 18'h113da; T1[ 9] = 18'h06ff7; T1[10] = 18'h0ef0d; T1[11] = 18'h18581;
-        T1[12] = 18'h1a62c; T1[13] = 18'h052b7; T1[14] = 18'h114f7; T1[15] = 18'h1c53e;
-        T1[16] = 18'h0c63e; T1[17] = 18'h0dd14; T1[18] = 18'h0fba8; T1[19] = 18'h1b8e6;
-        T1[20] = 18'h0d944; T1[21] = 18'h10292; T1[22] = 18'h0d276; T1[23] = 18'h027b1;
-        T1[24] = 18'h0c0c7; T1[25] = 18'h100a9; T1[26] = 18'h0a9ab; T1[27] = 18'h0e696;
-        T1[28] = 18'h10798; T1[29] = 18'h0ae91; T1[30] = 18'h08d4d; T1[31] = 18'h0080b;
-        //
-        T2[ 0] = 18'h1193b; T2[ 1] = 18'h0de9c; T2[ 2] = 18'h0b993; T2[ 3] = 18'h0d2cd;
-        T2[ 4] = 18'h106ad; T2[ 5] = 18'h076da; T2[ 6] = 18'h10cab; T2[ 7] = 18'h15cd5;
-        T2[ 8] = 18'h15425; T2[ 9] = 18'h16287; T2[10] = 18'h0fd64; T2[11] = 18'h06ee0;
-        T2[12] = 18'h1b0c9; T2[13] = 18'h01a5e; T2[14] = 18'h1855c; T2[15] = 18'h17bf9;
-        T2[16] = 18'h1c83c; T2[17] = 18'h158ed; T2[18] = 18'h086df; T2[19] = 18'h16676;
-        T2[20] = 18'h0a0f8; T2[21] = 18'h14545; T2[22] = 18'h09641; T2[23] = 18'h16863;
-        T2[24] = 18'h17e20; T2[25] = 18'h0d457; T2[26] = 18'h05a9b; T2[27] = 18'h1a4cf;
-        T2[28] = 18'h1582a; T2[29] = 18'h1686c; T2[30] = 18'h1394e; T2[31] = 18'h0bdbc;
-        //
-        N[ 0] = 18'h00f97; N[ 1] = 18'h018bb; N[ 2] = 18'h08a44; N[ 3] = 18'h00858;
-        N[ 4] = 18'h06647; N[ 5] = 18'h0042c; N[ 6] = 18'h0fa09; N[ 7] = 18'h0c8d3;
-        N[ 8] = 18'h0bbc7; N[ 9] = 18'h0e2dd; N[10] = 18'h017fd; N[11] = 18'h0ef4a;
-        N[12] = 18'h002ef; N[13] = 18'h090c1; N[14] = 18'h032db; N[15] = 18'h028b1;
-        N[16] = 18'h05f0a; N[17] = 18'h0ebfd; N[18] = 18'h017ca; N[19] = 18'h09587;
-        N[20] = 18'h0d266; N[21] = 18'h0563c; N[22] = 18'h041af; N[23] = 18'h0433f;
-        N[24] = 18'h08e83; N[25] = 18'h0bc19; N[26] = 18'h000b2; N[27] = 18'h05b53;
-        N[28] = 18'h00e5d; N[29] = 18'h09bc5; N[30] = 18'h0a822; N[31] = 18'h0efff;
-        //
-        N_COEFF[ 0] = 18'h09fd9; N_COEFF[ 1] = 18'h0b367; N_COEFF[ 2] = 18'h0e467; N_COEFF[ 3] = 18'h0de24;
-        N_COEFF[ 4] = 18'h02022; N_COEFF[ 5] = 18'h0f0e8; N_COEFF[ 6] = 18'h02919; N_COEFF[ 7] = 18'h09901;
-        N_COEFF[ 8] = 18'h0da43; N_COEFF[ 9] = 18'h0023b; N_COEFF[10] = 18'h0ebf8; N_COEFF[11] = 18'h0f04e;
-        N_COEFF[12] = 18'h0942f; N_COEFF[13] = 18'h029e9; N_COEFF[14] = 18'h07cb0; N_COEFF[15] = 18'h08c25;
-        N_COEFF[16] = 18'h04e60; N_COEFF[17] = 18'h05cdc; N_COEFF[18] = 18'h0dff7; N_COEFF[19] = 18'h0279b;
-        N_COEFF[20] = 18'h0610d; N_COEFF[21] = 18'h0f04a; N_COEFF[22] = 18'h001dc; N_COEFF[23] = 18'h03429;
-        N_COEFF[24] = 18'h0f78c; N_COEFF[25] = 18'h0c3e2; N_COEFF[26] = 18'h00ed8; N_COEFF[27] = 18'h039c0;
-        N_COEFF[28] = 18'h02ac2; N_COEFF[29] = 18'h0f703; N_COEFF[30] = 18'h0c54e; N_COEFF[31] = 18'h022d9;
-        N_COEFF[32] = 18'h0f994;
-        //
-        AB[ 0] = 18'h0c199; AB[ 1] = 18'h0957a; AB[ 2] = 18'h070ad; AB[ 3] = 18'h0e5a6;
-        AB[ 4] = 18'h0fec9; AB[ 5] = 18'h00b73; AB[ 6] = 18'h09c72; AB[ 7] = 18'h0cdf0;
-        AB[ 8] = 18'h08755; AB[ 9] = 18'h07560; AB[10] = 18'h084b1; AB[11] = 18'h0ad3f;
-        AB[12] = 18'h074fe; AB[13] = 18'h04d74; AB[14] = 18'h00e16; AB[15] = 18'h0d3b3;
-        AB[16] = 18'h0d418; AB[17] = 18'h02f12; AB[18] = 18'h0c301; AB[19] = 18'h0be2b;
-        AB[20] = 18'h08222; AB[21] = 18'h0056c; AB[22] = 18'h01c7c; AB[23] = 18'h0bc95;
-        AB[24] = 18'h03427; AB[25] = 18'h0c65a; AB[26] = 18'h089ac; AB[27] = 18'h02117;
-        AB[28] = 18'h0ff7d; AB[29] = 18'h01cde; AB[30] = 18'h02709; AB[31] = 18'h01c56;
-        AB[32] = 18'h0f35a; AB[33] = 18'h08ce6; AB[34] = 18'h0a8e5; AB[35] = 18'h0d6d4;
-        AB[36] = 18'h06868; AB[37] = 18'h09105; AB[38] = 18'h0219e; AB[39] = 18'h0bc40;
-        AB[40] = 18'h00e0a; AB[41] = 18'h07783; AB[42] = 18'h0187a; AB[43] = 18'h0b922;
-        AB[44] = 18'h02609; AB[45] = 18'h0c64b; AB[46] = 18'h06b4b; AB[47] = 18'h04b79;
-        AB[48] = 18'h0fed6; AB[49] = 18'h03eac; AB[50] = 18'h04cac; AB[51] = 18'h0d47d;
-        AB[52] = 18'h045fd; AB[53] = 18'h04fa8; AB[54] = 18'h0597c; AB[55] = 18'h0a10d;
-        AB[56] = 18'h0bf44; AB[57] = 18'h08671; AB[58] = 18'h0112a; AB[59] = 18'h08ccf;
-        AB[60] = 18'h0cae5; AB[61] = 18'h04d94; AB[62] = 18'h0b95a; AB[63] = 18'h00040;
-        //
-        Q[ 0] = 18'h021b1; Q[ 1] = 18'h0d2db; Q[ 2] = 18'h0754b; Q[ 3] = 18'h01fc1;
-        Q[ 4] = 18'h063f7; Q[ 5] = 18'h086e5; Q[ 6] = 18'h0bcea; Q[ 7] = 18'h02260;
-        Q[ 8] = 18'h0c54c; Q[ 9] = 18'h0e298; Q[10] = 18'h05d07; Q[11] = 18'h0f978;
-        Q[12] = 18'h0e742; Q[13] = 18'h0a3f0; Q[14] = 18'h0b31e; Q[15] = 18'h041b7;
-        Q[16] = 18'h06ed9; Q[17] = 18'h03ac5; Q[18] = 18'h0f8eb; Q[19] = 18'h0c619;
-        Q[20] = 18'h067e9; Q[21] = 18'h00350; Q[22] = 18'h00376; Q[23] = 18'h02ebf;
-        Q[24] = 18'h0b125; Q[25] = 18'h05f7d; Q[26] = 18'h0f121; Q[27] = 18'h07ba4;
-        Q[28] = 18'h03050; Q[29] = 18'h0642e; Q[30] = 18'h0c2fc; Q[31] = 18'h0dfcf;
-        Q[32] = 18'h03f9e;
-        //
-        M[ 0] = 18'h03e67; M[ 1] = 18'h06a85; M[ 2] = 18'h08f52; M[ 3] = 18'h01a59;
-        M[ 4] = 18'h00136; M[ 5] = 18'h0f48c; M[ 6] = 18'h0638d; M[ 7] = 18'h0320f;
-        M[ 8] = 18'h078aa; M[ 9] = 18'h08a9f; M[10] = 18'h07b4e; M[11] = 18'h052c0;
-        M[12] = 18'h08b01; M[13] = 18'h0b28b; M[14] = 18'h0f1e9; M[15] = 18'h02c4c;
-        M[16] = 18'h02be7; M[17] = 18'h0d0ed; M[18] = 18'h03cfe; M[19] = 18'h041d4;
-        M[20] = 18'h07ddd; M[21] = 18'h0fa93; M[22] = 18'h0e383; M[23] = 18'h0436a;
-        M[24] = 18'h0cbd8; M[25] = 18'h039a5; M[26] = 18'h07653; M[27] = 18'h0dee8;
-        M[28] = 18'h00082; M[29] = 18'h0e321; M[30] = 18'h0d8f6; M[31] = 18'h0e3a9;
-        M[32] = 18'h00ca5; M[33] = 18'h035ed; M[34] = 18'h02b8f; M[35] = 18'h063bd;
-        M[36] = 18'h0ec9f; M[37] = 18'h0b8bb; M[38] = 18'h00389; M[39] = 18'h0ca27;
-        M[40] = 18'h0bea7; M[41] = 18'h0df1e; M[42] = 18'h0d685; M[43] = 18'h0cc1b;
-        M[44] = 18'h036c4; M[45] = 18'h01ce9; M[46] = 18'h0c43b; M[47] = 18'h05f58;
-        M[48] = 18'h02c77; M[49] = 18'h03a12; M[50] = 18'h0eea8; M[51] = 18'h0ac31;
-        M[52] = 18'h05838; M[53] = 18'h093ac; M[54] = 18'h0fd54; M[55] = 18'h06e13;
-        M[56] = 18'h002e2; M[57] = 18'h06af4; M[58] = 18'h0ea18; M[59] = 18'h083b3;
-        M[60] = 18'h059f7; M[61] = 18'h016d3; M[62] = 18'h0c3ad; M[63] = 18'h0dbfc;
-        M[64] = 18'h03ba4;
-        //  
-    end
-    
-
-    //
-    // BRAMs
-    //
-    reg        tb_fat_bram_xy_ena = 1'b0;
-    reg [ 2:0] tb_fat_bram_xy_bank;
-    reg [ 7:0] tb_fat_bram_xy_addr;
-    reg [17:0] tb_fat_bram_x_din;
-    reg [17:0] tb_fat_bram_y_din;
-
-    reg        mgr_fat_bram_xy_ena = 1'b0;
-    reg [ 2:0] mgr_fat_bram_xy_bank;
-    reg [ 7:0] mgr_fat_bram_xy_addr;
-    reg [17:0] mgr_fat_bram_x_din;
-    reg [17:0] mgr_fat_bram_y_din;
-    
-    reg         mac_fat_bram_xy_ena = 1'b0;
-    reg         mac_fat_bram_xy_ena_aux = 1'b0;
-    reg         mac_fat_bram_xy_reg_ena = 1'b0;
-    reg         mac_fat_bram_xy_reg_ena_aux = 1'b0;
-    reg  [ 2:0] mac_fat_bram_xy_bank;
-    reg  [ 2:0] mac_fat_bram_xy_bank_aux;
-    reg  [ 7:0] mac_fat_bram_xy_addr[0:4];
-    wire [17:0] mac_fat_bram_x_dout[0:4];
-    wire [17:0] mac_fat_bram_y_dout[0:4];
-    wire [ 7:0] mac_fat_bram_xy_addr_aux = mac_fat_bram_xy_addr[4]; // handy for debug
-    wire [17:0] mac_fat_bram_x_dout_aux = mac_fat_bram_x_dout[4];   // handy for debug
-    wire [17:0] mac_fat_bram_y_dout_aux = mac_fat_bram_x_dout[4];   // handy for debug
-    
-    reg        tb_slim_bram_xy_ena = 1'b0;
-    reg [ 1:0] tb_slim_bram_xy_bank;
-    reg [ 7:0] tb_slim_bram_xy_addr;
-    reg [17:0] tb_slim_bram_x_din;
-    reg [17:0] tb_slim_bram_y_din;
-    
-    reg        mgr_slim_bram_xy_ena = 1'b0;
-    reg [ 1:0] mgr_slim_bram_xy_bank;
-    reg [ 7:0] mgr_slim_bram_xy_addr;
-    reg [17:0] mgr_slim_bram_x_din;
-    reg [17:0] mgr_slim_bram_y_din;
-
-    reg         mac_slim_bram_xy_ena = 1'b0;
-    reg         mac_slim_bram_xy_reg_ena = 1'b0;
-    reg  [ 1:0] mac_slim_bram_xy_bank;
-    reg  [ 7:0] mac_slim_bram_xy_addr;
-    reg  [ 7:0] mac_slim_bram_xy_addr_dly;
-    wire [17:0] mac_slim_bram_x_dout;
-    wire [17:0] mac_slim_bram_y_dout;
-    
-    always @(posedge clk)
-        //
-        mac_slim_bram_xy_addr_dly <= mac_slim_bram_xy_addr;
-    
-    reg mac_slim_bram_xy_reg_ena_dly = 1'b0;
-    always @(posedge clk)
-        mac_slim_bram_xy_reg_ena_dly <= mac_slim_bram_xy_reg_ena;
-    
-    
-    
-    genvar z;
-    generate for (z=0; z<((NUM_MULTS/2)+1); z=z+1)
-        begin : gen_fat_bram
-            //
-            ip_bram_36k fat_bram_x
-            (
-                .clka   (clk),
-                .ena    (mgr_fat_bram_xy_ena),
-                .wea    (mgr_fat_bram_xy_ena),
-                .addra  ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
-                .dina   (mgr_fat_bram_x_din),
-            
-                .clkb   (clk),
-                .enb    (z < (NUM_MULTS/2) ? mac_fat_bram_xy_ena     : mac_fat_bram_xy_ena_aux),
-                .regceb (z < (NUM_MULTS/2) ? mac_fat_bram_xy_reg_ena : mac_fat_bram_xy_reg_ena_aux),
-                .addrb  ({(z < (NUM_MULTS/2) ?
-                    mac_fat_bram_xy_bank : mac_fat_bram_xy_bank_aux), mac_fat_bram_xy_addr[z]}),
-                .doutb  (mac_fat_bram_x_dout[z])
-            );
-            //
-            ip_bram_36k fat_bram_y
-            (
-                .clka   (clk),
-                .ena    (mgr_fat_bram_xy_ena),
-                .wea    (mgr_fat_bram_xy_ena),
-                .addra  ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
-                .dina   (mgr_fat_bram_y_din),
-            
-                .clkb   (clk),
-                .enb    (z < (NUM_MULTS/2) ? mac_fat_bram_xy_ena     : mac_fat_bram_xy_ena_aux),
-                .regceb (z < (NUM_MULTS/2) ? mac_fat_bram_xy_reg_ena : mac_fat_bram_xy_reg_ena_aux),
-                .addrb  ({z < (NUM_MULTS/2) ?
-                    mac_fat_bram_xy_bank : mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_addr[z]}),
-                .doutb  (mac_fat_bram_y_dout[z])
-            );
-            //
-        end
-    endgenerate
-
-    ip_bram_18k slim_bram_x
-    (
-        .clka   (clk),
-        .ena    (mgr_slim_bram_xy_ena),
-        .wea    (mgr_slim_bram_xy_ena),
-        .addra  ({mgr_slim_bram_xy_bank, mgr_slim_bram_xy_addr}),
-        .dina   (mgr_slim_bram_x_din),
-    
-        .clkb   (clk),
-        .enb    (mac_slim_bram_xy_ena),
-        .regceb (mac_slim_bram_xy_reg_ena),
-        .addrb  ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
-        .doutb  (mac_slim_bram_x_dout)
-    );
-
-    ip_bram_18k slim_bram_y
-    (
-        .clka   (clk),
-        .ena    (mgr_slim_bram_xy_ena),
-        .wea    (mgr_slim_bram_xy_ena),
-        .addra  ({mgr_slim_bram_xy_bank, mgr_slim_bram_xy_addr}),
-        .dina   (mgr_slim_bram_y_din),
-    
-        .clkb   (clk),
-        .enb    (mac_slim_bram_xy_ena),
-        .regceb (mac_slim_bram_xy_reg_ena),
-        .addrb  ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
-        .doutb  (mac_slim_bram_y_dout)
-    );
-    
-    
-    
-    //
-    // Enable, Ready
-    //
-    reg ena = 1'b0;
-
-    integer i;
-    initial begin
-
-        for (i=0; i<10; i=i+1)
-            wait_clock_tick;
-        
-        rst = 1'b0;
-
-        for (i=0; i<10; i=i+1)
-            wait_clock_tick;
-        
-        tb_fat_bram_xy_ena = 1'b1;
-        tb_slim_bram_xy_ena = 1'b1;
-
-        for (i=0; i<32; i=i+1) begin
-            tb_fat_bram_xy_bank = BANK_FAT_T1T2;
-            tb_fat_bram_xy_addr = i[7:0];
-            tb_fat_bram_x_din = T1[i];
-            tb_fat_bram_y_din = T2[i];
-            
-            tb_slim_bram_xy_bank = BANK_SLIM_T1T2;
-            tb_slim_bram_xy_addr = i[7:0];
-            tb_slim_bram_x_din = T1[i];
-            tb_slim_bram_y_din = T2[i];
-            
-            wait_clock_tick;
-        end
-
-        for (i=0; i<32; i=i+1) begin
-            tb_slim_bram_xy_bank = BANK_SLIM_N_COEFF;
-            tb_slim_bram_xy_addr = i[7:0];
-            tb_slim_bram_x_din = N_COEFF[i];
-            tb_slim_bram_y_din = N_COEFF[i];
-            
-            wait_clock_tick;
-        end
-        for (i=32; i<33; i=i+1) begin
-            tb_slim_bram_xy_bank = BANK_SLIM_EXT;
-            tb_slim_bram_xy_addr = 0;   // !
-            tb_slim_bram_x_din = N_COEFF[i];
-            tb_slim_bram_y_din = N_COEFF[i];
-            
-            wait_clock_tick;
-        end
-
-        for (i=0; i<32; i=i+1) begin
-            tb_fat_bram_xy_bank = BANK_FAT_N;
-            tb_fat_bram_xy_addr = i[7:0];
-            tb_fat_bram_x_din = N[i];
-            tb_fat_bram_y_din = N[i];
-            
-            wait_clock_tick;
-        end
-
-        tb_fat_bram_xy_ena = 1'b0;        
-        tb_slim_bram_xy_ena = 1'b0;
-        
-        tb_fat_bram_xy_bank = {3{1'bX}};
-        tb_fat_bram_xy_addr = {8{1'bX}};
-        tb_fat_bram_x_din = {18{1'bX}};
-        tb_fat_bram_y_din = {18{1'bX}};
-
-        tb_slim_bram_xy_bank = {2{1'bX}};
-        tb_slim_bram_xy_addr = {8{1'bX}};
-        tb_slim_bram_x_din = {18{1'bX}};
-        tb_slim_bram_y_din = {18{1'bX}};
-
-        for (i=0; i<10; i=i+1)
-            wait_clock_tick;
-            
-        ena = 1'b1;
-        wait_clock_tick;
-        ena = 1'b0;
-    
-        for (i=0; i<10000; i=i+1)
-            wait_clock_tick;
-            
-        verify_ab;
-        verify_q;
-        verify_m;
-
-    end
-
-    
-    //
-    // DSPs
-    //
-    reg             dsp_x_ce_a;
-    reg             dsp_x_ce_b;
-    reg             dsp_x_ce_b_dly;
-    reg             dsp_x_ce_m;
-    reg             dsp_x_ce_p;
-    reg             dsp_x_ce_mode;
-    
-    reg  [9   -1:0] dsp_x_mode_z = {9{1'b1}};
-    
-    wire [5*18-1:0] dsp_x_a;
-    reg  [1*17-1:0] dsp_x_b;
-    wire [9*47-1:0] dsp_x_p;
-
-    reg             dsp_y_ce_a;
-    reg             dsp_y_ce_b;
-    reg             dsp_y_ce_b_dly;
-    reg             dsp_y_ce_m;
-    reg             dsp_y_ce_p;
-    reg             dsp_y_ce_mode;
-    
-    reg  [9   -1:0] dsp_y_mode_z = {9{1'b1}};
-        
-    wire [5*18-1:0] dsp_y_a;
-    reg  [1*17-1:0] dsp_y_b;
-    wire [9*47-1:0] dsp_y_p;
-        
-    generate for (z=0; z<((NUM_MULTS/2)+1); z=z+1)
-        begin : gen_dsp_xy_a_split
-            assign dsp_x_a[18*z+:18] = mac_fat_bram_x_dout[z];
-            assign dsp_y_a[18*z+:18] = mac_fat_bram_y_dout[z];
-        end
-    endgenerate
-    
-    always @(posedge clk)
-        //
-        {dsp_y_ce_b_dly, dsp_x_ce_b_dly} <= {dsp_y_ce_b, dsp_x_ce_b};
-    
-
-    reg  [9   -1:0] dsp_xy_mode_z_adv1 = {9{1'b1}};
-    reg  [9   -1:0] dsp_xy_mode_z_adv2 = {9{1'b1}};
-    reg  [9   -1:0] dsp_xy_mode_z_adv3 = {9{1'b1}};
-    reg  [9   -1:0] dsp_xy_mode_z_adv4 = {9{1'b1}};
-    
-    dsp_array dsp_x
-    (
-        .clk            (clk),
-        
-        .ce_a           (dsp_x_ce_a),
-        .ce_b           (dsp_x_ce_b),
-        .ce_m           (dsp_x_ce_m),
-        .ce_p           (dsp_x_ce_p),
-        .ce_mode        (dsp_x_ce_mode),
-
-        .mode_z         (dsp_x_mode_z),
-        
-        .a              (dsp_x_a),
-        .b              (dsp_x_b),
-        .p              (dsp_x_p)
-    );
-
-    dsp_array dsp_y
-    (
-        .clk            (clk),
-        
-        .ce_a           (dsp_y_ce_a),
-        .ce_b           (dsp_y_ce_b),
-        .ce_m           (dsp_y_ce_m),
-        .ce_p           (dsp_y_ce_p),
-        .ce_mode        (dsp_y_ce_mode),
-
-        .mode_z         (dsp_y_mode_z),
-        
-        .a              (dsp_y_a),
-        .b              (dsp_y_b),
-        .p              (dsp_y_p)
-    );
-
-
-    //
-    // FSM State and Next States
-    //
-    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
-    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
-
-    
-    always @(posedge clk)
-        //
-        if (rst) fsm_state <= FSM_STATE_IDLE;
-        else     fsm_state <= fsm_state_next;
-
-
-    localparam [7:0] index_last = 8'd31;
-    localparam [7:0] index_last_minus1 = index_last - 1'b1;
-
-
-    //
-    // Column
-    //
-    reg  [4:0] col_index;       // current column index
-    reg  [4:0] col_index_prev;  // delayed column index value
-    reg  [4:0] col_index_last;  // index of the very last column
-    reg  [4:0] col_index_next1;  // precomputed next column index
-    //reg  [4:0] col_index_next2;  // precomputed next column index after next column index
-    reg        col_is_last;     // flag set during the very last column
-
-    always @(posedge clk)
-        //
-        col_index_prev <= col_index;
-    
-
-    wire mult_square_addr_almost_done_comb;
-    reg  mult_square_addr_almost_done_flop;
-    reg  mult_square_addr_surely_done_flop; 
-
-    wire  mult_triangle_addr_almost_done_comb;
-    reg  mult_triangle_addr_almost_done_flop;        
-    reg  mult_triangle_addr_surely_done_flop;
-    reg  mult_triangle_addr_tardy_done_flop;
-
-    wire  mult_rectangle_addr_almost_done_comb;
-    reg  mult_rectangle_addr_almost_done_flop;        
-    reg  mult_rectangle_addr_surely_done_flop;
-    reg  mult_rectangle_addr_tardy_done_flop;
-
-    
-    assign mult_square_addr_almost_done_comb = mac_slim_bram_xy_addr == index_last_minus1;
-    assign mult_triangle_addr_almost_done_comb = (mac_slim_bram_xy_addr[2:0] == index_last_minus1[2:0]) && (mac_slim_bram_xy_addr[7:3] == col_index);
-    assign mult_rectangle_addr_almost_done_comb = mac_slim_bram_xy_addr == index_last_minus1;
-
-            
-    
-    
-    always @(posedge clk)
-        //
-        case (fsm_state)
-        
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
-                mult_square_addr_almost_done_flop <= mult_square_addr_almost_done_comb;
-                //{mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 
-                //{mult_square_addr_surely_done_comb, mult_square_addr_almost_done_comb};
-            default:
-               mult_square_addr_almost_done_flop <= 1'b0;
-                //{mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 2'b00;
-            
-        endcase
-
-    always @(posedge clk)
-        //
-        mult_square_addr_surely_done_flop <= mult_square_addr_almost_done_flop;
-
-    always @(posedge clk)
-        //
-        case (fsm_state)
-        
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
-                mult_triangle_addr_almost_done_flop <= mult_triangle_addr_almost_done_comb;
-                //{mult_triangle_addr_surely_done_flop, mult_triangle_addr_almost_done_flop} <= 
-                //{mult_triangle_addr_surely_done_comb, mult_triangle_addr_almost_done_comb};
-                
-            default:
-                mult_triangle_addr_almost_done_flop <= 1'b0;
-                //{mult_triangle_addr_surely_done_flop, mult_triangle_addr_almost_done_flop} <= 2'b00;
-            
-        endcase
-
-    always @(posedge clk) begin
-        //
-        mult_triangle_addr_surely_done_flop <= mult_triangle_addr_almost_done_flop;
-        mult_triangle_addr_tardy_done_flop  <= mult_triangle_addr_surely_done_flop;
-        //
-    end
-        
-        
-     always @(posedge clk)
-        //
-        case (fsm_state)
-        
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
-                mult_rectangle_addr_almost_done_flop <= mult_rectangle_addr_almost_done_comb;
-                //{mult_triangle_addr_surely_done_flop, mult_triangle_addr_almost_done_flop} <= 
-                //{mult_triangle_addr_surely_done_comb, mult_triangle_addr_almost_done_comb};
-                
-            default:
-                mult_rectangle_addr_almost_done_flop <= 1'b0;
-                //{mult_triangle_addr_surely_done_flop, mult_triangle_addr_almost_done_flop} <= 2'b00;
-            
-        endcase
-
-    always @(posedge clk) begin
-        //
-        mult_rectangle_addr_surely_done_flop <= mult_rectangle_addr_almost_done_flop;
-        mult_rectangle_addr_tardy_done_flop  <= mult_rectangle_addr_surely_done_flop;
-        //
-    end
-
-
-    //
-    // Recombinator Interface
-    //
-    wire [ 2:0] recomb_fat_bram_xy_bank;
-    wire [ 7:0] recomb_fat_bram_xy_addr;
-    wire [17:0] recomb_fat_bram_x_dout;
-    wire [17:0] recomb_fat_bram_y_dout;
-    wire        recomb_fat_bram_xy_dout_valid;
-    wire [ 2:0] recomb_slim_bram_xy_bank;
-    wire [ 7:0] recomb_slim_bram_xy_addr;
-    wire [17:0] recomb_slim_bram_x_dout;
-    wire [17:0] recomb_slim_bram_y_dout;
-    wire        recomb_slim_bram_xy_dout_valid;
-    wire        recomb_rdy;
-    
-    
-
-
-    //
-    // FSM Transition Logic
-    //
-    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
-    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_triangle;
-    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_rectangle;
-    
-    
-    //
-    // Slim - Address
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_SQUARE_COL_N_INIT:   mac_slim_bram_xy_addr <= 8'd0;
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_slim_bram_xy_addr <= !mult_square_addr_almost_done_flop ? mac_slim_bram_xy_addr + 1'b1 : 8'd0;
-            //
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT: mac_slim_bram_xy_addr <= 8'd0;
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_slim_bram_xy_addr <= mult_triangle_addr_almost_done_flop || (col_is_last && mult_triangle_addr_surely_done_flop) ?
-                8'd0 :  mac_slim_bram_xy_addr + 1'b1;
-            //
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT: mac_slim_bram_xy_addr <= 8'd0;
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: mac_slim_bram_xy_addr <= mult_rectangle_addr_almost_done_flop || mult_rectangle_addr_surely_done_flop ?
-                8'd1 :  mac_slim_bram_xy_addr + 1'b1;            
-            //
-            default:                            mac_slim_bram_xy_addr <= 8'dX;
-        endcase
-
-
-    wire [2:0] fat_bram_offset_rom[0:3];
-    
-    generate for (z=1; z<NUM_MULTS; z=z+2)
-        begin : gen_fat_bram_offset
-            assign fat_bram_offset_rom[(z-1)/2] = z[2:0];
-        end
-    endgenerate    
-
-    //
-    // Fat - Address
-    //
-    integer j;
-    always @(posedge clk) begin
-        //
-        for (j=0; j<(NUM_MULTS/2); j=j+1)
-            //
-            case (fsm_state_next)
-                //
-                // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions!
-                //
-                FSM_STATE_MULT_SQUARE_COL_0_INIT:   mac_fat_bram_xy_addr[j] <= {5'd0, fat_bram_offset_rom[j]};
-                FSM_STATE_MULT_SQUARE_COL_N_INIT:   mac_fat_bram_xy_addr[j] <= {col_index_next1, fat_bram_offset_rom[j]};
-                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last);
-                //
-                FSM_STATE_MULT_TRIANGLE_COL_0_INIT:   mac_fat_bram_xy_addr[j] <= {5'd0, fat_bram_offset_rom[j]};
-                FSM_STATE_MULT_TRIANGLE_COL_N_INIT:   mac_fat_bram_xy_addr[j] <= {col_index_next1, fat_bram_offset_rom[j]};
-                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last);
-                //
-                FSM_STATE_MULT_RECTANGLE_COL_0_INIT:   mac_fat_bram_xy_addr[j] <= {5'd0, fat_bram_offset_rom[j]};
-                FSM_STATE_MULT_RECTANGLE_COL_N_INIT:   mac_fat_bram_xy_addr[j] <= {col_index_next1, fat_bram_offset_rom[j]};
-                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last);
-                //
-                default:                            mac_fat_bram_xy_addr[j] <= 8'dX;
-            endcase
-            //
-        case (fsm_state_next)
-            //
-            // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions!
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_INIT:   mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1};
-            FSM_STATE_MULT_SQUARE_COL_N_INIT:   mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1};
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_fat_bram_xy_addr[4] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[4], index_last);
-            //
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT:   mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1};
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT:   mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1};
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   mac_fat_bram_xy_addr[4] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[4], index_last);
-            //
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT:   mac_fat_bram_xy_addr[4] <= 8'dX;//{5'd0, 3'd0};
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
-            FSM_STATE_MULT_RECTANGLE_HOLDOFF:    mac_fat_bram_xy_addr[4] <= recomb_fat_bram_xy_dout_valid ? recomb_fat_bram_xy_addr : 8'dX;//recomb_fat_bram_xy_dout_valid && (recomb_fat_bram_xy_bank == BANK_FAT_ML) ?
-                //mac_fat_bram_xy_addr[4] + 1'b1 : mac_fat_bram_xy_addr[4];
-            //
-            default:                            mac_fat_bram_xy_addr[4] <= 8'dX;
-        endcase
-//
-    end
-
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_SQUARE_COL_N_INIT,
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_slim_bram_xy_bank <= BANK_SLIM_T1T2;
-            //
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_slim_bram_xy_bank <= col_is_last && (mult_triangle_addr_almost_done_flop || mult_triangle_addr_surely_done_flop) ?
-                BANK_SLIM_EXT : BANK_SLIM_N_COEFF;
-            //
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: mac_slim_bram_xy_bank <= mult_rectangle_addr_almost_done_flop || mult_rectangle_addr_surely_done_flop ?
-                BANK_SLIM_EXT : BANK_SLIM_Q;            
-            //
-            default:                            mac_slim_bram_xy_bank <= 2'bXX;
-        endcase
-
-    always @(posedge clk) begin
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_SQUARE_COL_N_INIT,
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:    mac_fat_bram_xy_bank <= BANK_FAT_T1T2;
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:  mac_fat_bram_xy_bank <= BANK_FAT_ABL;
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   mac_fat_bram_xy_bank <= BANK_FAT_ABL;
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,    
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   mac_fat_bram_xy_bank <= BANK_FAT_N;            
-            default:                             mac_fat_bram_xy_bank <= 3'bXXX;
-        endcase
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_SQUARE_COL_N_INIT,
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_fat_bram_xy_bank_aux <= BANK_FAT_T1T2;
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: mac_fat_bram_xy_bank_aux <= BANK_FAT_ABH;
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   mac_fat_bram_xy_bank_aux <= BANK_FAT_ABL;
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,    
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
-            FSM_STATE_MULT_RECTANGLE_HOLDOFF:   if (recomb_fat_bram_xy_dout_valid)
-                case (recomb_fat_bram_xy_bank)
-                    BANK_FAT_ML: mac_fat_bram_xy_bank_aux <= BANK_FAT_ABL;
-                    BANK_FAT_MH: mac_fat_bram_xy_bank_aux <= BANK_FAT_ABH;
-                    BANK_FAT_EXT: mac_fat_bram_xy_bank_aux <= BANK_FAT_EXT;
-                    default: mac_fat_bram_xy_bank_aux <= 3'bXXX; 
-                 endcase
-                 else mac_fat_bram_xy_bank_aux <= 3'bXXX;
-            default:                            mac_fat_bram_xy_bank_aux <= 3'bXXX;
-        endcase
-        //
-    end
-
-
-
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_SQUARE_COL_N_INIT,
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   mac_slim_bram_xy_ena <= 1'b1;
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_slim_bram_xy_ena <= ~mult_square_addr_almost_done_flop;
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:   mac_slim_bram_xy_ena <= 1'b1;
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   mac_slim_bram_xy_ena <= !col_is_last ? ~mult_triangle_addr_almost_done_flop : ~mult_triangle_addr_surely_done_flop; 
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:   mac_slim_bram_xy_ena <= 1'b1;
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   mac_slim_bram_xy_ena <= ~mult_rectangle_addr_surely_done_flop;
-            default:                              mac_slim_bram_xy_ena <= 1'b0;
-        endcase
-
-    always @(posedge clk) begin
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_SQUARE_COL_N_INIT,
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   mac_fat_bram_xy_ena <= 1'b1;
-            default:                            mac_fat_bram_xy_ena <= 1'b0;
-        endcase
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_fat_bram_xy_ena_aux <= 1'b1;
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT: mac_fat_bram_xy_ena_aux <= 1'b0;//1'b1;
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
-            FSM_STATE_MULT_RECTANGLE_HOLDOFF:   mac_fat_bram_xy_ena_aux <= recomb_fat_bram_xy_dout_valid;// && (recomb_fat_bram_xy_bank == BANK_FAT_ML);
-            default:                            mac_fat_bram_xy_ena_aux <= 1'b0;
-        endcase
-        //
-    end
-
-    always @(posedge clk)
-        //
-        mac_slim_bram_xy_reg_ena <= mac_slim_bram_xy_ena;
-        
-    always @(posedge clk)
-        //
-        {mac_fat_bram_xy_reg_ena_aux, mac_fat_bram_xy_reg_ena} <= {mac_fat_bram_xy_ena_aux, mac_fat_bram_xy_ena};
-          
-    reg ladder_mode = 1'b1; // 0 = X:T1*T2, Y:T2*T2
-                            // 1 = X:T1*T2, Y:T2*T1
-          
-
-    reg dsp_swap_xy;
-    
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG:   dsp_swap_xy <= 1'b1;
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG: dsp_swap_xy <= 1'b0;
-        endcase
-  
-    always @(posedge clk)
-        //
-        if (mac_slim_bram_xy_reg_ena_dly) begin // rewrite
-            if (!dsp_swap_xy)
-                {dsp_y_b, dsp_x_b} <= {mac_slim_bram_y_dout[16:0], mac_slim_bram_x_dout[16:0]};
-            else begin
-                if (!ladder_mode) {dsp_y_b, dsp_x_b} <= {mac_slim_bram_x_dout[16:0], mac_slim_bram_y_dout[16:0]};
-                else              {dsp_y_b, dsp_x_b} <= {mac_slim_bram_y_dout[16:0], mac_slim_bram_x_dout[16:0]};
-            end
-        end
-        else
-            {dsp_y_b, dsp_x_b} <= {2{{17{1'bX}}}};
-
-
-    function  [7:0] mac_fat_bram_xy_addr_next;
-        input [7:0] mac_fat_bram_xy_addr_current;
-        input [7:0] mac_fat_bram_xy_addr_last;
-        begin
-            if (mac_fat_bram_xy_addr_current > 8'd0)
-                mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1;
-            else
-                mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last;
-        end
-    endfunction
-        
-
-    
-    always @(posedge clk)
-        //
-        {dsp_y_ce_a, dsp_x_ce_a} <= {2{mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly}};
-        
-    always @(posedge clk)
-        //
-        {dsp_y_ce_b, dsp_x_ce_b} <= {2{mac_slim_bram_xy_reg_ena_dly}};
-    
-    always @(posedge clk)
-        //
-        {dsp_y_ce_m, dsp_x_ce_m} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
-
-    always @(posedge clk)
-        //
-        {dsp_y_ce_p, dsp_x_ce_p} <= {dsp_y_ce_m, dsp_x_ce_m};
-        
-    always @(posedge clk)
-        //
-        {dsp_y_ce_mode, dsp_x_ce_mode} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
-
-    task wait_clock_tick;
-        begin
-            #`CLK_PERIOD_NS;
-        end
-    endtask
-    
-    //
-    // Increment Logic
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT: begin
-                col_index       <= 5'd0;
-                col_index_last  <= index_last[7:3];
-                col_index_next1 <= 5'd1;
-                //col_index_next2 <= 5'd2;
-                col_is_last     <= 1'b0;
-                
-            end
-            //
-            FSM_STATE_MULT_SQUARE_COL_N_INIT,
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT: begin
-                col_index <= col_index_next1;
-                col_is_last <= col_index_next1 == col_index_last;
-                col_index_next1 <= col_index_next1 == col_index_last ? 5'd0 : col_index_next1 + 5'd1;   
-                //col_index_next2 <= col_index_next2 + 1'b1;
-            end
-            //
-        endcase
-    
-    assign fsm_state_after_mult_square    = col_is_last ? FSM_STATE_MULT_SQUARE_HOLDOFF   : FSM_STATE_MULT_SQUARE_COL_N_INIT;
-    assign fsm_state_after_mult_triangle  = col_is_last ? FSM_STATE_MULT_TRIANGLE_HOLDOFF : FSM_STATE_MULT_TRIANGLE_COL_N_INIT;
-    assign fsm_state_after_mult_rectangle = col_is_last ? FSM_STATE_MULT_RECTANGLE_HOLDOFF : FSM_STATE_MULT_RECTANGLE_COL_N_INIT;
-    
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG:       dsp_xy_mode_z_adv4 <= {9{1'b0}};
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:       dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly);
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:     dsp_xy_mode_z_adv4 <= {9{1'b0}};    // so easy
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:     dsp_xy_mode_z_adv4 <= {9{1'b1}};
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:     dsp_xy_mode_z_adv4 <= {9{1'b0}};    // so easy
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:     dsp_xy_mode_z_adv4 <= calc_mac_mode_z_rectangle(col_index_prev, mac_slim_bram_xy_addr_dly);
-            default:                                dsp_xy_mode_z_adv4 <= {9{1'b1}};
-        endcase
-
-    always @(posedge clk) begin
-        {dsp_y_mode_z, dsp_x_mode_z} <= {2{dsp_xy_mode_z_adv1}};
-        //
-        dsp_xy_mode_z_adv1 <= {dsp_xy_mode_z_adv2};
-        dsp_xy_mode_z_adv2 <= {dsp_xy_mode_z_adv3};
-        dsp_xy_mode_z_adv3 <= {dsp_xy_mode_z_adv4};
-    end
-    
-    function  [NUM_MULTS:0] calc_mac_mode_z_square;
-        input [        4:0] col_index_value;
-        input [        7:0] mac_slim_bram_xy_addr_value;
-        begin
-            if (mac_slim_bram_xy_addr_value[7:3] == col_index_value)
-                case (mac_slim_bram_xy_addr_value[2:0])
-                    3'b000: calc_mac_mode_z_square = {1'b1, 8'b11111110};
-                    3'b001: calc_mac_mode_z_square = {1'b1, 8'b11111101};
-                    3'b010: calc_mac_mode_z_square = {1'b1, 8'b11111011};
-                    3'b011: calc_mac_mode_z_square = {1'b1, 8'b11110111};
-                    3'b100: calc_mac_mode_z_square = {1'b1, 8'b11101111};
-                    3'b101: calc_mac_mode_z_square = {1'b1, 8'b11011111};
-                    3'b110: calc_mac_mode_z_square = {1'b1, 8'b10111111};
-                    3'b111: calc_mac_mode_z_square = {1'b1, 8'b01111111};
-                endcase
-            else
-                calc_mac_mode_z_square = {1'b1, {NUM_MULTS{1'b1}}};
-        end
-    endfunction
-    
-    function  [NUM_MULTS:0] calc_mac_mode_z_rectangle;
-        input [        4:0] col_index_value;
-        input [        7:0] mac_slim_bram_xy_addr_value;
-        begin
-            if (mac_slim_bram_xy_addr_value[7:3] == col_index_value)
-                case (mac_slim_bram_xy_addr_value[2:0])
-                    3'b000: calc_mac_mode_z_rectangle = {1'b1, 8'b11111110};
-                    3'b001: calc_mac_mode_z_rectangle = {1'b1, 8'b11111101};
-                    3'b010: calc_mac_mode_z_rectangle = {1'b1, 8'b11111011};
-                    3'b011: calc_mac_mode_z_rectangle = {1'b1, 8'b11110111};
-                    3'b100: calc_mac_mode_z_rectangle = {1'b1, 8'b11101111};
-                    3'b101: calc_mac_mode_z_rectangle = {1'b1, 8'b11011111};
-                    3'b110: calc_mac_mode_z_rectangle = {1'b1, 8'b10111111};
-                    3'b111: calc_mac_mode_z_rectangle = {1'b1, 8'b01111111};
-                endcase
-            else
-                calc_mac_mode_z_rectangle = {1'b1, {NUM_MULTS{1'b1}}};
-        end
-    endfunction
-
-    reg recomb_x_ena = 1'b0;
-    reg recomb_y_ena = 1'b0;
-    
-    always @(posedge clk) begin
-        //
-        recomb_x_ena <= dsp_x_ce_a && !dsp_x_ce_b && !dsp_x_ce_m && !dsp_x_ce_p;
-        recomb_y_ena <= dsp_y_ce_a && !dsp_y_ce_b && !dsp_y_ce_m && !dsp_y_ce_p;
-        //
-    end
-    
-    modexpng_part_recombinator recomb
-    (
-        .clk                            (clk),
-        .rdy                            (recomb_rdy),
-        .fsm_state_next                 (fsm_state_next),
-        .index_last                     (index_last),
-        .dsp_x_ce_p                     (dsp_x_ce_p),
-        .dsp_y_ce_p                     (dsp_y_ce_p),
-        .ena_x                          (recomb_x_ena),
-        .ena_y                          (recomb_y_ena),
-        .dsp_x_p                        (dsp_x_p),
-        .dsp_y_p                        (dsp_y_p),
-        .col_index                      (col_index),
-        .col_index_last                 (col_index_last),
-        .slim_bram_xy_addr              (mac_slim_bram_xy_addr),
-        .slim_bram_xy_bank              (mac_slim_bram_xy_bank),
-        .rcmb_fat_bram_xy_bank          (recomb_fat_bram_xy_bank),
-        .rcmb_fat_bram_xy_addr          (recomb_fat_bram_xy_addr),
-        .rcmb_fat_bram_x_dout           (recomb_fat_bram_x_dout),
-        .rcmb_fat_bram_y_dout           (recomb_fat_bram_y_dout),
-        .rcmb_fat_bram_xy_dout_valid    (recomb_fat_bram_xy_dout_valid),
-        .rcmb_slim_bram_xy_bank         (recomb_slim_bram_xy_bank),
-        .rcmb_slim_bram_xy_addr         (recomb_slim_bram_xy_addr),
-        .rcmb_slim_bram_x_dout          (recomb_slim_bram_x_dout),
-        .rcmb_slim_bram_y_dout          (recomb_slim_bram_y_dout),
-        .rcmb_slim_bram_xy_dout_valid   (recomb_slim_bram_xy_dout_valid)
-    );
-    
-    reg [17:0] AB_READ[0:63];
-    reg [17:0] Q_READ[0:32];
-    reg [17:0] M_READ[0:64];
-    
-    always @(posedge clk) begin
-        //
-        if (recomb_fat_bram_xy_dout_valid)
-            //
-            case (recomb_fat_bram_xy_bank)
-                BANK_FAT_ABL: AB_READ[recomb_fat_bram_xy_addr % 32] <= recomb_fat_bram_x_dout;
-                BANK_FAT_ABH: AB_READ[32 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout;
-                BANK_FAT_ML:  M_READ[recomb_fat_bram_xy_addr % 32] <= recomb_fat_bram_x_dout;
-                BANK_FAT_MH:  M_READ[32 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout;
-                BANK_FAT_EXT: M_READ[64 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout;
-            endcase
-            //
-        if (recomb_slim_bram_xy_dout_valid)
-            //
-            case (recomb_slim_bram_xy_bank)
-                BANK_SLIM_Q: Q_READ[recomb_slim_bram_xy_addr] <= recomb_slim_bram_x_dout;
-                BANK_SLIM_EXT: if (recomb_slim_bram_xy_addr == 8'd1)
-                             Q_READ[32] <= recomb_slim_bram_x_dout;
-            endcase
-            //
-    end
-            
-
-    always @(posedge clk)
-        //
-        if (tb_fat_bram_xy_ena) begin
-            mgr_fat_bram_xy_ena  <= 1'b1;
-            mgr_fat_bram_xy_bank <= tb_fat_bram_xy_bank;
-            mgr_fat_bram_xy_addr <= tb_fat_bram_xy_addr;
-            mgr_fat_bram_x_din   <= tb_fat_bram_x_din;
-            mgr_fat_bram_y_din   <= tb_fat_bram_y_din;
-        end else if (recomb_fat_bram_xy_dout_valid) begin
-            mgr_fat_bram_xy_ena  <= 1'b1;
-            mgr_fat_bram_xy_bank <= recomb_fat_bram_xy_bank;
-            mgr_fat_bram_xy_addr <= recomb_fat_bram_xy_addr;
-            mgr_fat_bram_x_din   <= recomb_fat_bram_x_dout;
-            mgr_fat_bram_y_din   <= recomb_fat_bram_y_dout;
-        end else begin
-            mgr_fat_bram_xy_ena  <= 1'b0;
-            mgr_fat_bram_xy_bank <= 3'bXXX;
-            mgr_fat_bram_xy_addr <= 8'hXX;
-            mgr_fat_bram_x_din   <= {18{1'bX}};
-            mgr_fat_bram_y_din   <= {18{1'bX}};
-        end
-
-
-    always @(posedge clk)
-        //
-        if (tb_slim_bram_xy_ena) begin
-            mgr_slim_bram_xy_ena  <= 1'b1;
-            mgr_slim_bram_xy_bank <= tb_slim_bram_xy_bank;
-            mgr_slim_bram_xy_addr <= tb_slim_bram_xy_addr;
-            mgr_slim_bram_x_din   <= tb_slim_bram_x_din;
-            mgr_slim_bram_y_din   <= tb_slim_bram_y_din;
-        end else if (recomb_slim_bram_xy_dout_valid) begin
-            mgr_slim_bram_xy_ena  <= 1'b1;
-            mgr_slim_bram_xy_bank <= recomb_slim_bram_xy_bank;
-            mgr_slim_bram_xy_addr <= recomb_slim_bram_xy_addr;
-            mgr_slim_bram_x_din   <= recomb_slim_bram_x_dout;
-            mgr_slim_bram_y_din   <= recomb_slim_bram_y_dout;
-        end else begin
-            mgr_slim_bram_xy_ena  <= 1'b0;
-            mgr_slim_bram_xy_bank <= 3'bXXX;
-            mgr_slim_bram_xy_addr <= 8'hXX;
-            mgr_slim_bram_x_din   <= {18{1'bX}};
-            mgr_slim_bram_y_din   <= {18{1'bX}};
-        end
-
-
-    task verify_ab;
-        reg verify_ab_ok;
-        begin
-            verify_ab_ok = 1;
-            for (i=0; i<64; i=i+1)
-                if (AB_READ[i] === AB[i])
-                    $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x", i, AB[i], AB_READ[i]);
-                else begin
-                    $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x <???>", i, AB[i], AB_READ[i]);
-                    verify_ab_ok = 0;
-                end
-            if (verify_ab_ok)
-                $display("AB is OK.");
-            else
-                $display("AB is WRONG!");
-        end
-    endtask
-
-
-    task verify_q;
-        reg verify_q_ok;
-        begin
-            verify_q_ok = 1;
-            for (i=0; i<33; i=i+1)
-                if (Q_READ[i] === Q[i])
-                    $display("Q / Q_READ [%02d] = 0x%05x / 0x%05x", i, Q[i], Q_READ[i]);
-                else begin
-                    $display("Q / Q_READ [%02d] = 0x%05x / 0x%05x <???>", i, Q[i], Q_READ[i]);
-                    verify_q_ok = 0;
-                end
-            if (verify_q_ok)
-                $display("Q is OK.");
-            else
-                $display("Q is WRONG!");
-        end
-    endtask
-
-
-    task verify_m;
-        reg verify_m_ok;
-        begin
-            verify_m_ok = 1;
-            for (i=0; i<65; i=i+1)
-                if (M_READ[i] === M[i])
-                    $display("M / M_READ [%02d] = 0x%05x / 0x%05x", i, M[i], M_READ[i]);
-                else begin
-                    $display("M / M_READ [%02d] = 0x%05x / 0x%05x <???>", i, M[i], M_READ[i]);
-                    verify_m_ok = 0;
-                end
-            if (verify_m_ok)
-                $display("M is OK.");
-            else
-                $display("M is WRONG!");
-        end
-    endtask
-
-
-    wire mult_square_addr_done = mult_square_addr_surely_done_flop;
-    wire mult_triangle_addr_done = !col_is_last ? mult_triangle_addr_surely_done_flop : mult_triangle_addr_tardy_done_flop;
-    wire mult_rectangle_addr_done = mult_rectangle_addr_tardy_done_flop;
-    
-
-    always @* begin
-        //
-        fsm_state_next = FSM_STATE_IDLE;
-        //
-        case (fsm_state)
-            FSM_STATE_IDLE:                   fsm_state_next = ena                   ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE;
-                        
-            FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_0_TRIG ;
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
-            
-            FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_N_TRIG ;
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square    : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
-            
-            FSM_STATE_MULT_SQUARE_HOLDOFF:    fsm_state_next =                         recomb_rdy ? FSM_STATE_MULT_TRIANGLE_COL_0_INIT : FSM_STATE_MULT_SQUARE_HOLDOFF;
-
-            FSM_STATE_MULT_TRIANGLE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_0_TRIG ;
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_0_BUSY ;
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY: fsm_state_next = mult_triangle_addr_done ? FSM_STATE_MULT_TRIANGLE_COL_N_INIT : FSM_STATE_MULT_TRIANGLE_COL_0_BUSY;     
-            
-            FSM_STATE_MULT_TRIANGLE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_N_TRIG ;
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_N_BUSY ;
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: fsm_state_next = mult_triangle_addr_done ? fsm_state_after_mult_triangle : FSM_STATE_MULT_TRIANGLE_COL_N_BUSY;
-            
-            FSM_STATE_MULT_TRIANGLE_HOLDOFF:    fsm_state_next =                         recomb_rdy ? FSM_STATE_MULT_RECTANGLE_COL_0_INIT : FSM_STATE_MULT_TRIANGLE_HOLDOFF;
-
-            FSM_STATE_MULT_RECTANGLE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_0_TRIG ;
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_0_BUSY ;
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY: fsm_state_next = mult_rectangle_addr_done ? FSM_STATE_MULT_RECTANGLE_COL_N_INIT : FSM_STATE_MULT_RECTANGLE_COL_0_BUSY;     
-            
-            FSM_STATE_MULT_RECTANGLE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_N_TRIG ;
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_N_BUSY ;
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: fsm_state_next = mult_rectangle_addr_done ? fsm_state_after_mult_rectangle : FSM_STATE_MULT_RECTANGLE_COL_N_BUSY;
-            
-            FSM_STATE_MULT_RECTANGLE_HOLDOFF:    fsm_state_next =                         recomb_rdy ? FSM_STATE_STOP : FSM_STATE_MULT_RECTANGLE_HOLDOFF;
-            
-            default:                          fsm_state_next =                         FSM_STATE_IDLE                   ;
-
-        endcase
-        //
-    end
-    
-    
-    
-    //
-    // Reductor
-    //
-    reg reductor_ena = 1'b0;
-    
-    always @(posedge clk)
-        //
-        if (!reductor_ena)
-            case (fsm_state)
-                FSM_STATE_MULT_RECTANGLE_COL_0_INIT: reductor_ena <= 1'b1;  
-            endcase
-        else begin
-        
-        
-        end
-    
-    
-    reg recomb_fat_bram_xy_dout_valid_dly1;
-    reg recomb_fat_bram_xy_dout_valid_dly2;
-    reg recomb_fat_bram_xy_dout_valid_dly3;
-
-    reg [2:0] recomb_fat_bram_xy_bank_dly1;
-    reg [2:0] recomb_fat_bram_xy_bank_dly2;
-    reg [2:0] recomb_fat_bram_xy_bank_dly3;
-
-    reg [7:0] recomb_fat_bram_xy_addr_dly1;
-    reg [7:0] recomb_fat_bram_xy_addr_dly2;
-    reg [7:0] recomb_fat_bram_xy_addr_dly3;
-    
-    always @(posedge clk) begin
-        //
-        recomb_fat_bram_xy_dout_valid_dly1 <= recomb_fat_bram_xy_dout_valid;
-        recomb_fat_bram_xy_dout_valid_dly2 <= recomb_fat_bram_xy_dout_valid_dly1;
-        recomb_fat_bram_xy_dout_valid_dly3 <= recomb_fat_bram_xy_dout_valid_dly2;
-        //
-    end
-    
-    reg [17:0] recomb_fat_bram_x_dout_dly1;
-    reg [17:0] recomb_fat_bram_x_dout_dly2;
-    reg [17:0] recomb_fat_bram_x_dout_dly3;
-    
-    always @(posedge clk) begin
-        //
-        if (recomb_fat_bram_xy_dout_valid) recomb_fat_bram_x_dout_dly1 <= recomb_fat_bram_x_dout;
-        if (recomb_fat_bram_xy_dout_valid_dly1) recomb_fat_bram_x_dout_dly2 <= recomb_fat_bram_x_dout_dly1;
-        if (recomb_fat_bram_xy_dout_valid_dly2) recomb_fat_bram_x_dout_dly3 <= recomb_fat_bram_x_dout_dly2;
-        //
-    end
-    
-    always @(posedge clk) begin
-        //
-        if (recomb_fat_bram_xy_dout_valid) recomb_fat_bram_xy_bank_dly1 <= recomb_fat_bram_xy_bank;
-        if (recomb_fat_bram_xy_dout_valid_dly1) recomb_fat_bram_xy_bank_dly2 <= recomb_fat_bram_xy_bank_dly1;
-        if (recomb_fat_bram_xy_dout_valid_dly2) recomb_fat_bram_xy_bank_dly3 <= recomb_fat_bram_xy_bank_dly2;
-        //
-    end
-    
-    always @(posedge clk) begin
-        //
-        if (recomb_fat_bram_xy_dout_valid) recomb_fat_bram_xy_addr_dly1 <= recomb_fat_bram_xy_addr;
-        if (recomb_fat_bram_xy_dout_valid_dly1) recomb_fat_bram_xy_addr_dly2 <= recomb_fat_bram_xy_addr_dly1;
-        if (recomb_fat_bram_xy_dout_valid_dly2) recomb_fat_bram_xy_addr_dly3 <= recomb_fat_bram_xy_addr_dly2;
-        //
-    end
-    
-   
-    reg [ 1:0] reductor_fat_bram_x_lsb_carry;
-    reg [15:0] reductor_fat_bram_x_lsb_dummy;
-    reg [17:0] reductor_fat_bram_x_lsb_dout;
-
-    reg [17:0] reductor_fat_bram_x_msb_dout;
-    
-    always @(posedge clk)
-        //
-        if (!reductor_ena) begin
-            reductor_fat_bram_x_lsb_carry <= 2'b00;
-        end else if (recomb_fat_bram_xy_dout_valid_dly3) begin
-            
-            case (recomb_fat_bram_xy_bank_dly3)
-                BANK_FAT_ML: {reductor_fat_bram_x_lsb_carry, reductor_fat_bram_x_lsb_dummy} <= recomb_fat_bram_x_dout_dly3 + mac_fat_bram_x_dout_aux + reductor_fat_bram_x_lsb_carry;
-                BANK_FAT_MH:
-                    if (recomb_fat_bram_xy_addr_dly3 == 8'd0)
-                        {reductor_fat_bram_x_lsb_carry, reductor_fat_bram_x_lsb_dummy} <= recomb_fat_bram_x_dout_dly3 + mac_fat_bram_x_dout_aux + reductor_fat_bram_x_lsb_carry;
-                    else if (recomb_fat_bram_xy_addr_dly3 == 8'd1)
-                        reductor_fat_bram_x_msb_dout <= recomb_fat_bram_x_dout_dly3 + mac_fat_bram_x_dout_aux + reductor_fat_bram_x_lsb_carry; 
-                    else
-                        reductor_fat_bram_x_msb_dout <= recomb_fat_bram_x_dout_dly3 + mac_fat_bram_x_dout_aux;
-                BANK_FAT_EXT:
-                    reductor_fat_bram_x_msb_dout <= recomb_fat_bram_x_dout_dly3;
-            endcase
-            // 
-        end
-    /*
-
-
-    reg [17:0] recomb_fat_bram_x_dout_dly1;
-    reg [17:0] recomb_fat_bram_x_dout_dly2;
-
-    reg [ 2:0] recomb_fat_bram_xy_bank_dly1;
-    reg [ 2:0] recomb_fat_bram_xy_bank_dly2;
-    
-    reg [1:0] reductor_fat_bram_x_carry;
-    
-    reg [15:0] reductor_fat_bram_x_dummy;
-    reg [17:0] reductor_fat_bram_x_dout;
-    reg        reductor_fat_bram_xy_dout_valid;
-    
-    always @(posedge clk)
-        //
-        if (reductor_ena) begin
-        
-            if (recomb_fat_bram_xy_dout_valid) begin
-                recomb_fat_bram_x_dout_dly1 <= recomb_fat_bram_x_dout;
-                recomb_fat_bram_xy_bank_dly1 <= recomb_fat_bram_xy_bank;
-            end
-                
-            if (mac_fat_bram_xy_ena_aux) begin
-                recomb_fat_bram_x_dout_dly2 <= recomb_fat_bram_x_dout_dly1;
-                recomb_fat_bram_xy_bank_dly2 <= recomb_fat_bram_xy_bank_dly1;
-            end
-
-            if (mac_fat_bram_xy_reg_ena_aux)
-                case (recomb_fat_bram_xy_bank_dly2)
-                    BANK_FAT_ML: {reductor_fat_bram_x_carry, reductor_fat_bram_x_dummy} <= recomb_fat_bram_x_dout_dly2 + mac_fat_bram_x_dout_aux + reductor_fat_bram_x_carry;
-                endcase
-    
-            //reductor_fat_bram_xy_dout_valid <= mac_fat_bram_xy_reg_ena_aux;
-            
-        end else begin
-        
-            reductor_fat_bram_x_carry <= 2'b00;
-            reductor_fat_bram_xy_dout_valid <= 1'b0;
-        
-        end
-    */
-    
-    
-    
-endmodule
-
diff --git a/rtl/modexpng_core_top.v b/rtl/modexpng_core_top.v
index c78a969..dea7f0a 100644
--- a/rtl/modexpng_core_top.v
+++ b/rtl/modexpng_core_top.v
@@ -87,7 +87,9 @@ module modexpng_core_top
     wire uop_opcode_is_wrk    = (uop_data_opcode == UOP_OPCODE_PROPAGATE_CARRIES   ) ||
                                 (uop_data_opcode == UOP_OPCODE_COPY_CRT_Y2X        ) ||
                                 (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_INIT ) ||
-                                (uop_data_opcode == UOP_OPCODE_COPY_LADDERS_X2Y    ) ;
+                                (uop_data_opcode == UOP_OPCODE_COPY_LADDERS_X2Y    ) ||
+                                (uop_data_opcode == UOP_OPCODE_CROSS_LADDERS_X2Y   ) ||
+                                (uop_data_opcode == UOP_OPCODE_MODULAR_SUBTRACT    ) ;
                                 
     wire uop_loop_now;
     
@@ -1113,8 +1115,15 @@ module modexpng_core_top
                     wrk_sel_narrow_out  <= uop_data_sel_narrow_out;
                 end
                 //
+                UOP_OPCODE_MODULAR_SUBTRACT: begin
+                    wrk_sel_wide_out   <= uop_data_sel_wide_out;
+                    wrk_sel_narrow_in  <= uop_data_sel_narrow_in;
+                    wrk_sel_narrow_out <= uop_data_sel_narrow_out;
+                end
+                //
                 UOP_OPCODE_COPY_CRT_Y2X,
-                UOP_OPCODE_COPY_LADDERS_X2Y: begin
+                UOP_OPCODE_COPY_LADDERS_X2Y,
+                UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                     wrk_sel_wide_in    <= uop_data_sel_wide_in;
                     wrk_sel_wide_out   <= uop_data_sel_wide_out;
                     wrk_sel_narrow_in  <= uop_data_sel_narrow_in;
@@ -1157,7 +1166,8 @@ module modexpng_core_top
                 //
                 UOP_OPCODE_PROPAGATE_CARRIES,
                 UOP_OPCODE_COPY_CRT_Y2X,
-                UOP_OPCODE_COPY_LADDERS_X2Y:
+                UOP_OPCODE_COPY_LADDERS_X2Y,
+                UOP_OPCODE_CROSS_LADDERS_X2Y:
                     wrk_word_index_last <= uop_npq_is_n ? word_index_last_n : word_index_last_pq;
                 //
                 UOP_OPCODE_MODULAR_REDUCE_INIT: begin
@@ -1171,6 +1181,10 @@ module modexpng_core_top
                     {rdct_word_index_last_x,       rdct_word_index_last_y      } <= {2{word_index_last_pq       }};
                 end
                 //
+                UOP_OPCODE_MODULAR_SUBTRACT: begin
+                    wrk_word_index_last <= uop_npq_is_n ? word_index_last_n : word_index_last_pq;
+                end
+                //
                 UOP_OPCODE_LADDER_INIT: begin
                     io_mgr_word_index_last <= OP_ADDR_LADDER_LAST;
                     io_mgr_ladder_steps    <= crt_mode ? bit_index_last_pq : bit_index_last_n;
diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v
index 269ef98..74c939b 100644
--- a/rtl/modexpng_general_worker.v
+++ b/rtl/modexpng_general_worker.v
@@ -1,70 +1,22 @@
 module modexpng_general_worker
 (
-    clk,
-    rst,
-    
-    ena,
-    rdy,
-    
-    sel_narrow_in,
-    sel_narrow_out,
-    sel_wide_in,
-    sel_wide_out,
-    
+    clk, rst,
+    ena, rdy,
+    sel_narrow_in, sel_narrow_out,
+    sel_wide_in,   sel_wide_out,
     opcode,
-    
-    word_index_last,
-    word_index_last_half,
-    
-    wrk_rd_wide_xy_ena_x,
-    wrk_rd_wide_xy_bank_x,
-    wrk_rd_wide_xy_addr_x,
-    wrk_rd_wide_x_din_x,
-    wrk_rd_wide_y_din_x,
-
-    wrk_rd_narrow_xy_ena_x,
-    wrk_rd_narrow_xy_bank_x,
-    wrk_rd_narrow_xy_addr_x,
-    wrk_rd_narrow_x_din_x,
-    wrk_rd_narrow_y_din_x,
-
-    wrk_rd_wide_xy_ena_y,
-    wrk_rd_wide_xy_bank_y,
-    wrk_rd_wide_xy_addr_y,
-    wrk_rd_wide_x_din_y,
-    wrk_rd_wide_y_din_y,
-
-    wrk_rd_narrow_xy_ena_y,
-    wrk_rd_narrow_xy_bank_y,
-    wrk_rd_narrow_xy_addr_y,
-    wrk_rd_narrow_x_din_y,
-    wrk_rd_narrow_y_din_y,
-    
-    wrk_wr_wide_xy_ena_x,
-    wrk_wr_wide_xy_bank_x,
-    wrk_wr_wide_xy_addr_x,
-    wrk_wr_wide_x_dout_x,
-    wrk_wr_wide_y_dout_x,
-
-    wrk_wr_narrow_xy_ena_x,
-    wrk_wr_narrow_xy_bank_x,
-    wrk_wr_narrow_xy_addr_x,
-    wrk_wr_narrow_x_dout_x,
-    wrk_wr_narrow_y_dout_x,
-
-    wrk_wr_wide_xy_ena_y,
-    wrk_wr_wide_xy_bank_y,
-    wrk_wr_wide_xy_addr_y,
-    wrk_wr_wide_x_dout_y,
-    wrk_wr_wide_y_dout_y,
-
-    wrk_wr_narrow_xy_ena_y,
-    wrk_wr_narrow_xy_bank_y,
-    wrk_wr_narrow_xy_addr_y,
-    wrk_wr_narrow_x_dout_y,
-    wrk_wr_narrow_y_dout_y
+    word_index_last, word_index_last_half,
+    wrk_rd_wide_xy_ena_x,   wrk_rd_wide_xy_bank_x,   wrk_rd_wide_xy_addr_x,   wrk_rd_wide_x_din_x,    wrk_rd_wide_y_din_x,
+    wrk_rd_narrow_xy_ena_x, wrk_rd_narrow_xy_bank_x, wrk_rd_narrow_xy_addr_x, wrk_rd_narrow_x_din_x,  wrk_rd_narrow_y_din_x,
+    wrk_rd_wide_xy_ena_y,   wrk_rd_wide_xy_bank_y,   wrk_rd_wide_xy_addr_y,   wrk_rd_wide_x_din_y,    wrk_rd_wide_y_din_y,
+    wrk_rd_narrow_xy_ena_y, wrk_rd_narrow_xy_bank_y, wrk_rd_narrow_xy_addr_y, wrk_rd_narrow_x_din_y,  wrk_rd_narrow_y_din_y,
+    wrk_wr_wide_xy_ena_x,   wrk_wr_wide_xy_bank_x,   wrk_wr_wide_xy_addr_x,   wrk_wr_wide_x_dout_x,   wrk_wr_wide_y_dout_x,
+    wrk_wr_narrow_xy_ena_x, wrk_wr_narrow_xy_bank_x, wrk_wr_narrow_xy_addr_x, wrk_wr_narrow_x_dout_x, wrk_wr_narrow_y_dout_x,
+    wrk_wr_wide_xy_ena_y,   wrk_wr_wide_xy_bank_y,   wrk_wr_wide_xy_addr_y,   wrk_wr_wide_x_dout_y,   wrk_wr_wide_y_dout_y,
+    wrk_wr_narrow_xy_ena_y, wrk_wr_narrow_xy_bank_y, wrk_wr_narrow_xy_addr_y, wrk_wr_narrow_x_dout_y, wrk_wr_narrow_y_dout_y
 );
 
+
     //
     // Headers
     //
@@ -143,30 +95,44 @@ module modexpng_general_worker
     //
     // FSM Declaration
     //
-    localparam [4:0] WRK_FSM_STATE_IDLE             = 5'h00;
+    localparam [5:0] WRK_FSM_STATE_IDLE             = 6'h00;
     
-    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1     = 5'h01;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2     = 5'h02;
-    localparam [4:0] WRK_FSM_STATE_BUSY             = 5'h03;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1    = 5'h05;    // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug!
-    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2    = 5'h06;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1     = 6'h01;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2     = 6'h02;
+    localparam [5:0] WRK_FSM_STATE_BUSY             = 6'h03;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1    = 6'h05;    // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug!
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2    = 6'h06;
     
-    localparam [4:0] WRK_FSM_STATE_STOP             = 5'h07;
+    localparam [5:0] WRK_FSM_STATE_STOP             = 6'h07;
     
-    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M1  = 5'h10;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M2  = 5'h11;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M1  = 5'h12;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M2  = 5'h13;
-    localparam [4:0] WRK_FSM_STATE_BUSY_M1          = 5'h14;
-    localparam [4:0] WRK_FSM_STATE_BUSY_M2          = 5'h15;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 5'h16;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 5'h17;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 5'h18;
-    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 5'h19;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M1  = 6'h10;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M2  = 6'h11;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M1  = 6'h12;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M2  = 6'h13;
+    localparam [5:0] WRK_FSM_STATE_BUSY_M1          = 6'h14;
+    localparam [5:0] WRK_FSM_STATE_BUSY_M2          = 6'h15;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 6'h16;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 6'h17;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 6'h18;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 6'h19;
+
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_TP  = 6'h20;    // *_TP: states of the two_pass sweep (wrk_fsm_state_next_two_pass); 6'h20 and up need the sixth state bit
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_TP  = 6'h21;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE3_TP  = 6'h22;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE4_TP  = 6'h23;
+    localparam [5:0] WRK_FSM_STATE_BUSY_TP          = 6'h24;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_TP = 6'h25;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_TP = 6'h26;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST3_TP = 6'h27;
+    localparam [5:0] WRK_FSM_STATE_LATENCY_POST4_TP = 6'h28;
+    localparam [5:0] WRK_FSM_STATE_HOLDOFF_TP       = 6'h29;
     
-    reg [4:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;
-    reg [4:0] wrk_fsm_state_next_one_pass;         // single address space sweep
-    reg [4:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y)
+    reg [5:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;  // current state, starts in IDLE
+    reg [5:0] wrk_fsm_state_next_one_pass;         // single address space sweep
+    reg [5:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y and cross_ladders_x2y)
+    reg [5:0] wrk_fsm_state_next_two_pass;         // two address space sweeps (needed by modular_subtract)
+    reg       wrk_fsm_two_pass_pass;               // 0=first pass, 1=second pass
+    reg       wrk_fsm_two_pass_pass_dly;           // presumably wrk_fsm_two_pass_pass delayed (same encoding: 0=first pass, 1=second pass) - TODO confirm against the driving logic
 
 
     // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps)
@@ -292,37 +258,36 @@ module modexpng_general_worker
     reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1;
     reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2;
     reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3;
+    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly1;
+    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly2;
     
     reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1;
     reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2;
     reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3;
-
+    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly1;
+    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly2;
     
     always @(posedge clk) begin
         //
-        {rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x};
-        {rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y};        
-        //
-        {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2};
-        {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2};                
-        //
-        {rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x};
-        {rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y};
+        {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x};
+        {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y};        
         //
-        {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2};
-        {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2};
+        {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x};
+        {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y};
         //
         {wrk_rd_wide_x_din_x_dly3, wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1, wrk_rd_wide_x_din_x};
         {wrk_rd_wide_x_din_y_dly3, wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1} <= {wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1, wrk_rd_wide_x_din_y};
         //
         {wrk_rd_narrow_x_din_x_dly3, wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1, wrk_rd_narrow_x_din_x};
-        {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y};    
+        {wrk_rd_narrow_y_din_x_dly2, wrk_rd_narrow_y_din_x_dly1} <= {wrk_rd_narrow_y_din_x_dly1, wrk_rd_narrow_y_din_x};
+        {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y};
+        {wrk_rd_narrow_y_din_y_dly2, wrk_rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y_dly1, wrk_rd_narrow_y_din_y};
         //
     end
         
 
     //
-    // Read Enable Logic
+    // Source Read Enable Logic
     //
     
     task _update_wide_xy_rd_en;   input _en; {rd_wide_xy_ena_x,   rd_wide_xy_ena_y  } <= {2{_en}}; endtask
@@ -340,48 +305,54 @@ module modexpng_general_worker
             //
             disable_wide_xy_rd_en;
             disable_narrow_xy_rd_en;
-            /*
-            rd_wide_xy_ena_x    <= 1'b0;
-            rd_wide_xy_ena_y    <= 1'b0;
-            rd_narrow_xy_ena_x  <= 1'b0;
-            rd_narrow_xy_ena_y  <= 1'b0;
-            */
+            //
         end else begin
             //
             disable_wide_xy_rd_en;
             disable_narrow_xy_rd_en;
             //
-            //rd_wide_xy_ena_x    <= 1'b0;
-            //rd_wide_xy_ena_y    <= 1'b0;
-            //rd_narrow_xy_ena_x  <= 1'b0;
-            //rd_narrow_xy_ena_y  <= 1'b0;
+            // one_pass
             //
-            case (opcode)
+            case (wrk_fsm_state_next_one_pass)
                 //
-                UOP_OPCODE_PROPAGATE_CARRIES,
-                UOP_OPCODE_OUTPUT_FROM_NARROW,
-                UOP_OPCODE_MODULAR_REDUCE_INIT:
+                WRK_FSM_STATE_LATENCY_PRE1,
+                WRK_FSM_STATE_LATENCY_PRE2,
+                WRK_FSM_STATE_BUSY:
                     //
-                    case (wrk_fsm_state_next_one_pass)
+                    case (opcode)
                         //
-                        WRK_FSM_STATE_LATENCY_PRE1,
-                        WRK_FSM_STATE_LATENCY_PRE2,
-                        WRK_FSM_STATE_BUSY:
+                        UOP_OPCODE_PROPAGATE_CARRIES,
+                        UOP_OPCODE_OUTPUT_FROM_NARROW,
+                        UOP_OPCODE_MODULAR_REDUCE_INIT:
                             //
                             enable_narrow_xy_rd_en;
-                            //{rd_narrow_xy_ena_x, rd_narrow_xy_ena_y} <= {2{1'b1}};
                             //
+                        UOP_OPCODE_COPY_CRT_Y2X: begin
+                            //
+                            enable_wide_xy_rd_en;
+                            enable_narrow_xy_rd_en;                            
+                            //
+                        end
                         //
                     endcase
-                    //
                 //
-                UOP_OPCODE_COPY_CRT_Y2X:
+            endcase
+            //
+            // one_pass_meander
+            //
+            case (wrk_fsm_state_next_one_pass_meander)
+                //
+                WRK_FSM_STATE_LATENCY_PRE1_M1,
+                WRK_FSM_STATE_LATENCY_PRE1_M2,
+                WRK_FSM_STATE_LATENCY_PRE2_M1,
+                WRK_FSM_STATE_LATENCY_PRE2_M2,
+                WRK_FSM_STATE_BUSY_M1,
+                WRK_FSM_STATE_BUSY_M2:
                     //
-                    case (wrk_fsm_state_next_one_pass)
+                    case (opcode)
                         //
-                        WRK_FSM_STATE_LATENCY_PRE1,
-                        WRK_FSM_STATE_LATENCY_PRE2,
-                        WRK_FSM_STATE_BUSY: begin
+                        UOP_OPCODE_COPY_LADDERS_X2Y,
+                        UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                             //
                             enable_wide_xy_rd_en;
                             enable_narrow_xy_rd_en;
@@ -389,24 +360,29 @@ module modexpng_general_worker
                         end
                         //
                     endcase
+                //
+            endcase
+            //
+            // two_pass
+            //
+            case (wrk_fsm_state_next_two_pass)
+                //
+                WRK_FSM_STATE_LATENCY_PRE1_TP,
+                WRK_FSM_STATE_LATENCY_PRE2_TP,
+                WRK_FSM_STATE_LATENCY_PRE3_TP,
+                WRK_FSM_STATE_LATENCY_PRE4_TP,
+                WRK_FSM_STATE_BUSY_TP:
                     //
-                UOP_OPCODE_COPY_LADDERS_X2Y:
-                    //
-                    case (wrk_fsm_state_next_one_pass_meander)
-                        //
-                        WRK_FSM_STATE_LATENCY_PRE1_M1,
-                        WRK_FSM_STATE_LATENCY_PRE1_M2,
-                        WRK_FSM_STATE_LATENCY_PRE2_M1,
-                        WRK_FSM_STATE_LATENCY_PRE2_M2,
-                        WRK_FSM_STATE_BUSY_M1,
-                        WRK_FSM_STATE_BUSY_M2: begin
-                            //
-                            enable_wide_xy_rd_en;
-                            enable_narrow_xy_rd_en;
+                    case (opcode)
+                        UOP_OPCODE_MODULAR_SUBTRACT:
                             //
-                        end
+                            if (!wrk_fsm_two_pass_pass) begin
+                                enable_wide_xy_rd_en;
+                                enable_narrow_xy_rd_en;
+                            end else
+                                enable_narrow_xy_rd_en;
                         //
-                    endcase                    
+                    endcase
                 //
             endcase
             //
@@ -414,7 +390,7 @@ module modexpng_general_worker
 
 
     //
-    // Write Enable Logic
+    // Destination Write Enable Logic
     //
     
     task _update_wide_xy_wr_en;   input _en; {wr_wide_xy_ena_x,   wr_wide_xy_ena_y  } <= {2{_en}}; endtask
@@ -432,71 +408,53 @@ module modexpng_general_worker
             //
             disable_wide_xy_wr_en;
             disable_narrow_xy_wr_en;
-            //wr_wide_xy_ena_x    <= 1'b0;
-            //wr_wide_xy_ena_y    <= 1'b0;
-            //wr_narrow_xy_ena_x  <= 1'b0;
-            //wr_narrow_xy_ena_y  <= 1'b0;
             //
         end else begin
             //
             disable_wide_xy_wr_en;
             disable_narrow_xy_wr_en;
             //
-            //wr_wide_xy_ena_x    <= 1'b0;
-            //wr_wide_xy_ena_y    <= 1'b0;
-            //wr_narrow_xy_ena_x  <= 1'b0;
-            //wr_narrow_xy_ena_y  <= 1'b0;
+            // one_pass
             //
-            case (opcode)
+            case (wrk_fsm_state)
                 //
-                UOP_OPCODE_PROPAGATE_CARRIES:
+                WRK_FSM_STATE_BUSY,
+                WRK_FSM_STATE_LATENCY_POST1,
+                WRK_FSM_STATE_LATENCY_POST2:
                     //
-                    case (wrk_fsm_state)
+                    case (opcode)
                         //
-                        WRK_FSM_STATE_BUSY,
-                        WRK_FSM_STATE_LATENCY_POST1,
-                        WRK_FSM_STATE_LATENCY_POST2:
+                        UOP_OPCODE_PROPAGATE_CARRIES:
                             //
                             enable_narrow_xy_wr_en;
                             //
-                        //
-                    endcase
-                    //
-                UOP_OPCODE_COPY_CRT_Y2X:
-                    //
-                    case (wrk_fsm_state)
-                        //
-                        WRK_FSM_STATE_BUSY,
-                        WRK_FSM_STATE_LATENCY_POST1,
-                        WRK_FSM_STATE_LATENCY_POST2: begin
+                        UOP_OPCODE_COPY_CRT_Y2X: begin
                             //
                             enable_wide_xy_wr_en;
-                            enable_narrow_xy_wr_en;
+                            enable_narrow_xy_wr_en;                            
                             //
                         end
                         //
-                    endcase
-                    //
-                UOP_OPCODE_MODULAR_REDUCE_INIT:
-                    //
-                    case (wrk_fsm_state)
-                        //
-                        WRK_FSM_STATE_BUSY,
-                        WRK_FSM_STATE_LATENCY_POST1,
-                        WRK_FSM_STATE_LATENCY_POST2:
+                        UOP_OPCODE_MODULAR_REDUCE_INIT:
                             //
                             enable_wide_xy_wr_en;
-                            //
                         //
                     endcase
+                //
+            endcase
+            //
+            // one_pass_meander
+            //
+            case (wrk_fsm_state)
+                //
+                WRK_FSM_STATE_BUSY_M2,
+                WRK_FSM_STATE_LATENCY_POST1_M2,
+                WRK_FSM_STATE_LATENCY_POST2_M2:
                     //
-                UOP_OPCODE_COPY_LADDERS_X2Y:
-                    //
-                    case (wrk_fsm_state)
+                    case (opcode)
                         //
-                        WRK_FSM_STATE_BUSY_M2,
-                        WRK_FSM_STATE_LATENCY_POST1_M2,
-                        WRK_FSM_STATE_LATENCY_POST2_M2: begin
+                        UOP_OPCODE_COPY_LADDERS_X2Y,
+                        UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                             //
                             enable_wide_xy_wr_en;
                             enable_narrow_xy_wr_en;
@@ -507,12 +465,42 @@ module modexpng_general_worker
                 //
             endcase
             //
+            // two_pass
+            //
+            case (wrk_fsm_state)
+                //
+                WRK_FSM_STATE_BUSY_TP,
+                WRK_FSM_STATE_LATENCY_POST1_TP,
+                WRK_FSM_STATE_LATENCY_POST2_TP,
+                WRK_FSM_STATE_LATENCY_POST3_TP,
+                WRK_FSM_STATE_LATENCY_POST4_TP:
+                    //
+                    case (opcode)
+                        //
+                        UOP_OPCODE_MODULAR_SUBTRACT:
+                            //
+                            if (!wrk_fsm_two_pass_pass)
+                                enable_narrow_xy_wr_en;
+                            else begin
+                                enable_wide_xy_wr_en;
+                                enable_narrow_xy_wr_en;
+                            end
+                        //
+                    endcase
+                //
+            endcase
+            //
         end
 
 
     //
-    // Data Logic
+    // Source to Destination Data Logic
+    //
+    
+    //
+    // UOP_OPCODE_PROPAGATE_CARRIES
     //
+
     reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r;
     reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r;
     reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r;
@@ -523,112 +511,300 @@ module modexpng_general_worker
     wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r};
     wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r};
     
+    wire [CARRY_W -1:0] rd_narrow_x_din_x_w_cry_msb = rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] rd_narrow_y_din_x_w_cry_msb = rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] rd_narrow_x_din_y_w_cry_msb = rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W];
+    wire [CARRY_W -1:0] rd_narrow_y_din_y_w_cry_msb = rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W];
+    
     wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]};
     wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]};
     wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]};
     wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]};
     
+    task update_wide_dout;  // schedules all four wide write-data registers in one non-blocking assignment
+        input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y;  // positional: wr_wide_x_dout_x, wr_wide_y_dout_x, wr_wide_x_dout_y, wr_wide_y_dout_y
+        {wr_wide_x_dout_x, wr_wide_y_dout_x, wr_wide_x_dout_y, wr_wide_y_dout_y} <=
+        {        x_x,              y_x,              x_y,              y_y     };  // a later call in the same clock edge overrides an earlier one (used for the WORD_EXT_DNC defaults)
+    endtask
+    
+    task update_narrow_dout;  // schedules all four narrow write-data registers in one non-blocking assignment
+        input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y;  // positional: wr_narrow_x_dout_x, wr_narrow_y_dout_x, wr_narrow_x_dout_y, wr_narrow_y_dout_y
+        {wr_narrow_x_dout_x, wr_narrow_y_dout_x, wr_narrow_x_dout_y, wr_narrow_y_dout_y} <=
+        {          x_x,                y_x,                x_y,                y_y     };  // a later call in the same clock edge overrides an earlier one (used for the WORD_EXT_DNC defaults)
+    endtask
+    
+    task update_narrow_carries;  // schedules all four narrow carry registers in one non-blocking assignment
+        input [CARRY_W-1:0] x_x_cry, y_x_cry, x_y_cry, y_y_cry;  // positional: rd_narrow_{x,y}_din_x_cry_r, then rd_narrow_{x,y}_din_y_cry_r
+        {rd_narrow_x_din_x_cry_r, rd_narrow_y_din_x_cry_r, rd_narrow_x_din_y_cry_r, rd_narrow_y_din_y_cry_r} <=
+        {          x_x_cry,                 y_x_cry,                 x_y_cry,                 y_y_cry      };
+    endtask
+        
+    
+    always @(posedge clk)
+        // Carry registers of UOP_OPCODE_PROPAGATE_CARRIES; for any other opcode or state they keep their value.
+        if (opcode == UOP_OPCODE_PROPAGATE_CARRIES)
+            //
+            case (wrk_fsm_state)
+                //
+                WRK_FSM_STATE_LATENCY_PRE2:
+                    // load CARRY_ZERO into all four carry registers before the first word is summed
+                    update_narrow_carries(CARRY_ZERO, CARRY_ZERO, CARRY_ZERO, CARRY_ZERO);
+                //
+                WRK_FSM_STATE_BUSY,
+                WRK_FSM_STATE_LATENCY_POST1:
+                    // keep the upper CARRY_W bits of each (input word + previous carry) sum for the next word
+                    update_narrow_carries(rd_narrow_x_din_x_w_cry_msb,
+                                          rd_narrow_y_din_x_w_cry_msb,
+                                          rd_narrow_x_din_y_w_cry_msb,
+                                          rd_narrow_y_din_y_w_cry_msb);
+                // LATENCY_POST2 is not listed: per the TODO in the previous revision, the last word needs no carry update
+            endcase
+
+
+    //
+    // UOP_OPCODE_MODULAR_SUBTRACT
+    //
+    
+    reg [WORD_W:0] modsub_x_ab; 
+    reg [WORD_W:0] modsub_y_ab;
+
+    reg [WORD_W:0] modsub_x_ab_dly; 
+    reg [WORD_W:0] modsub_y_ab_dly;
+
+    reg [WORD_W:0] modsub_x_abn; 
+    reg [WORD_W:0] modsub_y_abn;    
+    
+    reg            modsub_x_ab_mask_now;
+    reg            modsub_y_ab_mask_now;
+
+    reg            modsub_x_abn_mask_now;
+    reg            modsub_y_abn_mask_now;
+
+    reg            modsub_x_borrow_r;
+    reg            modsub_y_borrow_r;
+    
+    wire           modsub_x_ab_masked = modsub_x_ab_mask_now ? 1'b0 : modsub_x_ab[WORD_W];  
+    wire           modsub_y_ab_masked = modsub_y_ab_mask_now ? 1'b0 : modsub_y_ab[WORD_W];
+
+    wire           modsub_x_abn_masked = modsub_x_abn_mask_now ? 1'b0 : modsub_x_abn[WORD_W];  
+    wire           modsub_y_abn_masked = modsub_y_abn_mask_now ? 1'b0 : modsub_y_abn[WORD_W];
+
+    wire [WORD_W:0] modsub_x_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
+    wire [WORD_W:0] modsub_y_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_y_din_x[WORD_W-1:0]};
+    wire [WORD_W:0] modsub_x_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
+    wire [WORD_W:0] modsub_y_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_y_din_y[WORD_W-1:0]};
+    
+    wire [WORD_W:0] modsub_x_wide_x_lsb_pad = {1'b0, wrk_rd_wide_x_din_x_dly1[WORD_W-1:0]};
+    wire [WORD_W:0] modsub_x_wide_y_lsb_pad = {1'b0, wrk_rd_wide_x_din_y_dly1[WORD_W-1:0]};
+    
+    wire [WORD_EXT_W -1:0] modsub_x_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_x_ab_dly[WORD_W-1:0]};  
+    wire [WORD_EXT_W -1:0] modsub_y_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_y_ab_dly[WORD_W-1:0]};
+
+    wire [WORD_EXT_W -1:0] modsub_x_abn_trunc = {{CARRY_W{1'b0}}, modsub_x_abn[WORD_W-1:0]};  
+    wire [WORD_EXT_W -1:0] modsub_y_abn_trunc = {{CARRY_W{1'b0}}, modsub_y_abn[WORD_W-1:0]};
+    
+    wire [WORD_EXT_W -1:0] modsub_x_mux = !modsub_x_borrow_r ? wrk_rd_narrow_x_din_x_dly2 : wrk_rd_narrow_y_din_x_dly2;
+    wire [WORD_EXT_W -1:0] modsub_y_mux = !modsub_y_borrow_r ? wrk_rd_narrow_x_din_y_dly2 : wrk_rd_narrow_y_din_y_dly2;
+
+    wire [WORD_W:0] modsub_x_ab_lsb_pad = {1'b0, modsub_x_ab[WORD_W-1:0]};
+    wire [WORD_W:0] modsub_y_ab_lsb_pad = {1'b0, modsub_y_ab[WORD_W-1:0]};
+    
+    task update_modsub_ab;  // one (WORD_W+1)-bit subtraction step for each of the two halves (modsub_x_ab from *_din_x, modsub_y_ab from *_din_y)
+        begin  // narrow "x" word minus narrow "y" word (low WORD_W bits, zero-padded by one bit) minus bit WORD_W of the previous result, which reads 0 while *_ab_mask_now is set
+            modsub_x_ab <= modsub_x_narrow_x_lsb_pad - modsub_y_narrow_x_lsb_pad - modsub_x_ab_masked;
+            modsub_y_ab <= modsub_x_narrow_y_lsb_pad - modsub_y_narrow_y_lsb_pad - modsub_y_ab_masked;
+        end
+    endtask
+
+    task update_modsub_abn;  // one (WORD_W+1)-bit addition step for each of the two halves; the "n" operand is presumably the modulus word - TODO confirm
+        begin  // low WORD_W bits of the current modsub_*_ab plus the one-cycle-delayed wide "x" word plus bit WORD_W of the previous result, which reads 0 while *_abn_mask_now is set
+            modsub_x_abn <= modsub_x_ab_lsb_pad + modsub_x_wide_x_lsb_pad + modsub_x_abn_masked;
+            modsub_y_abn <= modsub_y_ab_lsb_pad + modsub_x_wide_y_lsb_pad + modsub_y_abn_masked;
+        end
+    endtask
+    
+    always @(posedge clk)
+        // Borrow flags of UOP_OPCODE_MODULAR_SUBTRACT; modsub_x_mux / modsub_y_mux use them to pick between the delayed narrow x and y words.
+        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT)
+            // captured only in LATENCY_POST4_TP while wrk_fsm_two_pass_pass is 0 (first pass); held otherwise
+            case (wrk_fsm_state)
+                WRK_FSM_STATE_LATENCY_POST4_TP:
+                    if (!wrk_fsm_two_pass_pass)
+                        {modsub_x_borrow_r, modsub_y_borrow_r} <= {modsub_x_ab_dly[WORD_W], modsub_y_ab_dly[WORD_W]};  // bit WORD_W of the delayed subtraction results
+            endcase
+    
+    always @(posedge clk) begin  // one-clock delayed copies of modsub_x_ab / modsub_y_ab, updated every clock with no enable
+        modsub_x_ab_dly <= modsub_x_ab;  
+        modsub_y_ab_dly <= modsub_y_ab;
+    end
+    
     always @(posedge clk) begin
         //
-        wr_wide_x_dout_x    <= WORD_EXT_DNC;
-        wr_wide_y_dout_x    <= WORD_EXT_DNC;
-        wr_wide_x_dout_y    <= WORD_EXT_DNC;
-        wr_wide_y_dout_y    <= WORD_EXT_DNC;
-        wr_narrow_x_dout_x  <= WORD_EXT_DNC;
-        wr_narrow_y_dout_x  <= WORD_EXT_DNC;
-        wr_narrow_x_dout_y  <= WORD_EXT_DNC;
-        wr_narrow_y_dout_y  <= WORD_EXT_DNC;
+        modsub_x_ab <= {1'bX, WORD_DNC};  // default every clock: top bit X, rest WORD_DNC, unless a state below overrides it
+        modsub_y_ab <= {1'bX, WORD_DNC};
         //
-        case (opcode)
+        modsub_x_abn <= {1'bX, WORD_DNC};
+        modsub_y_abn <= {1'bX, WORD_DNC};
+        // only UOP_OPCODE_MODULAR_SUBTRACT drives the ab / abn registers
+        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT)
             //
-            UOP_OPCODE_PROPAGATE_CARRIES:
+            case (wrk_fsm_state)
                 //
-                case (wrk_fsm_state)
+                WRK_FSM_STATE_LATENCY_PRE3_TP:  // first step: ab only, since abn consumes the registered modsub_*_ab, which is still the default here
+                    update_modsub_ab;
+                    
+                WRK_FSM_STATE_LATENCY_PRE4_TP,
+                WRK_FSM_STATE_BUSY_TP,
+                WRK_FSM_STATE_LATENCY_POST1_TP,
+                WRK_FSM_STATE_LATENCY_POST2_TP: begin
+                    update_modsub_ab;
+                    update_modsub_abn;
+                end
+                // last step: abn only, consuming the modsub_*_ab value written in LATENCY_POST2_TP
+                WRK_FSM_STATE_LATENCY_POST3_TP:
                     //
-                    WRK_FSM_STATE_LATENCY_PRE2: begin
-                        rd_narrow_x_din_x_cry_r <= CARRY_ZERO;
-                        rd_narrow_y_din_x_cry_r <= CARRY_ZERO;
-                        rd_narrow_x_din_y_cry_r <= CARRY_ZERO;
-                        rd_narrow_y_din_y_cry_r <= CARRY_ZERO;
-                    end
+                    update_modsub_abn;
+                //
+            endcase
+        //
+    end
+
+    always @(posedge clk) begin
+        // Mask flags for the borrow-in / carry-in bits used by update_modsub_ab / update_modsub_abn; they default to 0 every clock.
+        modsub_x_ab_mask_now <= 1'b0;
+        modsub_y_ab_mask_now <= 1'b0;
+        //
+        modsub_x_abn_mask_now <= 1'b0;
+        modsub_y_abn_mask_now <= 1'b0;
+        //
+        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT)
+            // the flags are registered, so each one reads as set during the cycle after the state listed below
+            case (wrk_fsm_state)
+                // after PRE2_TP: modsub_*_ab_masked reads 0 (presumably during PRE3_TP, where the first update_modsub_ab runs - TODO confirm state order)
+                WRK_FSM_STATE_LATENCY_PRE2_TP: begin
+                    modsub_x_ab_mask_now <= 1'b1;
+                    modsub_y_ab_mask_now <= 1'b1;
+                end
+                // after PRE3_TP: modsub_*_abn_masked reads 0 (presumably during PRE4_TP, where the first update_modsub_abn runs - TODO confirm state order)
+                WRK_FSM_STATE_LATENCY_PRE3_TP: begin
+                    modsub_x_abn_mask_now <= 1'b1;
+                    modsub_y_abn_mask_now <= 1'b1;
+                end
+                //
+            endcase
+        //
+    end
+    
+    always @(posedge clk) begin
+        //
+        update_wide_dout  (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
+        update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
+        //
+        // one_pass
+        //
+        case (wrk_fsm_state)
+            //
+            WRK_FSM_STATE_BUSY,
+            WRK_FSM_STATE_LATENCY_POST1,
+            WRK_FSM_STATE_LATENCY_POST2:
+                //
+                case (opcode)
+                    //
+                    UOP_OPCODE_PROPAGATE_CARRIES:
+                        //
+                        update_narrow_dout(rd_narrow_x_din_x_w_cry_reduced,
+                                           rd_narrow_y_din_x_w_cry_reduced,
+                                           rd_narrow_x_din_y_w_cry_reduced,
+                                           rd_narrow_y_din_y_w_cry_reduced);
                     //
-                    WRK_FSM_STATE_BUSY,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST2: begin // TODO: post2 doesn't need update of carry, since that's the last word
+                    UOP_OPCODE_COPY_CRT_Y2X: begin
                         //
-                        rd_narrow_x_din_x_cry_r <= rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W];
-                        rd_narrow_y_din_x_cry_r <= rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W];
-                        rd_narrow_x_din_y_cry_r <= rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W];
-                        rd_narrow_y_din_y_cry_r <= rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W];
+                        update_wide_dout(wrk_rd_wide_x_din_y,
+                                         wrk_rd_wide_y_din_y,
+                                         wrk_rd_wide_x_din_y,
+                                         wrk_rd_wide_y_din_y);
                         //
-                        wr_narrow_x_dout_x <= rd_narrow_x_din_x_w_cry_reduced;
-                        wr_narrow_y_dout_x <= rd_narrow_y_din_x_w_cry_reduced;
-                        wr_narrow_x_dout_y <= rd_narrow_x_din_y_w_cry_reduced;
-                        wr_narrow_y_dout_y <= rd_narrow_y_din_y_w_cry_reduced;                       
+                        update_narrow_dout(wrk_rd_narrow_x_din_y,
+                                           wrk_rd_narrow_y_din_y,
+                                           wrk_rd_narrow_x_din_y,
+                                           wrk_rd_narrow_y_din_y);        
                         //
                     end
                     //
+                    UOP_OPCODE_MODULAR_REDUCE_INIT:
+                        //
+                        update_wide_dout(wrk_rd_narrow_x_din_x,
+                                         wrk_rd_narrow_y_din_x,
+                                         wrk_rd_narrow_x_din_y,
+                                         wrk_rd_narrow_y_din_y);
+                    //
                 endcase
+            //
+        endcase
+        //
+        // one_pass_meander
+        //
+        case (wrk_fsm_state)
+            //
+            WRK_FSM_STATE_BUSY_M2,
+            WRK_FSM_STATE_LATENCY_POST1_M2,
+            WRK_FSM_STATE_LATENCY_POST2_M2:
                 //
-            UOP_OPCODE_COPY_CRT_Y2X:
-                //
-                case (wrk_fsm_state)
+                case (opcode)
                     //
-                    WRK_FSM_STATE_BUSY,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST2: begin
+                    UOP_OPCODE_COPY_LADDERS_X2Y: begin
                         //
-                        wr_wide_x_dout_x   <= wrk_rd_wide_x_din_y;
-                        wr_wide_y_dout_x   <= wrk_rd_wide_y_din_y;
-                        wr_wide_x_dout_y   <= wrk_rd_wide_x_din_y;
-                        wr_wide_y_dout_y   <= wrk_rd_wide_y_din_y;
+                        update_wide_dout(wrk_rd_wide_x_din_x_dly3,
+                                         wrk_rd_wide_x_din_x_dly2,
+                                         wrk_rd_wide_x_din_y_dly3,
+                                         wrk_rd_wide_x_din_y_dly2);
                         //
-                        wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_y;
-                        wr_narrow_y_dout_x <= wrk_rd_narrow_y_din_y;
-                        wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y;
-                        wr_narrow_y_dout_y <= wrk_rd_narrow_y_din_y;                       
+                        update_narrow_dout(wrk_rd_narrow_x_din_x_dly3,
+                                           wrk_rd_narrow_x_din_x_dly2,
+                                           wrk_rd_narrow_x_din_y_dly3,
+                                           wrk_rd_narrow_x_din_y_dly2);
                         //
                     end
                     //
-                endcase
-                //
-            UOP_OPCODE_COPY_LADDERS_X2Y:
-                //
-                case (wrk_fsm_state)
-                    //
-                    WRK_FSM_STATE_BUSY_M2,
-                    WRK_FSM_STATE_LATENCY_POST1_M2,
-                    WRK_FSM_STATE_LATENCY_POST2_M2: begin
+                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                         //
-                        wr_wide_x_dout_x <= wrk_rd_wide_x_din_x_dly3;
-                        wr_wide_y_dout_x <= wrk_rd_wide_x_din_x_dly2;
-                        wr_wide_x_dout_y <= wrk_rd_wide_x_din_y_dly3;
-                        wr_wide_y_dout_y <= wrk_rd_wide_x_din_y_dly2;
+                        update_wide_dout(wrk_rd_wide_x_din_x_dly3,
+                                         wrk_rd_wide_x_din_y_dly2,
+                                         wrk_rd_wide_x_din_y_dly3,
+                                         wrk_rd_wide_x_din_x_dly2);
                         //
-                        wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_x_dly3;
-                        wr_narrow_y_dout_x <= wrk_rd_narrow_x_din_x_dly2;
-                        wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y_dly3;
-                        wr_narrow_y_dout_y <= wrk_rd_narrow_x_din_y_dly2;
+                        update_narrow_dout(wrk_rd_narrow_x_din_x_dly3,
+                                           wrk_rd_narrow_x_din_y_dly2,
+                                           wrk_rd_narrow_x_din_y_dly3,
+                                           wrk_rd_narrow_x_din_x_dly2);
                         //
                     end
                     //
                 endcase
+            //
+        endcase
+        //
+        // two_pass
+        //
+        case (wrk_fsm_state)
+            //
+            WRK_FSM_STATE_BUSY_TP,
+            WRK_FSM_STATE_LATENCY_POST1_TP,
+            WRK_FSM_STATE_LATENCY_POST2_TP,
+            WRK_FSM_STATE_LATENCY_POST3_TP,
+            WRK_FSM_STATE_LATENCY_POST4_TP:
                 //
-            UOP_OPCODE_MODULAR_REDUCE_INIT:
-                //
-                case (wrk_fsm_state)
+                case (opcode)
                     //
-                    WRK_FSM_STATE_BUSY,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST2: begin
+                    UOP_OPCODE_MODULAR_SUBTRACT:
                         //
-                        wr_wide_x_dout_x   <= wrk_rd_narrow_x_din_x;
-                        wr_wide_y_dout_x   <= wrk_rd_narrow_y_din_x;
-                        wr_wide_x_dout_y   <= wrk_rd_narrow_x_din_y;
-                        wr_wide_y_dout_y   <= wrk_rd_narrow_y_din_y;
+                        if (!wrk_fsm_two_pass_pass)
+                            update_narrow_dout(modsub_x_ab_dly_trunc, modsub_x_abn_trunc, modsub_y_ab_dly_trunc, modsub_y_abn_trunc);
+                        else begin
+                            update_wide_dout  (modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux);
+                            update_narrow_dout(modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux);
+                        end
                         //
-                    end
-                    //
                 endcase
             //
         endcase
@@ -637,254 +813,307 @@ module modexpng_general_worker
 
 
     //
-    // Write Address Logic
+    // Source Read Address Logic
     //
-    wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half;
-    wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half;
+    
+    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_xy_next;
+    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_xy_next;
+
+    wire rd_wide_xy_addr_xy_next_is_last = rd_wide_xy_addr_xy_next == word_index_last_half;
+    wire rd_narrow_xy_addr_xy_next_is_last = rd_narrow_xy_addr_xy_next == word_index_last;
+    
+    task update_rd_wide_bank_addr; // schedules the same {bank, addr} pair onto both the X and Y wide read ports
+        input [BANK_ADDR_W -1:0] bank; // bank select applied to both ports
+        input [  OP_ADDR_W -1:0] addr; // word address applied to both ports
+        begin
+            {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, addr}; // non-blocking: a later call in the same always block wins
+            {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, addr};
+        end
+    endtask
+
+    task update_rd_wide_bank; // changes only the bank select of both wide read ports
+        input [BANK_ADDR_W -1:0] bank; // new bank select for the X and Y ports
+        begin
+            {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, rd_wide_xy_addr_x}; // address is re-assigned to itself, i.e. held
+            {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, rd_wide_xy_addr_y};
+        end
+    endtask
+    
+    task update_rd_narrow_bank_addr; // schedules the same {bank, addr} pair onto both the X and Y narrow read ports
+        input [BANK_ADDR_W -1:0] bank; // bank select applied to both ports
+        input [  OP_ADDR_W -1:0] addr; // word address applied to both ports
+        begin
+            {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, addr}; // non-blocking: a later call in the same always block wins
+            {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, addr};
+        end
+    endtask
+    
+    task update_rd_narrow_bank; // changes only the bank select of both narrow read ports
+        input [BANK_ADDR_W -1:0] bank; // new bank select for the X and Y ports
+        begin
+            {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, rd_narrow_xy_addr_x}; // address is re-assigned to itself, i.e. held
+            {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, rd_narrow_xy_addr_y};
+        end
+    endtask
+    
+    task update_rd_wide_addr_next; // loads the wide next-address register, which is shared by the X and Y ports
+        input [OP_ADDR_W -1:0] addr; // value to load
+        rd_wide_xy_addr_xy_next <= addr; // single statement, so no begin/end is required
+    endtask
+
+    task update_rd_narrow_addr_next; // loads the narrow next-address register, which is shared by the X and Y ports
+        input [OP_ADDR_W -1:0] addr; // value to load
+        rd_narrow_xy_addr_xy_next <= addr; // single statement, so no begin/end is required
+    endtask
+    
+    task advance_rd_wide_addr_next; // steps the wide next-address by one, wrapping to zero
+        rd_wide_xy_addr_xy_next <= !rd_wide_xy_addr_xy_next_is_last ? rd_wide_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; // "last" means equal to word_index_last_half
+    endtask
+
+    task advance_rd_narrow_addr_next; // steps the narrow next-address by one, wrapping to zero
+        rd_narrow_xy_addr_xy_next <= !rd_narrow_xy_addr_xy_next_is_last ? rd_narrow_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; // "last" means equal to word_index_last; the removed code incremented without wrapping
+    endtask 
     
     always @(posedge clk) begin
         //
-        {wr_wide_xy_bank_x,   wr_wide_xy_addr_x }  <= {BANK_DNC, OP_ADDR_DNC};
-        {wr_wide_xy_bank_y,   wr_wide_xy_addr_y }  <= {BANK_DNC, OP_ADDR_DNC};
-        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC};
-        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC};
+        update_rd_wide_bank_addr  (BANK_DNC, OP_ADDR_DNC); // default for this clock edge (DNC presumably "do not care"); any later call in this block overrides it
+        update_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC); // same default for the narrow read ports
         //
-        case (opcode)
+        // one_pass
+        //
+        case (wrk_fsm_state_next_one_pass) // keyed on the one-pass NEXT state rather than on wrk_fsm_state
             //
-            UOP_OPCODE_PROPAGATE_CARRIES,
-            UOP_OPCODE_COPY_CRT_Y2X:
+            WRK_FSM_STATE_LATENCY_PRE1:
                 //
-                case (wrk_fsm_state)
+                case (opcode)
                     //
-                    WRK_FSM_STATE_BUSY,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST2: begin
-                        //
-                        {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly2};
-                        {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly2};                        
+                    UOP_OPCODE_PROPAGATE_CARRIES,
+                    UOP_OPCODE_OUTPUT_FROM_NARROW,
+                    UOP_OPCODE_COPY_CRT_Y2X,
+                    UOP_OPCODE_MODULAR_REDUCE_INIT: begin
                         //
-                        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly2};
-                        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly2};
+                        update_rd_wide_bank_addr  (sel_wide_in,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE); // first word: address zero, next-address register preset to one
+                        update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
                         //
                     end
                     //
                 endcase
                 //
-            UOP_OPCODE_MODULAR_REDUCE_INIT:
+            WRK_FSM_STATE_LATENCY_PRE2,
+            WRK_FSM_STATE_BUSY:
                 //
-                case (wrk_fsm_state)
+                case (opcode)
                     //
-                    WRK_FSM_STATE_BUSY,
-                    WRK_FSM_STATE_LATENCY_POST1,
-                    WRK_FSM_STATE_LATENCY_POST2: begin
+                    UOP_OPCODE_PROPAGATE_CARRIES,
+                    UOP_OPCODE_OUTPUT_FROM_NARROW,
+                    UOP_OPCODE_COPY_CRT_Y2X: begin
+                        //
+                        update_rd_wide_bank_addr  (sel_wide_in,   rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ; // wide port is addressed by the NARROW next-address here
+                        update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                         //
-                        wr_wide_xy_bank_x <= uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H;
-                        wr_wide_xy_bank_y <= uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H;
+                    end
+                    //
+                    UOP_OPCODE_MODULAR_REDUCE_INIT: begin
                         //
-                        wr_wide_xy_addr_x <= rd_wide_xy_addr_x_dly2;
-                        wr_wide_xy_addr_y <= rd_wide_xy_addr_y_dly2;
+                        update_rd_wide_bank_addr  (sel_wide_in,   rd_wide_xy_addr_xy_next  ); advance_rd_wide_addr_next  ; // for this opcode the wide port follows its own next-address
+                        update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                         //
                     end
                     //
                 endcase
+            //
+        endcase
+        //
+        // one_pass_meander
+        //
+        case (wrk_fsm_state_next_one_pass_meander) // keyed on the meander NEXT state
+            //
+            WRK_FSM_STATE_LATENCY_PRE1_M1:
+                case (opcode)
+                    UOP_OPCODE_COPY_LADDERS_X2Y,
+                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
+                        update_rd_wide_bank_addr  (sel_wide_out,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE); // *_M1 states read from the sel_*_out banks
+                        update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
+                    end
+                endcase
+            //
+            WRK_FSM_STATE_LATENCY_PRE2_M1,
+            WRK_FSM_STATE_BUSY_M1:
+                case (opcode)
+                    UOP_OPCODE_COPY_LADDERS_X2Y,
+                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
+                        update_rd_wide_bank_addr  (sel_wide_out,   rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ; // wide port again uses the narrow next-address
+                        update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
+                        //
+                    end
+                    //
+                endcase
+            //
+            WRK_FSM_STATE_LATENCY_PRE1_M2,
+            WRK_FSM_STATE_LATENCY_PRE2_M2,
+            WRK_FSM_STATE_BUSY_M2:
+                case (opcode)
+                    UOP_OPCODE_COPY_LADDERS_X2Y,
+                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
+                        update_rd_wide_bank  (sel_wide_in  ); // *_M2 states: only the bank changes (to sel_*_in); the address is held
+                        update_rd_narrow_bank(sel_narrow_in);
+                    end
+                endcase
+            //
+        endcase
+        //
+        // two_pass
+        //
+        case (wrk_fsm_state_next_two_pass) // keyed on the two-pass NEXT state
+            //
+            WRK_FSM_STATE_LATENCY_PRE1_TP:
                 //
-            UOP_OPCODE_COPY_LADDERS_X2Y:
-                //
-                case (wrk_fsm_state)
+                case (opcode)
                     //
-                    WRK_FSM_STATE_BUSY_M2,
-                    WRK_FSM_STATE_LATENCY_POST1_M2,
-                    WRK_FSM_STATE_LATENCY_POST2_M2: begin
-                        //
-                        {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly4};
-                        {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly4};                        
+                    UOP_OPCODE_MODULAR_SUBTRACT:
                         //
-                        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly4};
-                        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly4};
+                        if (!wrk_fsm_two_pass_pass) begin // wrk_fsm_two_pass_pass == 0: first pass
+                            update_rd_wide_bank_addr  (BANK_WIDE_N,    OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE); // first pass reads wide bank BANK_WIDE_N
+                            update_rd_narrow_bank_addr(sel_narrow_in,  OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
+                        end else begin
+                            update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); // second pass: only the narrow port is driven; the wide port keeps the default
+                        end
+                    //
+                endcase
+                //
+            WRK_FSM_STATE_LATENCY_PRE2_TP,
+            WRK_FSM_STATE_LATENCY_PRE3_TP,
+            WRK_FSM_STATE_LATENCY_PRE4_TP,
+            WRK_FSM_STATE_BUSY_TP:
+                //
+                case (opcode)
+                    //
+                    UOP_OPCODE_MODULAR_SUBTRACT:
                         //
-                    end
+                        if (!wrk_fsm_two_pass_pass) begin
+                            update_rd_wide_bank_addr  (BANK_WIDE_N,    rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ; // wide port uses the narrow next-address
+                            update_rd_narrow_bank_addr(sel_narrow_in,  rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
+                        end else begin
+                            update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; // second pass: the wide next-address is not advanced
+                        end
                     //
                 endcase
                 //
-            //
         endcase
         //
     end
 
 
     //
-    // Read Address Logic
+    // Destination Write Address Logic
     //
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_next;
-    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_next;
-
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_next;
-    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_next;
+    
+    wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half;
+    wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half;
 
-    wire rd_wide_xy_addr_x_next_is_last = rd_wide_xy_addr_x_next == word_index_last_half;
-    wire rd_wide_xy_addr_y_next_is_last = rd_wide_xy_addr_y_next == word_index_last_half;
+    wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_x = uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H;
+    wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_y = uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H;
 
-    wire rd_narrow_xy_addr_x_next_is_last = rd_narrow_xy_addr_x_next == word_index_last;
-    wire rd_narrow_xy_addr_y_next_is_last = rd_narrow_xy_addr_y_next == word_index_last;
+    task update_wr_wide_bank_addr; // schedules independent {bank, addr} pairs onto the X and Y wide write ports
+        input [BANK_ADDR_W -1:0] x_bank; // bank select for the X port
+        input [BANK_ADDR_W -1:0] y_bank; // bank select for the Y port
+        input [  OP_ADDR_W -1:0] x_addr; // word address for the X port
+        input [  OP_ADDR_W -1:0] y_addr; // word address for the Y port
+        begin
+            {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {x_bank, x_addr}; // non-blocking: a later call in the same always block wins
+            {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {y_bank, y_addr};
+        end
+    endtask
     
-    always @(posedge clk) begin // TODO: Maybe split into two blocks (read address / next address)??
+    task update_wr_narrow_bank_addr; // schedules independent {bank, addr} pairs onto the X and Y narrow write ports
+        input [BANK_ADDR_W -1:0] x_bank; // bank select for the X port
+        input [BANK_ADDR_W -1:0] y_bank; // bank select for the Y port
+        input [  OP_ADDR_W -1:0] x_addr; // word address for the X port
+        input [  OP_ADDR_W -1:0] y_addr; // word address for the Y port
+        begin
+            {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {x_bank, x_addr}; // non-blocking: a later call in the same always block wins
+            {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {y_bank, y_addr};
+        end
+    endtask
+    
+    always @(posedge clk) begin // registers the bank/address of the wide and narrow write ports
         //
-        {rd_wide_xy_bank_x,   rd_wide_xy_addr_x  } <= {BANK_DNC, OP_ADDR_DNC}; // TODO: Add same default path for io_manager ??
-        {rd_wide_xy_bank_y,   rd_wide_xy_addr_y  } <= {BANK_DNC, OP_ADDR_DNC};
-        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC};
-        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC};
+        update_wr_wide_bank_addr  (BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC); // default for this clock edge (DNC presumably "do not care"); any later call in this block overrides it
+        update_wr_narrow_bank_addr(BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC); // same default for the narrow write ports
         //
-        case (opcode)
+        // one_pass
+        //
+        case (wrk_fsm_state) // keyed on the CURRENT state
             //
-            UOP_OPCODE_PROPAGATE_CARRIES,
-            UOP_OPCODE_OUTPUT_FROM_NARROW,
-            UOP_OPCODE_COPY_CRT_Y2X:
+            WRK_FSM_STATE_BUSY,
+            WRK_FSM_STATE_LATENCY_POST1,
+            WRK_FSM_STATE_LATENCY_POST2:
                 //
-                case (wrk_fsm_state_next_one_pass)
+                case (opcode)
                     //
-                    WRK_FSM_STATE_LATENCY_PRE1: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO};
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO};
-                        //
-                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
-                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
-                        //
-                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
-                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
-                        //
+                    UOP_OPCODE_PROPAGATE_CARRIES,
+                    UOP_OPCODE_COPY_CRT_Y2X: begin
+                        update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); // write addresses come from the rd_narrow_xy_addr_*_dly2 signals
+                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
                     end
                     //
-                    WRK_FSM_STATE_LATENCY_PRE2,
-                    WRK_FSM_STATE_BUSY: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_narrow_xy_addr_x_next};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_narrow_xy_addr_y_next};                        
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next};
-                        //
-                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; 
-                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
-                        //
-                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
-                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
-                        //
-                    end
+                    UOP_OPCODE_MODULAR_REDUCE_INIT:
+                        update_wr_wide_bank_addr(uop_modular_reduce_init_bank_x, uop_modular_reduce_init_bank_y, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_y_dly2);                    // only the wide port is driven; bank is BANK_WIDE_L while the dly2 narrow address <= word_index_last_half, else BANK_WIDE_H
                     //
                 endcase
                 //
-            UOP_OPCODE_MODULAR_REDUCE_INIT:
-                //
-                case (wrk_fsm_state_next_one_pass)
-                    //
-                    WRK_FSM_STATE_LATENCY_PRE1: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO};
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO};
-                        //
-                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
-                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
-                        //
-                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
-                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
-                        //
-                    end
-                    //
-                    WRK_FSM_STATE_LATENCY_PRE2,
-                    WRK_FSM_STATE_BUSY: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x_next};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y_next};                        
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next};
-                        //
-                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; 
-                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
-                        //
-                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
-                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
-                        //
+        endcase
+        //
+        // one_pass_meander
+        //
+        case (wrk_fsm_state)
+            //
+            WRK_FSM_STATE_BUSY_M2,
+            WRK_FSM_STATE_LATENCY_POST1_M2,
+            WRK_FSM_STATE_LATENCY_POST2_M2:
+                //        
+                case (opcode)
+                    UOP_OPCODE_COPY_LADDERS_X2Y,
+                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
+                        update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); // meander writes use the *_dly4 addresses instead of *_dly2
+                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
                     end
-                    //
                 endcase
                 //
-            UOP_OPCODE_COPY_LADDERS_X2Y:
+        endcase
+        //
+        // two_pass
+        //
+        case (wrk_fsm_state)
+            //
+            WRK_FSM_STATE_BUSY_TP,
+            WRK_FSM_STATE_LATENCY_POST1_TP,
+            WRK_FSM_STATE_LATENCY_POST2_TP,
+            WRK_FSM_STATE_LATENCY_POST3_TP,
+            WRK_FSM_STATE_LATENCY_POST4_TP:
                 //
-                case (wrk_fsm_state_next_one_pass_meander)
-                    //
-                    WRK_FSM_STATE_LATENCY_PRE1_M1: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, OP_ADDR_ZERO};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, OP_ADDR_ZERO};
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, OP_ADDR_ZERO};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, OP_ADDR_ZERO};
-                        //
-                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
-                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
-                        //
-                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
-                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
-                        //
-                    end
+                case (opcode)
                     //
-                    WRK_FSM_STATE_LATENCY_PRE1_M2: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y};
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y};
-                        //
-                    end
-                    //
-                    WRK_FSM_STATE_LATENCY_PRE2_M1,
-                    WRK_FSM_STATE_BUSY_M1: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_next};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_next};                        
+                    UOP_OPCODE_MODULAR_SUBTRACT:
                         //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_next};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_next};
+                        if (!wrk_fsm_two_pass_pass) begin // wrk_fsm_two_pass_pass == 0: first pass drives the narrow write port only
+                            update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);                     
+                        end else begin
+                            update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); // second pass drives both the wide and narrow write ports
+                            update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
+                        end 
                         //
-                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; 
-                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
-                        //
-                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
-                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
-                        //
-                    end
-                    //
-                    WRK_FSM_STATE_LATENCY_PRE2_M2,
-                    WRK_FSM_STATE_BUSY_M2: begin
-                        //
-                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x};
-                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y};
-                        //
-                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x};
-                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y};
-                        //
-                    end
-                    //
-                endcase
+                    endcase
                 //
-            //
-        endcase
+            endcase
         //
     end
-    
+
 
     //
     // FSM Process
     //
+
     always @(posedge clk)
         //
         if (rst) wrk_fsm_state <= WRK_FSM_STATE_IDLE;
@@ -893,7 +1122,9 @@ module modexpng_general_worker
             UOP_OPCODE_OUTPUT_FROM_NARROW,
             UOP_OPCODE_COPY_CRT_Y2X,
             UOP_OPCODE_MODULAR_REDUCE_INIT: wrk_fsm_state <= wrk_fsm_state_next_one_pass;
-            UOP_OPCODE_COPY_LADDERS_X2Y:    wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander;
+            UOP_OPCODE_COPY_LADDERS_X2Y,
+            UOP_OPCODE_CROSS_LADDERS_X2Y:   wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander;
+            UOP_OPCODE_MODULAR_SUBTRACT:    wrk_fsm_state <= wrk_fsm_state_next_two_pass;
             default:                        wrk_fsm_state <= WRK_FSM_STATE_IDLE;
         endcase
     
@@ -901,49 +1132,64 @@ module modexpng_general_worker
     //
     // Busy Exit Logic
     //
-    reg wrk_fsm_done_one_pass         = 1'b0; 
+    
+    reg wrk_fsm_done_one_pass         = 1'b0;
     reg wrk_fsm_done_one_pass_meander = 1'b0;
+    reg wrk_fsm_done_two_pass         = 1'b0;
     
     always @(posedge clk) begin
         //
         wrk_fsm_done_one_pass         <= 1'b0;
         wrk_fsm_done_one_pass_meander <= 1'b0;
+        wrk_fsm_done_two_pass         <= 1'b0; // default low; overridden below when the last narrow word is reached in BUSY_TP
         //
         case (opcode)
             //
             UOP_OPCODE_PROPAGATE_CARRIES,
             UOP_OPCODE_OUTPUT_FROM_NARROW,
             UOP_OPCODE_COPY_CRT_Y2X,
-            UOP_OPCODE_MODULAR_REDUCE_INIT: begin
+            UOP_OPCODE_MODULAR_REDUCE_INIT:
                 //
-                if (wrk_fsm_state == WRK_FSM_STATE_BUSY) begin
-                    //
-                    if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass <= 1'b1; // TODO: Check, whether both are necessary...
-                    if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass <= 1'b1;
-                    //
-                end
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY:
+                        if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass <= 1'b1; // flag is set when the shared narrow next-address equals word_index_last
+                endcase
                 //
-            end
-            //
-            UOP_OPCODE_COPY_LADDERS_X2Y: begin
+            UOP_OPCODE_COPY_LADDERS_X2Y,
+            UOP_OPCODE_CROSS_LADDERS_X2Y:
                 //
-                if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M2) begin
-                    //
-                    if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; // TODO: Check, whether both are necessary...
-                    if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1;
-                    //
-                end
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY_M2:
+                        if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1;
+                    WRK_FSM_STATE_BUSY_M1:
+                        wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander; // hold: overrides the default clear while in BUSY_M1
+                endcase
+                //
+            UOP_OPCODE_MODULAR_SUBTRACT:
                 //
-                if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M1)
-                    wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander;
+                case (wrk_fsm_state)
+                    WRK_FSM_STATE_BUSY_TP:
+                        if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_two_pass <= 1'b1; // same last-word test as the one-pass case, applied in BUSY_TP
+                endcase
                 //
-            end
             //
         endcase
         //
     end
     
-        
+    
+    //
+    // FSM Helper Logic
+    //    
+    always @(posedge clk)
+        // maintains the two-pass flags; there is no default branch, so both flags hold their value in every other state
+        case (wrk_fsm_state)
+            WRK_FSM_STATE_IDLE: if (ena)    {wrk_fsm_two_pass_pass, wrk_fsm_two_pass_pass_dly} <= {1'b0, 1'b0}; // both flags cleared when ena is seen in IDLE
+            WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_two_pass_pass <= 1'b1; // set in the POST4_TP state
+            WRK_FSM_STATE_HOLDOFF_TP:       wrk_fsm_two_pass_pass_dly <= 1'b1; // set in HOLDOFF_TP; the HOLDOFF_TP next-state selection reads this flag
+        endcase
+
+
     //
     // FSM Transition Logic
     //
@@ -985,7 +1231,27 @@ module modexpng_general_worker
         //
     end
     
-
+    always @* begin
+        // Combinational next-state function of the two-pass (_TP) sequence; NOTE(review): where wrk_fsm_state_next_two_pass is consumed is not visible in this hunk
+        case (wrk_fsm_state)
+            WRK_FSM_STATE_IDLE:             wrk_fsm_state_next_two_pass = ena                       ? WRK_FSM_STATE_LATENCY_PRE1_TP  : WRK_FSM_STATE_IDLE;
+            WRK_FSM_STATE_LATENCY_PRE1_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_PRE2_TP  ;
+            WRK_FSM_STATE_LATENCY_PRE2_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_PRE3_TP  ;
+            WRK_FSM_STATE_LATENCY_PRE3_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_PRE4_TP  ;
+            WRK_FSM_STATE_LATENCY_PRE4_TP:  wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_BUSY_TP          ;
+            WRK_FSM_STATE_BUSY_TP:          wrk_fsm_state_next_two_pass = wrk_fsm_done_two_pass ?     WRK_FSM_STATE_LATENCY_POST1_TP : WRK_FSM_STATE_BUSY_TP;
+            WRK_FSM_STATE_LATENCY_POST1_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_POST2_TP ;
+            WRK_FSM_STATE_LATENCY_POST2_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_POST3_TP ;
+            WRK_FSM_STATE_LATENCY_POST3_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_LATENCY_POST4_TP ;
+            WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_HOLDOFF_TP       ;
+            WRK_FSM_STATE_HOLDOFF_TP:       wrk_fsm_state_next_two_pass = wrk_fsm_two_pass_pass_dly ? WRK_FSM_STATE_STOP             : WRK_FSM_STATE_LATENCY_PRE1_TP; 
+            WRK_FSM_STATE_STOP:             wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_IDLE             ;
+            default:                        wrk_fsm_state_next_two_pass =                             WRK_FSM_STATE_IDLE             ;
+        endcase
+        // HOLDOFF_TP restarts at LATENCY_PRE1_TP while wrk_fsm_two_pass_pass_dly is clear; any state not listed above falls back to IDLE
+    end
+    
+    
     //
     // Ready Logic
     //
diff --git a/rtl/modexpng_microcode.vh b/rtl/modexpng_microcode.vh
index f68c559..3493e26 100644
--- a/rtl/modexpng_microcode.vh
+++ b/rtl/modexpng_microcode.vh
@@ -39,8 +39,9 @@ localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_OUTPUT_FROM_NARROW = 5'd3;
  * source and destination WIDE are don't care
  */
 
-localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_CRT_Y2X     = 5'd4;
-localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_LADDERS_X2Y = 5'd5;
+localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_CRT_Y2X      = 5'd4;
+localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_LADDERS_X2Y  = 5'd5;
+localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_CROSS_LADDERS_X2Y = 5'd7;
 /* CRT is don't care
  * NPQ specifies the width of the operand
  * AUX is don't care
@@ -53,6 +54,13 @@ localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_MULTIPLY = 5'd8;
  * AUX = AUX_2 forces B input to 1 (AUX_1 reads from source NARROW as usual)
  * LADDER specifies Montgomery ladder mode 
  */
+localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_SUBTRACT = 5'd9;
+/* CRT is don't care
+ * NPQ specifies the width of the operand
+ * AUX is don't care
+ * LADDER is don't care
+ */
+ 
 localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_REDUCE_INIT = 5'd10;
 localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_REDUCE_PROC = 5'd11;
 /* CRT
diff --git a/rtl/modexpng_uop_rom.v b/rtl/modexpng_uop_rom.v
index 04f0c83..adc657a 100644
--- a/rtl/modexpng_uop_rom.v
+++ b/rtl/modexpng_uop_rom.v
@@ -21,35 +21,35 @@ module modexpng_uop_rom
             6'd03:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_Y,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_Y,        BANK_WIDE_A,    BANK_DNC         }; //
             6'd04:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_X,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_M,        BANK_WIDE_E,    BANK_DNC         }; //
             6'd05:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_Y,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_M,        BANK_WIDE_E,    BANK_DNC         }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd06:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_X,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_N_COEFF,  BANK_DNC,       BANK_NARROW_COEFF}; //
             6'd07:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_Y,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_N_COEFF,  BANK_DNC,       BANK_NARROW_COEFF}; //
             6'd08:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_X,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_N_FACTOR, BANK_DNC,       BANK_NARROW_A    }; //
             6'd09:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_Y,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_N_FACTOR, BANK_DNC,       BANK_NARROW_A    }; //
             6'd10:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_X,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_M,        BANK_DNC,       BANK_NARROW_E    }; //
             6'd11:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_Y,   UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_1_M,        BANK_DNC,       BANK_NARROW_E    }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd12:   data <= {UOP_OPCODE_MODULAR_MULTIPLY,    UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_11,  BANK_WIDE_A, BANK_NARROW_A,      BANK_WIDE_B,    BANK_NARROW_B    }; //
             6'd13:   data <= {UOP_OPCODE_MODULAR_MULTIPLY,    UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_11,  BANK_WIDE_B, BANK_NARROW_B,      BANK_WIDE_C,    BANK_NARROW_C    }; //
             6'd14:   data <= {UOP_OPCODE_MODULAR_MULTIPLY,    UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_2,   UOP_LADDER_11,  BANK_WIDE_C, BANK_DNC,           BANK_WIDE_D,    BANK_NARROW_D    }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd15:   data <= {UOP_OPCODE_PROPAGATE_CARRIES,   UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC,    BANK_NARROW_D,      BANK_DNC,       BANK_NARROW_D    }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd16:   data <= {UOP_OPCODE_OUTPUT_FROM_NARROW,  UOP_CRT_X,   UOP_NPQ_N,   UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC,    BANK_NARROW_D,      BANK_DNC,       BANK_OUT_XM      }; //
             6'd17:   data <= {UOP_OPCODE_OUTPUT_FROM_NARROW,  UOP_CRT_Y,   UOP_NPQ_N,   UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC,    BANK_NARROW_D,      BANK_DNC,       BANK_OUT_YM      }; //
-                                                                                                                                                                                         //            
+                                                                                                                                                                                          //            
             6'd18:   data <= {UOP_OPCODE_MODULAR_MULTIPLY,    UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_1,   UOP_LADDER_11,  BANK_WIDE_E, BANK_NARROW_B,      BANK_WIDE_C,    BANK_NARROW_C    }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd19:   data <= {UOP_OPCODE_PROPAGATE_CARRIES,   UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC,    BANK_NARROW_C,      BANK_DNC,       BANK_NARROW_C    }; //
-                                                                                                                                                                                         //            
+                                                                                                                                                                                          //            
             6'd20:   data <= {UOP_OPCODE_COPY_CRT_Y2X,        UOP_CRT_DNC, UOP_NPQ_N,   UOP_AUX_DNC, UOP_LADDER_DNC, BANK_WIDE_C, BANK_NARROW_C,      BANK_WIDE_C,    BANK_NARROW_C    }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd21:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_X,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_P,        BANK_WIDE_N,    BANK_DNC         }; //
             6'd22:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_Y,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_Q,        BANK_WIDE_N,    BANK_DNC         }; //
             6'd23:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_X,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_P_FACTOR, BANK_WIDE_A,    BANK_DNC         }; //
             6'd24:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_Y,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_Q_FACTOR, BANK_WIDE_A,    BANK_DNC         }; //
             6'd25:   data <= {UOP_OPCODE_INPUT_TO_WIDE,       UOP_CRT_X,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_QINV,     BANK_WIDE_E,    BANK_DNC         }; //
-                                                                                                                                                                                         //
+                                                                                                                                                                                          //
             6'd26:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_X,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_P_COEFF,  BANK_DNC,       BANK_NARROW_COEFF}; //
             6'd27:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_Y,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_Q_COEFF,  BANK_DNC,       BANK_NARROW_COEFF}; //
             6'd28:   data <= {UOP_OPCODE_INPUT_TO_NARROW,     UOP_CRT_X,   UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_DNC, BANK_DNC,    BANK_IN_2_P_FACTOR, BANK_DNC,       BANK_NARROW_A    }; //
@@ -70,6 +70,14 @@ module modexpng_uop_rom
             6'd38:   data <= {UOP_OPCODE_MODULAR_MULTIPLY,    UOP_CRT_DNC, UOP_NPQ_PQ,  UOP_AUX_1,   UOP_LADDER_PQ,  BANK_WIDE_C, BANK_NARROW_C,      BANK_WIDE_C,    BANK_NARROW_C    }; //
             6'd39:   data <= {UOP_OPCODE_LADDER_STEP,         UOP_CRT_DNC, UOP_NPQ_DNC, UOP_AUX_DNC, UOP_LADDER_DNC, UOP_SEL_DNC_ALL                                                   }; //
                                                                                                                                                                                           //
+            6'd40:   data <= {UOP_OPCODE_MODULAR_MULTIPLY,    UOP_CRT_DNC, UOP_NPQ_PQ,  UOP_AUX_2,   UOP_LADDER_11,  BANK_WIDE_C, BANK_DNC,           BANK_WIDE_D,    BANK_NARROW_D    }; //
+                                                                                                                                                                                          //
+            6'd41:   data <= {UOP_OPCODE_PROPAGATE_CARRIES,   UOP_CRT_DNC, UOP_NPQ_PQ,  UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC,    BANK_NARROW_D,      BANK_DNC,       BANK_NARROW_D    }; //
+                                                                                                                                                                                          //
+            6'd42:   data <= {UOP_OPCODE_CROSS_LADDERS_X2Y,   UOP_CRT_DNC, UOP_NPQ_PQ,  UOP_AUX_DNC, UOP_LADDER_DNC, BANK_WIDE_D, BANK_NARROW_D,      BANK_WIDE_D,    BANK_NARROW_D    }; //
+                                                                                                                                                                                          //
+            6'd43:   data <= {UOP_OPCODE_MODULAR_SUBTRACT,    UOP_CRT_DNC, UOP_NPQ_PQ,  UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC,    BANK_NARROW_D,      BANK_WIDE_C,    BANK_NARROW_C    }; //
+                                                                                                                                                                                          //
             default: data <= {UOP_OPCODE_STOP,                UOP_CRT_DNC, UOP_NPQ_DNC, UOP_AUX_DNC, UOP_LADDER_DNC, UOP_SEL_DNC_ALL                                                   }; //                                            
         endcase
 



More information about the Commits mailing list