[Cryptech-Commits] [core/math/modexpng] 28/92: Major rewrite (different core hierarchy, buses, wrappers, etc).

git at cryptech.is git at cryptech.is
Sat Mar 14 18:19:07 UTC 2020


This is an automated email from the git hooks/post-receive script.

paul at psgd.org pushed a commit to branch master
in repository core/math/modexpng.

commit fde62e373fdfcefefb7da10757a3db933160c911
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Oct 1 15:16:58 2019 +0300

    Major rewrite (different core hierarchy, buses, wrappers, etc).
---
 rtl/modexpng_dsp_array_block.v                     |  143 +++
 rtl/modexpng_dsp_slice_wrapper.v                   |  125 ++
 rtl/modexpng_mac.v                                 |   54 -
 rtl/modexpng_mac_array.v                           |  116 --
 rtl/modexpng_mem.v                                 |   93 --
 rtl/modexpng_mmm_col_index.v                       |   90 --
 rtl/modexpng_mmm_din_addr.v                        |  167 ---
 rtl/modexpng_mmm_dout_addr.v                       |  167 ---
 rtl/modexpng_mmm_dual_x8.v                         |  951 +++++++++++++++
 ...modexpng_mmm_fsm.vh => modexpng_mmm_fsm_old.vh} |    0
 rtl/modexpng_mmm_pad.v                             |  153 ---
 rtl/modexpng_mmm_transporter.v                     |  157 ---
 rtl/modexpng_mmm_x8_dual.v                         |  550 ---------
 rtl/modexpng_parameters.vh                         |   39 -
 rtl/modexpng_parameters_old.vh                     |   40 +
 ...ameters_x8.vh => modexpng_parameters_x8_old.vh} |    0
 rtl/modexpng_part_recombinator.v                   | 1128 ------------------
 rtl/modexpng_recombinator_block.v                  | 1234 +++++++++++++++++++-
 ...inator_block.v => modexpng_recombinator_cell.v} |    2 +-
 rtl/modexpng_reductor.v                            |  270 +++++
 rtl/modexpng_storage_block.v                       |  226 ++++
 rtl/modexpng_storage_manager.v                     |  200 ++++
 22 files changed, 3168 insertions(+), 2737 deletions(-)

diff --git a/rtl/modexpng_dsp_array_block.v b/rtl/modexpng_dsp_array_block.v
new file mode 100644
index 0000000..9c4ee93
--- /dev/null
+++ b/rtl/modexpng_dsp_array_block.v
@@ -0,0 +1,143 @@
+module modexpng_dsp_array_block    // array of DSP slice wrappers: NUM_MULTS/2 direct+cascade pairs plus one auxiliary slice
+(
+    input             clk,
+    
+    input             ce_a,      // clock enable for the A operand pipeline (delayed copies are derived below)
+    input             ce_b,      // clock enable for the B operand pipeline (one delayed copy is derived below)
+    input             ce_m,      // passed unchanged to every slice's ce_m
+    input             ce_p,      // passed unchanged to every slice's ce_p
+    input             ce_mode,   // passed unchanged to every slice's ce_mode
+
+    input  [9   -1:0] mode_z,    // one bit per slice; bit i is placed into opmode[5] of slice i
+    
+    input  [5*18-1:0] a,         // five 18-bit words: word z goes to pair z, word 4 goes to dsp_aux
+    input  [1*16-1:0] b,         // single 16-bit word wired to the b port of every slice
+    output [9*47-1:0] p          // nine 47-bit results; slot i is driven by slice i
+);
+
+    `include "modexpng_parameters_x8.vh"    // NUM_MULTS is not declared in this module; presumably it comes from this header -- verify
+    
+    wire [17:0] casc_a[0:3];     // A cascade link of each pair (dsp_direct -> dsp_cascade); 4 entries, so NUM_MULTS/2 must not exceed 4 -- TODO confirm
+    wire [15:0] casc_b[0:3];     // B cascade link of each pair, same indexing as casc_a
+    
+    wire ce_a0 = ce_a;           // undelayed A enable
+    reg  ce_a1 = 1'b0;           // ce_a delayed by one clock
+    reg  ce_a2 = 1'b0;           // ce_a delayed by two clocks
+    
+    wire ce_b0 = ce_b;           // undelayed B enable
+    reg  ce_b1 = 1'b0;           // ce_b delayed by one clock
+    
+    always @(posedge clk) begin  // free-running delay chain for the enables (no reset, no gating)
+        ce_a1 <= ce_a0;
+        ce_a2 <= ce_a1;
+        ce_b1 <= ce_b0;
+    end
+        
+    
+    genvar z;
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)    // one iteration per direct/cascade pair
+        //
+        begin : gen_DSP48E1
+            //        
+            modexpng_dsp_slice_wrapper #
+            (
+                .AB_INPUT("DIRECT"),              // first slice of the pair is configured for direct operand inputs
+                .B_REG(2)
+            )
+            dsp_direct
+            (
+                .clk            (clk),
+                
+                .ce_a1          (ce_a0),          // enables staged as: stage 1 undelayed, stage 2 delayed by one clock
+                .ce_b1          (ce_b0),
+                .ce_a2          (ce_a1),
+                .ce_b2          (ce_b1),
+                .ce_m           (ce_m),
+                .ce_p           (ce_p),
+                .ce_mode        (ce_mode),
+                
+                .a              (a[z*18+:18]),    // 18-bit word z of the a bus
+                .b              (b),
+                .p              (p[47*2*z+:47]),  // even output slot 2*z
+                
+                .inmode         (5'b00000),
+                .opmode         ({1'b0, mode_z[2*z], 1'b0, 2'b01, 2'b01}),    // only opmode[5] varies, taken from mode_z[2*z]
+                .alumode        (4'b0000),
+                
+                .casc_a_in      ({18{1'b0}}),     // cascade inputs tied to zero on the direct slice
+                .casc_b_in      ({16{1'b0}}),
+                
+                .casc_a_out     (casc_a[z]),      // feeds the cascade slice of the same pair
+                .casc_b_out     (casc_b[z])
+            );
+            //
+            modexpng_dsp_slice_wrapper #
+            (
+                .AB_INPUT("CASCADE"),             // second slice of the pair is configured for cascade operand inputs
+                .B_REG(1)
+            )
+            dsp_cascade
+            (
+                .clk            (clk),
+                
+                .ce_a1          (ce_a1),          // A enables are one clock later than on dsp_direct
+                .ce_b1          (1'b0),           // first B stage enable held low (this instance sets B_REG to 1)
+                .ce_a2          (ce_a2),
+                .ce_b2          (ce_b1),
+                .ce_m           (ce_m),
+                .ce_p           (ce_p),
+                .ce_mode        (ce_mode),
+                
+                .a              (a[z*18+:18]),    // same word as dsp_direct; NOTE(review): presumably ignored in CASCADE mode -- confirm
+                .b              (b),
+                .p              (p[47*(2*z+1)+:47]),    // odd output slot 2*z+1
+                
+                .inmode         (5'b00000),
+                .opmode         ({1'b0, mode_z[2*z+1], 1'b0, 2'b01, 2'b01}),    // only opmode[5] varies, taken from mode_z[2*z+1]
+                .alumode        (4'b0000),
+                
+                .casc_a_in      (casc_a[z]),      // operands arrive from dsp_direct of the same pair
+                .casc_b_in      (casc_b[z]),
+                
+                .casc_a_out     (),               // cascade chain ends after two slices
+                .casc_b_out     ()
+            );
+            //            
+        end
+        //
+    endgenerate
+
+    modexpng_dsp_slice_wrapper #    // auxiliary slice: standalone, direct inputs, no cascade partner
+    (
+        .AB_INPUT("DIRECT"),
+        .B_REG(2)
+    )
+    dsp_aux
+    (
+        .clk            (clk),
+        
+        .ce_a1          (ce_a0),    // same enable staging as dsp_direct
+        .ce_b1          (ce_b0),
+        .ce_a2          (ce_a1),
+        .ce_b2          (ce_b1),
+        .ce_m           (ce_m),
+        .ce_p           (ce_p),
+        .ce_mode        (ce_mode),
+        
+        .a              (a[4*18+:18]),      // fifth (last) word of the a bus
+        .b              (b),
+        .p              (p[47*2*4+:47]),    // ninth (last) output slot, index 8
+        
+        .inmode         (5'b00000),
+        .opmode         ({1'b0, mode_z[2*4], 1'b0, 2'b01, 2'b01}),    // only opmode[5] varies, taken from mode_z[8]
+        .alumode        (4'b0000),
+        
+        .casc_a_in      ({18{1'b0}}),
+        .casc_b_in      ({16{1'b0}}),
+        
+        .casc_a_out     (),
+        .casc_b_out     ()
+    );
+
+
+endmodule
diff --git a/rtl/modexpng_dsp_slice_wrapper.v b/rtl/modexpng_dsp_slice_wrapper.v
new file mode 100644
index 0000000..f565eec
--- /dev/null
+++ b/rtl/modexpng_dsp_slice_wrapper.v
@@ -0,0 +1,125 @@
+module modexpng_dsp_slice_wrapper #    // thin wrapper around one DSP48E1 primitive with narrowed A/B/P/cascade buses
+(
+    parameter AB_INPUT   = "DIRECT",    // forwarded to both A_INPUT and B_INPUT; "parameter" keyword is required by Verilog-2001
+    parameter B_REG      = 2            // forwarded to BREG
+)
+(
+    input         clk,
+    input         ce_a1,         // -> CEA1
+    input         ce_b1,         // -> CEB1
+    input         ce_a2,         // -> CEA2
+    input         ce_b2,         // -> CEB2
+    input         ce_m,          // -> CEM
+    input         ce_p,          // -> CEP
+    input         ce_mode,       // -> CECTRL
+    input  [17:0] a,             // zero-extended to the 30-bit A port
+    input  [15:0] b,             // zero-extended to the 18-bit B port
+    output [46:0] p,             // low 47 bits of the 48-bit P port
+    input  [ 4:0] inmode,
+    input  [ 6:0] opmode,
+    input  [ 3:0] alumode,
+    input  [17:0] casc_a_in,     // zero-extended to the 30-bit ACIN port
+    input  [15:0] casc_b_in,     // zero-extended to the 18-bit BCIN port
+    output [17:0] casc_a_out,    // low 18 bits of ACOUT
+    output [15:0] casc_b_out     // low 16 bits of BCOUT
+);
+
+    wire [30-18-1:0] casc_a_dummy;    // sink for the upper 12 bits of ACOUT
+    wire [18-16-1:0] casc_b_dummy;    // sink for the upper 2 bits of BCOUT
+    wire [48-47-1:0] p_dummy;         // sink for the top bit of P
+    
+    DSP48E1 #
+    (
+        .AREG                   (2),
+        .BREG                   (B_REG),        // only register depth that differs between instances
+        .CREG                   (0),
+        .DREG                   (0),
+        .ADREG                  (0),
+        .MREG                   (1),
+        .PREG                   (1),
+        .ACASCREG               (1),
+        .BCASCREG               (1),
+        .INMODEREG              (0),
+        .OPMODEREG              (1),
+        .ALUMODEREG             (0),
+        .CARRYINREG             (0),
+        .CARRYINSELREG          (0),
+
+        .A_INPUT                (AB_INPUT),     // A and B always share the same input source selection
+        .B_INPUT                (AB_INPUT),
+
+        .USE_DPORT              ("FALSE"),
+        .USE_MULT               ("DYNAMIC"),
+        .USE_SIMD               ("ONE48"),
+
+        .MASK                   (48'h3fffffffffff),
+        .PATTERN                (48'h000000000000),
+        .SEL_MASK               ("MASK"),
+        .SEL_PATTERN            ("PATTERN"),
+        
+        .USE_PATTERN_DETECT     ("NO_PATDET"),
+        .AUTORESET_PATDET       ("NO_RESET")
+    )
+    DSP48E1_inst
+    (
+        .CLK                (clk),
+    
+        .CEA1               (ce_a1),
+        .CEB1               (ce_b1),
+        .CEA2               (ce_a2),
+        .CEB2               (ce_b2),
+        .CEAD               (1'b0),             // enables of registers configured with depth 0 are tied low
+        .CEC                (1'b0),
+        .CED                (1'b0),
+        .CEM                (ce_m),
+        .CEP                (ce_p),
+        .CEINMODE           (1'b0),
+        .CECTRL             (ce_mode),
+        .CEALUMODE          (1'b0),
+        .CECARRYIN          (1'b0),
+
+        .A                  ({{(30-18){1'b0}}, a}),    // pad 18 -> 30 bits with zeros
+        .B                  ({{(18-16){1'b0}}, b}),    // pad 16 -> 18 bits with zeros
+        .C                  ({48{1'b0}}),              // C and D operands are not used, tied to zero
+        .D                  ({25{1'b0}}),
+        .P                  ({p_dummy, p}),            // top bit of P is discarded
+        
+        .INMODE             (inmode),
+        .OPMODE             (opmode),
+        .ALUMODE            (alumode),
+
+        .ACIN               ({{(30-18){1'b0}}, casc_a_in}),
+        .BCIN               ({{(18-16){1'b0}}, casc_b_in}),
+        .ACOUT              ({casc_a_dummy, casc_a_out}),
+        .BCOUT              ({casc_b_dummy, casc_b_out}),
+        .PCIN               ({48{1'b0}}),              // P cascade is not used: input tied to zero, output left open
+        .PCOUT              (),
+        .CARRYCASCIN        (1'b0),
+        .CARRYCASCOUT       (),
+ 
+        .RSTA               (1'b0),                    // all synchronous resets are permanently inactive
+        .RSTB               (1'b0),
+        .RSTC               (1'b0),
+        .RSTD               (1'b0),
+        .RSTM               (1'b0),
+        .RSTP               (1'b0),
+        .RSTINMODE          (1'b0),
+        .RSTCTRL            (1'b0),
+        .RSTALUMODE         (1'b0),
+        .RSTALLCARRYIN      (1'b0),
+
+        .UNDERFLOW          (),                        // status and pattern-detect outputs are left unconnected
+        .OVERFLOW           (),
+        .PATTERNDETECT      (),
+        .PATTERNBDETECT     (),
+
+        .CARRYIN            (1'b0),
+        .CARRYOUT           (),
+        .CARRYINSEL         (3'b000),
+
+        .MULTSIGNIN         (1'b0),
+        .MULTSIGNOUT        ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mac.v b/rtl/modexpng_mac.v
deleted file mode 100644
index 9105dab..0000000
--- a/rtl/modexpng_mac.v
+++ /dev/null
@@ -1,54 +0,0 @@
-module modexpng_mac
-(
-    clk,
-    ce, clr,
-    casc_a,
-    a_in, b_in, p_out,
-    a_casc_in, a_casc_out
-);
-
-    input           clk;
-    input           ce;
-    input           clr;
-    input           casc_a;
-    input   [16:0]  a_in;
-    input   [16:0]  b_in;
-    output  [46:0]  p_out;
-    input   [16:0]  a_casc_in;
-    output  [16:0]  a_casc_out;
-    
-    reg [16:0] a_reg;
-    reg [16:0] b_reg;
-    assign a_casc_out = a_reg;
-    always @(posedge clk)
-        //
-        if (ce) {b_reg, a_reg} <= {b_in, casc_a ? a_casc_in : a_in};
-        
-    reg ce_dly1;
-    reg ce_dly2;
-    always @(posedge clk)
-        //
-        {ce_dly2, ce_dly1} <= {ce_dly1, ce};
-        
-    reg clr_dly1;
-    reg clr_dly2;
-    always @(posedge clk) begin
-        //
-        if (ce)      clr_dly1 <= clr;
-        if (ce_dly1) clr_dly2 <= clr_dly1;
-        //
-    end        
-        
-    reg  [33:0] m_reg;
-    wire [46:0] m_reg_ext = {{13{1'b0}}, m_reg};
-    always @(posedge clk)
-        //
-        if (ce_dly1) m_reg <= {{17{1'b0}}, a_reg} * {{17{1'b0}}, b_reg};
-        
-    reg [46:0] p_reg;
-    assign p_out = p_reg;
-    always @(posedge clk)
-        //
-        if (ce_dly2) p_reg <= clr_dly2 ? m_reg_ext : p_reg + m_reg_ext;
- 
-endmodule
diff --git a/rtl/modexpng_mac_array.v b/rtl/modexpng_mac_array.v
deleted file mode 100644
index 067929e..0000000
--- a/rtl/modexpng_mac_array.v
+++ /dev/null
@@ -1,116 +0,0 @@
-module modexpng_mac_array
-(
-    clk,
-    ce, clr,
-    ce_aux, clr_aux,
-    casc_a, casc_a_aux,
-    a_in, b_in, p_out,
-    a_in_aux, p_out_aux
-);
-    
-    
-    //
-    // Includes
-    //
-    `include "modexpng_parameters.vh"
-    `include "modexpng_parameters_x8.vh"
-
-    
-    //
-    // Ports
-    //
-    input                                clk;
-    input                                ce;
-    input  [NUM_MULTS              -1:0] clr;
-    input                                ce_aux;
-    input                                clr_aux;
-    input  [NUM_MULTS              -2:0] casc_a;
-    input                                casc_a_aux;
-    input  [NUM_MULTS * WORD_WIDTH -1:0] a_in;
-    input  [        1 * WORD_WIDTH -1:0] b_in;
-    output [NUM_MULTS * MAC_WIDTH  -1:0] p_out;
-    input  [        1 * WORD_WIDTH -1:0] a_in_aux;
-    output [        1 * MAC_WIDTH  -1:0] p_out_aux;
-  
-
-    //
-    // A-Cascade Paths
-    //
-    wire [WORD_WIDTH-1:0] a_casc_int[0:NUM_MULTS-2];
-    wire [WORD_WIDTH-1:0] a_casc_int_aux;
-    
-
-    //
-    // LSB
-    //
-    modexpng_mac mac_lsb
-    (
-        .clk        (clk),
-        .ce         (ce),
-        .clr        (clr[0]),
-        .casc_a     (1'b0),
-        .a_in       (a_in[0+:WORD_WIDTH]),
-        .b_in       (b_in),
-        .p_out      (p_out[0+:MAC_WIDTH]),
-        .a_casc_in  ({WORD_WIDTH{1'b0}}),
-        .a_casc_out (a_casc_int[0])
-    );
-    
-    
-    //
-    // INT
-    //
-    genvar z;
-    generate for (z=1; z<(NUM_MULTS-1); z=z+1)
-        begin : gen_modexpng_mac_int
-            modexpng_mac mac_int
-            (
-                .clk        (clk),
-                .ce         (ce),
-                .clr        (clr[z]),
-                .casc_a     (casc_a[z-1]),
-                .a_in       (a_in[z*WORD_WIDTH+:WORD_WIDTH]),
-                .b_in       (b_in),
-                .p_out      (p_out[z*MAC_WIDTH+:MAC_WIDTH]),
-                .a_casc_in  (a_casc_int[z-1]),
-                .a_casc_out (a_casc_int[z])
-            );        
-        end
-    endgenerate
-    
-    
-    //
-    // MSB
-    //
-    modexpng_mac mac_msb
-    (
-        .clk        (clk),
-        .ce         (ce),
-        .clr        (clr[NUM_MULTS-1]),
-        .casc_a     (casc_a[NUM_MULTS-2]),
-        .a_in       (a_in[(NUM_MULTS-1)*WORD_WIDTH+:WORD_WIDTH]),
-        .b_in       (b_in),
-        .p_out      (p_out[(NUM_MULTS-1)*MAC_WIDTH+:MAC_WIDTH]),
-        .a_casc_in  (a_casc_int[NUM_MULTS-2]),
-        .a_casc_out (a_casc_int_aux)
-    );
-
-    
-    //
-    // AUX
-    //
-    modexpng_mac mac_aux
-    (
-        .clk        (clk),
-        .ce         (ce_aux),
-        .clr        (clr_aux),
-        .casc_a     (casc_a_aux),
-        .a_in       (a_in_aux),
-        .b_in       (b_in),
-        .p_out      (p_out_aux),
-        .a_casc_in  (a_casc_int_aux),
-        .a_casc_out ()
-    );
-
-    
-endmodule
diff --git a/rtl/modexpng_mem.v b/rtl/modexpng_mem.v
deleted file mode 100644
index ca89214..0000000
--- a/rtl/modexpng_mem.v
+++ /dev/null
@@ -1,93 +0,0 @@
-//
-// TODO: Add license text!
-//
-
-module modexpng_mem #
-(
-    parameter MEM_WIDTH     = 17,
-    parameter MEM_ADDR_BITS =  6
-)
-(
-    input   clk,
-
-    input  [MEM_ADDR_BITS-1:0] a_addr,
-    input                      a_en,
-    input                      a_wr,
-    input  [MEM_WIDTH    -1:0] a_in,
-    output [MEM_WIDTH    -1:0] a_out,
-
-    input  [MEM_ADDR_BITS-1:0] b_addr,
-    input                      b_en,
-    input                      b_reg_en,
-    output [MEM_WIDTH    -1:0] b_out
-);
-
-
-    //
-    // BRAM
-    //
-    (* RAM_STYLE="BLOCK" *)
-    reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
-
-
-    //
-    // Initialization for Simulation
-    //
-    /*
-    integer c;
-    initial begin
-        for (c=0; c<(2**MEM_ADDR_BITS); c=c+1)
-        bram[c] = {MEM_WIDTH{1'b0}};
-    end
-    */
-
-
-
-    //
-    // Output Registers
-    //
-    reg [MEM_WIDTH-1:0] bram_b;
-    reg [MEM_WIDTH-1:0] bram_b_reg;
-
-    assign a_out = 32'hDEADCE11;
-    assign b_out = bram_b_reg;
-
-    
-    //
-    // Note, that when both ports are accessing the same location, conflict can
-    // potentionally arise. See Xilinx UG473 (pages 19-20, "Conflict
-    // Avoidance") for more information. In our configuration to avoid that the
-    // write port must be coded to operate in READ_FIRST mode. If the write
-    // port is overwriting the same address the read port is accessing, the 
-    // write port must read the previously stored data (not the data it is
-    // writing, as that would be WRITE_FIRST mode).
-    //
-
-
-    //
-    // Write-Only Port A
-    //
-    always @(posedge clk)
-        //
-        if (a_en)
-            //
-            if (a_wr) bram[a_addr] <= a_in;
-
-
-    //
-    // Read-Only Port B
-    //
-    always @(posedge clk)
-        //
-        if (b_en)
-            //
-            bram_b <= bram[b_addr];
-
-    always @(posedge clk)
-        //
-        if (b_reg_en)
-            //
-            bram_b_reg <= bram_b;
-
-
-endmodule
diff --git a/rtl/modexpng_mmm_col_index.v b/rtl/modexpng_mmm_col_index.v
deleted file mode 100644
index b904795..0000000
--- a/rtl/modexpng_mmm_col_index.v
+++ /dev/null
@@ -1,90 +0,0 @@
-module modexpng_mmm_col_index
-(
-    clk,
-    index_last,
-    fsm_state_next,
-    col_index,
-    col_index_done,
-    col_index_zero,
-    col_index_next,
-    col_index_prev
-);
-
-
-    //
-    // Includes
-    //
-    //`include "modexpng_parameters.vh"
-    //`include "modexpng_parameters_x8.vh"
-    `include "modexpng_mmm_fsm.vh"
-
-    
-    //
-    // Parameters
-    //
-    parameter INDEX_WIDTH = 6;
-
-    
-    //
-    // Ports
-    //
-    input                        clk;
-    input  [    INDEX_WIDTH-1:0] index_last;
-    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;
-    output [    INDEX_WIDTH-4:0] col_index;
-    output                       col_index_done;
-    output [    INDEX_WIDTH-4:0] col_index_zero;
-    output [    INDEX_WIDTH-4:0] col_index_next;
-    output [    INDEX_WIDTH-4:0] col_index_prev;
-    
-
-    //
-    // Registers
-    //
-    reg  [INDEX_WIDTH-4:0] col_index_reg;
-    reg  [INDEX_WIDTH-4:0] col_index_last;
-    reg  [INDEX_WIDTH-4:0] col_index_dly;
-    
-    
-    //
-    // Mapping
-    //
-    assign col_index = col_index_reg;
-    assign col_index_prev = col_index_dly;
-    
-    
-    //
-    // Handy Wires
-    //
-    assign col_index_done = col_index == col_index_last;
-    assign col_index_zero = {(INDEX_WIDTH-3){1'b0}};
-    assign col_index_next = col_index + 1'b1;
- 
- 
-    //
-    // Increment Logic
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin
-                col_index_reg  <= col_index_zero;
-                col_index_last <= index_last[INDEX_WIDTH-1:3];
-            end
-            //
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG:
-                col_index_reg <= col_index_next;
-            //
-        endcase
-      
-      
-    //
-    // Delay Logic
-    //
-    always @(posedge clk)
-        //
-        col_index_dly <= col_index;
-    
-    
-endmodule
diff --git a/rtl/modexpng_mmm_din_addr.v b/rtl/modexpng_mmm_din_addr.v
deleted file mode 100644
index 565c7e0..0000000
--- a/rtl/modexpng_mmm_din_addr.v
+++ /dev/null
@@ -1,167 +0,0 @@
-module modexpng_mmm_din_addr
-(
-    clk, rst_n,
-    index_last,
-    fsm_state_next,
-    col_index_zero, col_index_next,
-    din_addr, din_bank, din_ena, din_reg_ena,
-    din_addr_cnt, din_addr_cnt_last,
-    din_addr_cnt_lower_prev, din_addr_cnt_upper_prev
-);
-
-
-    //
-    // Includes
-    //
-    `include "modexpng_parameters.vh"
-    //`include "modexpng_parameters_x8.vh"
-    `include "modexpng_mmm_fsm.vh"
-
-
-    //
-    // Parameters
-    //
-    parameter INDEX_WIDTH = 6;
-    
-    
-    //
-    // Ports
-    //
-    input                        clk;
-    input                        rst_n;
-    input  [    INDEX_WIDTH-1:0] index_last;
-    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;
-    input  [    INDEX_WIDTH-4:0] col_index_zero;
-    input  [    INDEX_WIDTH-4:0] col_index_next;
-    output [    INDEX_WIDTH-4:0] din_addr;
-    output [              3-1:0] din_bank;
-    output [              1-1:0] din_ena;
-    output [              1-1:0] din_reg_ena;
-    output [    INDEX_WIDTH-1:0] din_addr_cnt;
-    output [    INDEX_WIDTH-1:0] din_addr_cnt_last;
-    output [              3-1:0] din_addr_cnt_lower_prev;
-    output [    INDEX_WIDTH-4:0] din_addr_cnt_upper_prev;
-    
- 
-    //
-    // Address
-    //
-    reg  [INDEX_WIDTH-1:0] din_addr_reg;
-    wire [INDEX_WIDTH-1:0] din_addr_zero = {INDEX_WIDTH{1'b0}};
-    reg  [INDEX_WIDTH-1:0] din_addr_last;
-    wire [INDEX_WIDTH-1:0] din_addr_prev = (din_addr_reg == din_addr_zero) ? din_addr_last : din_addr_reg - 1'b1;
-    
-    reg  [INDEX_WIDTH-1:0] din_addr_cnt_reg;
-    wire [INDEX_WIDTH-1:0] din_addr_cnt_zero = {INDEX_WIDTH{1'b0}};
-    wire [INDEX_WIDTH-1:0] din_addr_cnt_next = din_addr_cnt_reg + 1'b1;
-    reg  [INDEX_WIDTH-1:0] din_addr_cnt_last_reg;
-    wire [          3-1:0] din_addr_cnt_lower = din_addr_cnt_reg[          3-1:0];
-    wire [INDEX_WIDTH-4:0] din_addr_cnt_upper = din_addr_cnt_reg[INDEX_WIDTH-1:3];
-    reg  [          3-1:0] din_addr_cnt_lower_dly;
-    reg  [INDEX_WIDTH-4:0] din_addr_cnt_upper_dly;
-
-    reg  [          3-1:0] din_bank_reg;
-
-
-    //
-    // Enables
-    //
-    reg din_ena_reg = 1'b0;
-    reg din_reg_ena_reg = 1'b0;
-    
-    always @(posedge clk or negedge rst_n)
-        //
-        if (!rst_n)
-            din_ena_reg <= 1'b0;
-        else case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
-                din_ena_reg <= 1'b1;
-            //
-            default:
-                din_ena_reg <= 1'b0;
-            //
-        endcase
-
-    always @(posedge clk or negedge rst_n)
-        //
-        if (!rst_n)
-            din_reg_ena_reg <= 1'b0;
-        else
-            din_reg_ena_reg <= din_ena_reg;
-
-
-    //
-    // Address Mapping
-    //
-    assign din_addr                = din_addr_reg[INDEX_WIDTH-1:3];
-
-    assign din_addr_cnt            = din_addr_cnt_reg;
-    assign din_addr_cnt_last       = din_addr_cnt_last_reg;
-    assign din_addr_cnt_lower_prev = din_addr_cnt_lower_dly;
-    assign din_addr_cnt_upper_prev = din_addr_cnt_upper_dly;
-
-    assign din_bank                = din_bank_reg;
-    
-    
-    //
-    // Enable Mapping
-    //
-    assign din_ena = din_ena_reg;
-    assign din_reg_ena = din_reg_ena_reg;
-
-
-    //
-    // Delay
-    //
-    always @(posedge clk) begin
-        din_addr_cnt_lower_dly <= din_addr_cnt_lower;
-        din_addr_cnt_upper_dly <= din_addr_cnt_upper;
-    end
-
-
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin
-                din_addr_reg          <= {col_index_zero, {3{1'b0}}};
-                din_addr_last         <= index_last;
-                din_addr_cnt_reg      <= din_addr_cnt_zero;
-                din_addr_cnt_last_reg <= index_last;
-            end
-            //
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG: begin
-                din_addr_reg     <= {col_index_next, {3{1'b0}}};
-                din_addr_cnt_reg <= din_addr_cnt_zero;
-            end
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
-                din_addr_reg     <= din_addr_prev;
-                din_addr_cnt_reg <= din_addr_cnt_next;
-            end
-            //
-            //default:
-            //
-        endcase
-
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            //
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
-                din_bank_reg = BANK_XY_T1T2;
-            //
-            default:
-                din_bank_reg = BANK_XY_ANY;
-            //
-        endcase
-        
-endmodule
diff --git a/rtl/modexpng_mmm_dout_addr.v b/rtl/modexpng_mmm_dout_addr.v
deleted file mode 100644
index 3749d82..0000000
--- a/rtl/modexpng_mmm_dout_addr.v
+++ /dev/null
@@ -1,167 +0,0 @@
-module modexpng_mmm_dout_addr
-(
-    clk, rst_n,
-    //index_last,
-    fsm_state,
-    load_xy_addr,
-    load_addr_zero,
-    load_nn_coeff_addr_done,
-    /*
-    
-    col_index_zero, col_index_next,*/
-    x_dout_addr, y_dout_addr,
-    x_dout_ena,  y_dout_ena,
-    x_dout_bank, y_dout_bank
-
-);
-
-
-    //
-    // Includes
-    //
-    `include "modexpng_parameters.vh"
-    `include "modexpng_parameters_x8.vh"
-    `include "modexpng_mmm_fsm.vh"
-
-
-    //
-    // Parameters
-    //
-    parameter INDEX_WIDTH = 6;
-    
-    
-    //
-    // Ports
-    //
-    input                        clk;
-    input                        rst_n;
-    //input  [    INDEX_WIDTH-1:0] index_last;
-    input  [FSM_STATE_WIDTH-1:0] fsm_state;
-    input  [INDEX_WIDTH:0] load_xy_addr;       // address
-    input                  load_addr_zero;
-    input                  load_nn_coeff_addr_done;
-    //input  [    INDEX_WIDTH-4:0] col_index_zero;
-    //input  [    INDEX_WIDTH-4:0] col_index_next;
-    output [INDEX_WIDTH-4:0] x_dout_addr;
-    output [INDEX_WIDTH-4:0] y_dout_addr;
-    
-    output [NUM_MULTS-1:0] x_dout_ena;
-    output [NUM_MULTS-1:0] y_dout_ena;
-    
-    output [3-1:0] x_dout_bank;
-    output [3-1:0] y_dout_bank;
-    
- 
-    //
-    // Registers
-    //
-    reg [INDEX_WIDTH-4:0] x_dout_addr_reg; //clog2
-    reg [INDEX_WIDTH-4:0] y_dout_addr_reg; //clog2
-    
-    reg [NUM_MULTS-1:0] x_dout_ena_reg = {NUM_MULTS{1'b0}};
-    reg [NUM_MULTS-1:0] y_dout_ena_reg = {NUM_MULTS{1'b0}};
-
-    reg [NUM_MULTS-1:0] x_dout_ena_int;
-    reg [NUM_MULTS-1:0] y_dout_ena_int;
-    
-    reg [3-1:0] x_dout_bank_reg;
-    reg [3-1:0] y_dout_bank_reg;
-
-    
-    //
-    // Mapping
-    //
-    assign x_dout_addr = x_dout_addr_reg;
-    assign y_dout_addr = y_dout_addr_reg;
-    
-    assign x_dout_ena  = x_dout_ena_reg;
-    assign y_dout_ena  = y_dout_ena_reg;
-    
-    assign x_dout_bank = x_dout_bank_reg;
-    assign y_dout_bank = y_dout_bank_reg;
-
-    
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3: begin
-                x_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
-                y_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
-            end
-            //
-            FSM_STATE_LOAD_NN_COEFF_3: begin
-                x_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
-                y_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
-            end
-            //
-            default: begin
-                x_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
-                y_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
-            end
-            //
-        endcase
-
-    wire [NUM_MULTS-1:0] load_xy_ena_init = {{NUM_MULTS-1{1'b0}}, 1'b1};        
-    
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_2: begin
-                x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1]};
-                y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
-            end
-            //
-            FSM_STATE_LOAD_NN_COEFF_2: begin
-                x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1] & ~load_nn_coeff_addr_done};
-                y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
-            end
-            //
-        endcase
-
-
-    always @(posedge clk or negedge rst_n)
-        //
-        if (!rst_n) begin
-            x_dout_ena_reg <= {NUM_MULTS{1'b0}};
-            y_dout_ena_reg <= {NUM_MULTS{1'b0}};        
-        end else case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3,
-            FSM_STATE_LOAD_NN_COEFF_3: begin
-                x_dout_ena_reg <= x_dout_ena_int;
-                y_dout_ena_reg <= y_dout_ena_int;
-            end
-            //
-            default: begin
-                x_dout_ena_reg <= {NUM_MULTS{1'b0}};
-                y_dout_ena_reg <= {NUM_MULTS{1'b0}};
-            end
-            //
-        endcase
-
-        
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3: begin
-                x_dout_bank_reg <= BANK_X_T1;
-                y_dout_bank_reg <= BANK_Y_T2;
-            end
-            //
-            FSM_STATE_LOAD_NN_COEFF_3: begin
-                x_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_X_N       : BANK_XY_AUX;
-                y_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_Y_N_COEFF : BANK_XY_AUX;
-            end
-            //
-            default: begin
-                x_dout_bank_reg <= BANK_XY_ANY;
-                y_dout_bank_reg <= BANK_XY_ANY;
-            end
-            //
-        endcase
-
-        
-endmodule
diff --git a/rtl/modexpng_mmm_dual_x8.v b/rtl/modexpng_mmm_dual_x8.v
new file mode 100644
index 0000000..2e4f4e0
--- /dev/null
+++ b/rtl/modexpng_mmm_dual_x8.v
@@ -0,0 +1,951 @@
+module modexpng_mmm_dual_x8
+(
+    clk, rst,
+    
+    ena, rdy,
+
+        
+    ladder_mode,
+    word_index_last,
+    word_index_last_minus1,
+    
+    rd_wide_xy_ena,
+    rd_wide_xy_ena_aux,
+    rd_wide_xy_bank,
+    rd_wide_xy_bank_aux,
+    rd_wide_xy_addr,
+    rd_wide_xy_addr_aux,
+    rd_wide_x_dout,
+    rd_wide_y_dout,
+    rd_wide_x_dout_aux,
+    rd_wide_y_dout_aux,
+    
+    rd_narrow_xy_ena,
+    rd_narrow_xy_bank,
+    rd_narrow_xy_addr,
+    rd_narrow_x_dout,
+    rd_narrow_y_dout,
+    
+    rcmb_wide_xy_bank,
+    rcmb_wide_xy_addr,
+    rcmb_wide_x_dout,
+    rcmb_wide_y_dout,
+    rcmb_wide_xy_valid,
+    
+    rcmb_narrow_xy_bank,
+    rcmb_narrow_xy_addr,
+    rcmb_narrow_x_dout,
+    rcmb_narrow_y_dout,
+    rcmb_narrow_xy_valid,
+    
+    rcmb_xy_bank,
+    rcmb_xy_addr,
+    rcmb_x_dout,
+    rcmb_y_dout,
+    rcmb_xy_valid,
+    
+    rdct_ena
+);
+
+
+    //
+    // Headers
+    //
+    `include "../rtl_1/modexpng_mmm_fsm_old.vh"
+    `include "../rtl_1/modexpng_parameters_old.vh"
+    `include "../rtl_1/modexpng_parameters_x8_old.vh"
+
+
+    //
+    // Ports
+    //
+    input                        clk;
+    input                        rst;
+    
+    input                        ena;
+    output                       rdy;
+    
+    input                   ladder_mode;
+    input [7:0] word_index_last;
+    input [7:0] word_index_last_minus1;
+    
+    output                     rd_wide_xy_ena;
+    output                     rd_wide_xy_ena_aux;
+    output  [             1:0] rd_wide_xy_bank;
+    output  [             1:0] rd_wide_xy_bank_aux;
+    output  [ 8*NUM_MULTS/2-1:0] rd_wide_xy_addr;
+    output  [           8-1:0] rd_wide_xy_addr_aux;
+    input  [18*NUM_MULTS/2-1:0] rd_wide_x_dout;
+    input  [18*NUM_MULTS/2-1:0] rd_wide_y_dout;
+    input  [          18-1:0] rd_wide_x_dout_aux;
+    input  [          18-1:0] rd_wide_y_dout_aux;
+
+    output                    rd_narrow_xy_ena;
+    output [             1:0] rd_narrow_xy_bank;
+    output [ 7:0] rd_narrow_xy_addr;
+    input  [18-1:0] rd_narrow_x_dout;
+    input  [18-1:0] rd_narrow_y_dout;
+
+    output [ 1:0] rcmb_wide_xy_bank;
+    output [ 7:0] rcmb_wide_xy_addr;
+    output [17:0] rcmb_wide_x_dout;
+    output [17:0] rcmb_wide_y_dout;
+    output        rcmb_wide_xy_valid;
+
+    output [ 1:0] rcmb_narrow_xy_bank;
+    output [ 7:0] rcmb_narrow_xy_addr;
+    output [17:0] rcmb_narrow_x_dout;
+    output [17:0] rcmb_narrow_y_dout;
+    output        rcmb_narrow_xy_valid;
+
+    output [ 1:0] rcmb_xy_bank;
+    output [ 7:0] rcmb_xy_addr;
+    output [17:0] rcmb_x_dout;
+    output [17:0] rcmb_y_dout;
+    output        rcmb_xy_valid;
+    
+    output        rdct_ena;
+
+    
+    //
+    // FSM Declaration
+    //
+    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;   // registered current state, starts in IDLE
+    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;               // next state; also used directly by the control logic below
+    
+    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_triangle;
+    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_rectangle;
+
+    
+    //
+    // FSM Process
+    //
+    always @(posedge clk)
+        // State register: synchronous reset to IDLE, otherwise load the next state.
+        if (rst) fsm_state <= FSM_STATE_IDLE;
+        else     fsm_state <= fsm_state_next;
+
+        
+    //
+    // Storage Control Interface
+    //
+    reg         wide_xy_ena = 1'b0;          // drives rd_wide_xy_ena
+    reg         wide_xy_ena_aux = 1'b0;      // drives rd_wide_xy_ena_aux
+    reg  [ 1:0] wide_xy_bank;                // drives rd_wide_xy_bank
+    reg  [ 1:0] wide_xy_bank_aux;            // drives rd_wide_xy_bank_aux
+    reg  [ 8-1:0] wide_xy_addr[0:3];         // per-lane wide addresses; NOTE(review): depth 4 is hard-coded while the loops run to NUM_MULTS/2 -- presumably NUM_MULTS == 8, confirm
+    reg  [ 8-1:0] wide_xy_addr_aux;          // drives rd_wide_xy_addr_aux
+    
+    reg         narrow_xy_ena = 1'b0;        // drives rd_narrow_xy_ena
+    reg  [ 1:0] narrow_xy_bank;              // drives rd_narrow_xy_bank
+    reg  [ 7:0] narrow_xy_addr;              // drives rd_narrow_xy_addr
+    reg  [ 7:0] narrow_xy_addr_dly;          // narrow_xy_addr delayed by one clock
+    
+    assign rd_wide_xy_ena  = wide_xy_ena;
+    assign rd_wide_xy_ena_aux  = wide_xy_ena_aux;
+    assign rd_wide_xy_bank = wide_xy_bank;
+    assign rd_wide_xy_bank_aux = wide_xy_bank_aux;
+    assign rd_wide_xy_addr_aux = wide_xy_addr_aux;
+
+    assign rd_narrow_xy_ena  = narrow_xy_ena;
+    assign rd_narrow_xy_bank = narrow_xy_bank;
+    assign rd_narrow_xy_addr = narrow_xy_addr;
+
+    genvar z;
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        begin : gen_rd_wide_xy_addr          // pack the address array into the flat output bus, 8 bits per lane
+            assign rd_wide_xy_addr[8*z+:8] = wide_xy_addr[z];
+        end
+    endgenerate
+        
+    //
+    // Column Counter
+    //
+    reg  [4:0] col_index;       // current column index
+    reg  [4:0] col_index_prev;  // delayed column index value
+    reg  [4:0] col_index_last;  // index of the very last column
+    reg  [4:0] col_index_next;  // precomputed next column index
+    reg        col_is_last;     // flag set during the very last column
+
+    always @(posedge clk)
+        // one-clock delayed copy of col_index (no reset); used by the mode_z calculation further down
+        col_index_prev <= col_index;
+
+    //
+    // Column Counter Increment Logic
+    //
+    always @(posedge clk)
+        // Keyed on fsm_state_next, so the counters change on the same clock edge on which the FSM enters an *_INIT state.
+        case (fsm_state_next)
+            // first column of a pass: restart the counters and latch the index of the last column
+            FSM_STATE_MULT_SQUARE_COL_0_INIT,
+            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+            FSM_STATE_MULT_RECTANGLE_COL_0_INIT: begin
+                col_index       <= 5'd0;
+                col_index_last  <= word_index_last[7:3];
+                col_index_next  <= 5'd1;
+                col_is_last     <= 1'b0;    // NOTE(review): stays 0 even when word_index_last[7:3] == 0 -- confirm single-column operands are handled elsewhere
+                
+            end
+            // following columns: advance, flag the last column, wrap the look-ahead index to 0 after it
+            FSM_STATE_MULT_SQUARE_COL_N_INIT,
+            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+            FSM_STATE_MULT_RECTANGLE_COL_N_INIT: begin
+                col_index <= col_index_next;
+                col_is_last <= col_index_next == col_index_last;
+                col_index_next <= col_index_next == col_index_last ? 5'd0 : col_index_next + 5'd1;   
+            end
+            // no default: in every other state all four registers hold their values
+        endcase
+
+
+    //
+    // Completion Flags
+    //
+    wire square_almost_done_comb;               // combinational address compare
+    reg  square_almost_done_flop = 1'b0;        // registered compare result
+    reg  square_surely_done_flop = 1'b0;        // almost_done delayed by one clock
+
+    wire triangle_almost_done_comb;             // combinational address compare
+    reg  triangle_almost_done_flop = 1'b0;      // registered compare result
+    reg  triangle_surely_done_flop = 1'b0;      // almost_done delayed by one clock
+    reg  triangle_tardy_done_flop = 1'b0;       // surely_done delayed by one more clock
+
+    wire rectangle_almost_done_comb;            // combinational address compare
+    reg  rectangle_almost_done_flop = 1'b0;     // registered compare result
+    reg  rectangle_surely_done_flop = 1'b0;     // almost_done delayed by one clock
+    reg  rectangle_tardy_done_flop = 1'b0;      // surely_done delayed by one more clock
+
+    assign square_almost_done_comb = narrow_xy_addr == word_index_last_minus1;  // narrow address has reached the last-but-one word index
+    assign triangle_almost_done_comb = (narrow_xy_addr[2:0] == word_index_last_minus1[2:0]) && (narrow_xy_addr[7:3] == col_index);  // low 3 bits match AND upper 5 bits equal the current column
+    assign rectangle_almost_done_comb = narrow_xy_addr == word_index_last_minus1;  // same compare as the square flag
+
+    //
+    // Square Completion Flags
+    //
+    always @(posedge clk) begin
+        // second stage: one-clock delayed copy of the first stage
+        square_surely_done_flop <= square_almost_done_flop;
+        // first stage: sample the comparator only while a square column is busy
+        case (fsm_state)
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+                square_almost_done_flop <= square_almost_done_comb;
+            end
+            default: begin
+                square_almost_done_flop <= 1'b0;
+            end
+        endcase
+        //
+        // (non-blocking assignments: statement order does not affect the result)
+    end
+
+    //
+    // Triangle Completion Flags
+    //
+    always @(posedge clk) begin
+        // delay chain almost -> surely -> tardy, one clock per stage
+        triangle_tardy_done_flop  <= triangle_surely_done_flop;
+        triangle_surely_done_flop <= triangle_almost_done_flop;
+        // head of the chain: sample the comparator only while a triangle column is busy
+        case (fsm_state)
+            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: begin
+                triangle_almost_done_flop <= triangle_almost_done_comb;
+            end
+            default: begin
+                triangle_almost_done_flop <= 1'b0;
+            end
+        endcase
+        //
+        // (non-blocking assignments: statement order does not affect the result)
+    end
+      
+    //
+    // Rectangle Completion Flags
+    //
+    always @(posedge clk) begin
+        // delay chain almost -> surely -> tardy, one clock per stage
+        rectangle_tardy_done_flop  <= rectangle_surely_done_flop;
+        rectangle_surely_done_flop <= rectangle_almost_done_flop;
+        // head of the chain: sample the comparator only while a rectangle column is busy
+        case (fsm_state)
+            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: begin
+                rectangle_almost_done_flop <= rectangle_almost_done_comb;
+            end
+            default: begin
+                rectangle_almost_done_flop <= 1'b0;
+            end
+        endcase
+        //
+        // (non-blocking assignments: statement order does not affect the result)
+    end
+
+
+    //
+    // Narrow Storage Control Logic
+    //
+    always @(posedge clk)
+        // Only the enable is reset; address and bank are driven to X in any state not listed below.
+        if (rst) narrow_xy_ena <= 1'b0;
+        else begin
+            // All three selects below are keyed on fsm_state_next (the state being entered).
+            // Narrow Address
+            //
+            case (fsm_state_next)
+                // square: 0 on INIT, then +1 per clock; back to 0 once square_almost_done_flop is set
+                FSM_STATE_MULT_SQUARE_COL_0_INIT,
+                FSM_STATE_MULT_SQUARE_COL_N_INIT:   narrow_xy_addr <= 8'd0;
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   narrow_xy_addr <= !square_almost_done_flop ? narrow_xy_addr + 1'b1 : 8'd0;
+                // triangle: 0 on INIT, then +1 per clock; back to 0 on almost_done, and on the last column also on surely_done
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT: narrow_xy_addr <= 8'd0;
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: narrow_xy_addr <= triangle_almost_done_flop || (col_is_last && triangle_surely_done_flop) ?
+                    8'd0 :  narrow_xy_addr + 1'b1;
+                // rectangle: 0 on INIT, then +1 per clock; forced to 1 (not 0) while almost_done or surely_done is set
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT: narrow_xy_addr <= 8'd0;
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: narrow_xy_addr <= rectangle_almost_done_flop || rectangle_surely_done_flop ?
+                    8'd1 :  narrow_xy_addr + 1'b1;            
+                //
+                default:                            narrow_xy_addr <= 8'dX;
+                //
+            endcase
+            //
+            // Narrow Bank
+            //
+            case (fsm_state_next)
+                // square always reads BANK_NARROW_T1T2
+                FSM_STATE_MULT_SQUARE_COL_0_INIT,
+                FSM_STATE_MULT_SQUARE_COL_N_INIT,
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   narrow_xy_bank <= BANK_NARROW_T1T2;
+                // triangle reads BANK_NARROW_N_COEFF, switching to BANK_NARROW_EXT at the end of the last column
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: narrow_xy_bank <= col_is_last && (triangle_almost_done_flop || triangle_surely_done_flop) ?
+                    BANK_NARROW_EXT : BANK_NARROW_N_COEFF;
+                // rectangle reads BANK_NARROW_Q, switching to BANK_NARROW_EXT at the end of every column
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: narrow_xy_bank <= rectangle_almost_done_flop || rectangle_surely_done_flop ?
+                    BANK_NARROW_EXT : BANK_NARROW_Q;            
+                //
+                default:                            narrow_xy_bank <= 2'bXX;
+                //
+            endcase        
+            // Narrow Enable: high through INIT/TRIG, dropped in BUSY by the per-phase completion flag
+            case (fsm_state_next)
+                //
+                FSM_STATE_MULT_SQUARE_COL_0_INIT,
+                FSM_STATE_MULT_SQUARE_COL_N_INIT,
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG:   narrow_xy_ena <= 1'b1;
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   narrow_xy_ena <= ~square_almost_done_flop;
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:   narrow_xy_ena <= 1'b1;
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   narrow_xy_ena <= !col_is_last ? ~triangle_almost_done_flop : ~triangle_surely_done_flop; 
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:   narrow_xy_ena <= 1'b1;
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   narrow_xy_ena <= ~rectangle_surely_done_flop;
+                // any other state: narrow reads are disabled
+                default:                              narrow_xy_ena <= 1'b0;
+                //
+            endcase
+            //
+        end
+
+
+    //
+    // Wide Storage Control Logic
+    //
+
+    wire [2:0] wide_offset_rom[0:3];   // constant start offsets; NOTE(review): depth 4 is hard-coded -- presumably NUM_MULTS == 8, confirm
+    
+    generate for (z=1; z<NUM_MULTS; z=z+2)
+        begin : gen_wide_offset_rom    // entry (z-1)/2 holds the low 3 bits of the odd value z, i.e. 1, 3, 5, ...
+            assign wide_offset_rom[(z-1)/2] = z[2:0];
+        end
+    endgenerate    
+
+    function  [7:0] wide_xy_addr_next;       // next wide address: counts down, wrapping from 0 to wide_xy_addr_last
+        input [7:0] wide_xy_addr_current;    // address presented in the current cycle
+        input [7:0] wide_xy_addr_last;       // wrap-around value (the callers in this module pass word_index_last)
+        begin
+            if (wide_xy_addr_current > 8'd0)
+                wide_xy_addr_next = wide_xy_addr_current - 1'b1;
+            else
+                wide_xy_addr_next = wide_xy_addr_last;
+        end
+    endfunction
+    
+    integer j;
+    always @(posedge clk)
+        // Wide storage read control. Only the two enables are reset; addresses and banks are X in unlisted states.
+        if (rst) begin
+            wide_xy_ena <= 1'b0;
+            wide_xy_ena_aux <= 1'b0;
+        end else begin
+            //
+            // Wide Address
+            //        
+            for (j=0; j<(NUM_MULTS/2); j=j+1)
+                // every lane j follows the same schedule and differs only in its start offset wide_offset_rom[j]
+                case (fsm_state_next)
+                    //
+                    // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions!
+                    // INIT loads {column, offset}; TRIG/BUSY step with wide_xy_addr_next() (count down, wrap to word_index_last)
+                    FSM_STATE_MULT_SQUARE_COL_0_INIT:   wide_xy_addr[j] <= {5'd0, wide_offset_rom[j]};
+                    FSM_STATE_MULT_SQUARE_COL_N_INIT:   wide_xy_addr[j] <= {col_index_next, wide_offset_rom[j]};
+                    FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                    FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                    FSM_STATE_MULT_SQUARE_COL_N_BUSY:   wide_xy_addr[j] <= wide_xy_addr_next(wide_xy_addr[j], word_index_last);
+                    //
+                    FSM_STATE_MULT_TRIANGLE_COL_0_INIT:   wide_xy_addr[j] <= {5'd0, wide_offset_rom[j]};
+                    FSM_STATE_MULT_TRIANGLE_COL_N_INIT:   wide_xy_addr[j] <= {col_index_next, wide_offset_rom[j]};
+                    FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                    FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+                    FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                    FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_addr[j] <= wide_xy_addr_next(wide_xy_addr[j], word_index_last);
+                    //
+                    FSM_STATE_MULT_RECTANGLE_COL_0_INIT:   wide_xy_addr[j] <= {5'd0, wide_offset_rom[j]};
+                    FSM_STATE_MULT_RECTANGLE_COL_N_INIT:   wide_xy_addr[j] <= {col_index_next, wide_offset_rom[j]};
+                    FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                    FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+                    FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                    FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   wide_xy_addr[j] <= wide_xy_addr_next(wide_xy_addr[j], word_index_last);
+                    //
+                    default:                            wide_xy_addr[j] <= 8'dX;
+                endcase
+            //
+            // Wide Aux Address
+            //
+            case (fsm_state_next)
+                //
+                // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions!
+                // square/triangle: start at 1 in every column and step like the main lanes
+                FSM_STATE_MULT_SQUARE_COL_0_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
+                FSM_STATE_MULT_SQUARE_COL_N_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   wide_xy_addr_aux <= wide_xy_addr_next(wide_xy_addr_aux, word_index_last);
+                //
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_addr_aux <= wide_xy_addr_next(wide_xy_addr_aux, word_index_last);
+                // rectangle: the aux port follows the recombinator output address whenever rcmb_xy_valid is set
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT:   wide_xy_addr_aux <= 8'dX;//{5'd0, 3'd0};
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
+                FSM_STATE_MULT_RECTANGLE_HOLDOFF:    wide_xy_addr_aux <= rcmb_xy_valid ? rcmb_xy_addr : 8'dX;
+                //recomb_fat_bram_xy_dout_valid && (recomb_fat_bram_xy_bank == BANK_FAT_ML) ?
+                    //mac_fat_bram_xy_addr[4] + 1'b1 : mac_fat_bram_xy_addr[4];
+                //
+                default:                            wide_xy_addr_aux <= 8'dX;
+            endcase
+            //
+            // Wide Bank
+            //
+            case (fsm_state_next)
+                FSM_STATE_MULT_SQUARE_COL_0_INIT,
+                FSM_STATE_MULT_SQUARE_COL_N_INIT,
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:    wide_xy_bank <= BANK_WIDE_T1T2;
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:  wide_xy_bank <= BANK_WIDE_ABL;
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_bank <= BANK_WIDE_ABL;
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,    
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   wide_xy_bank <= BANK_WIDE_N;            
+                default:                             wide_xy_bank <= 2'bXX;    // don't-care sized to the 2-bit register
+            endcase
+            //
+            // Wide Aux Bank
+            //
+            case (fsm_state_next)
+                FSM_STATE_MULT_SQUARE_COL_0_INIT,
+                FSM_STATE_MULT_SQUARE_COL_N_INIT,
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   wide_xy_bank_aux <= BANK_WIDE_T1T2;
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: wide_xy_bank_aux <= BANK_WIDE_ABH;
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_bank_aux <= BANK_WIDE_ABL;
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,    
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
+                FSM_STATE_MULT_RECTANGLE_HOLDOFF:   if (rcmb_xy_valid) // rewrite using "Kolya-style" here (get rid of too many xxx's)
+                    case (rcmb_xy_bank)
+                        BANK_RCMB_ML: wide_xy_bank_aux <= BANK_WIDE_ABL;
+                        BANK_RCMB_MH: wide_xy_bank_aux <= BANK_WIDE_ABH;
+                        //BANK_RDCT_EXT: wide_xy_bank_aux <= BANK_WIDE_EXT; 2'bXX
+                        default: wide_xy_bank_aux <= 2'bXX; 
+                     endcase
+                     else wide_xy_bank_aux <= 2'bXX;
+                default:                            wide_xy_bank_aux <= 2'bXX;
+            endcase
+            //
+            // Wide Enable
+            //
+            case (fsm_state_next)
+                FSM_STATE_MULT_SQUARE_COL_0_INIT,
+                FSM_STATE_MULT_SQUARE_COL_N_INIT,
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   wide_xy_ena <= 1'b1;
+                default:                               wide_xy_ena <= 1'b0;
+            endcase
+            //
+            // Wide Aux Enable (the square states are not listed, so they take the default and leave the aux port disabled)
+            //
+            case (fsm_state_next)
+                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
+                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:  wide_xy_ena_aux <= 1'b1;
+                FSM_STATE_MULT_RECTANGLE_COL_0_INIT: wide_xy_ena_aux <= 1'b0;//1'b1;
+                FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
+                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+                FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
+                FSM_STATE_MULT_RECTANGLE_HOLDOFF:   wide_xy_ena_aux <= rcmb_xy_valid;// && (recomb_fat_bram_xy_bank == BANK_FAT_ML);
+                default:                            wide_xy_ena_aux <= 1'b0;
+            endcase
+            //
+        end
+        
+        
+    //
+    // Delay Lines
+    //
+    always @(posedge clk)
+        // one-clock delayed copy of the narrow read address (no reset); used by the mode_z calculation
+        narrow_xy_addr_dly <= narrow_xy_addr;
+
+    
+    //
+    // DSP Array Logic
+    //
+    reg             dsp_xy_ce_a = 1'b0;       // clock enables shared by the X and Y DSP array instances
+    reg             dsp_xy_ce_b = 1'b0;
+    reg             dsp_xy_ce_b_dly = 1'b0;   // dsp_xy_ce_b delayed by one clock
+    reg             dsp_xy_ce_m = 1'b0;
+    reg             dsp_xy_ce_p = 1'b0;
+    reg             dsp_xy_ce_mode = 1'b0;
+    
+    reg  [9   -1:0] dsp_xy_mode_z = {9{1'b1}};   // 9-bit mode word fed to both arrays
+    
+    wire [5*18-1:0] dsp_x_a;                  // five 18-bit A operands per array
+    wire [5*18-1:0] dsp_y_a;
+
+    reg  [1*16-1:0] dsp_x_b;                  // one 16-bit B operand per array
+    reg  [1*16-1:0] dsp_y_b;
+    
+    reg  [ 1:0] dsp_xy_b_carry;               // bits [17:16] held over from the previous narrow word
+
+    wire [9*47-1:0] dsp_x_p;            
+    wire [9*47-1:0] dsp_y_p;
+        
+    //generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        //begin : gen_dsp_xy_a_split
+            //assign dsp_x_a[18*z+:18] = rd_wide_x_dout[z];
+            //assign dsp_y_a[18*z+:18] = rd_wide_y_dout[z];
+        //end
+    //endgenerate
+    
+    assign dsp_x_a = {rd_wide_x_dout_aux, rd_wide_x_dout};   // aux word on top of the wide words; NOTE(review): fills 5*18 bits only if NUM_MULTS == 8 -- confirm
+    assign dsp_y_a = {rd_wide_y_dout_aux, rd_wide_y_dout};
+    
+    //assign dsp_x_a[18*4+:18] = rd_wide_x_dout_aux;
+    //assign dsp_y_a[18*4+:18] = rd_wide_y_dout_aux;
+            
+    always @(posedge clk)
+        // not reset here; relies on the declaration initialiser above
+        dsp_xy_ce_b_dly <= dsp_xy_ce_b;
+    
+
+    modexpng_dsp_array_block dsp_array_block_x   // X-side DSP array; control inputs are shared with the Y-side instance
+    (
+        .clk            (clk),
+        
+        .ce_a           (dsp_xy_ce_a),
+        .ce_b           (dsp_xy_ce_b),
+        .ce_m           (dsp_xy_ce_m),
+        .ce_p           (dsp_xy_ce_p),
+        .ce_mode        (dsp_xy_ce_mode),
+
+        .mode_z         (dsp_xy_mode_z),
+        
+        .a              (dsp_x_a),            // {aux, wide} X read data
+        .b              (dsp_x_b),            // registered narrow X operand
+        .p              (dsp_x_p)             // goes to the recombinator
+    );
+
+    modexpng_dsp_array_block dsp_array_block_y   // Y-side DSP array; control inputs are shared with the X-side instance
+    (
+        .clk            (clk),
+        
+        .ce_a           (dsp_xy_ce_a),
+        .ce_b           (dsp_xy_ce_b),
+        .ce_m           (dsp_xy_ce_m),
+        .ce_p           (dsp_xy_ce_p),
+        .ce_mode        (dsp_xy_ce_mode),
+
+        .mode_z         (dsp_xy_mode_z),
+        
+        .a              (dsp_y_a),            // {aux, wide} Y read data
+        .b              (dsp_y_b),            // registered narrow Y operand
+        .p              (dsp_y_p)             // goes to the recombinator
+    );
+
+
+   
+
+    //
+    // DSP Control Logic
+    //
+    reg narrow_xy_ena_dly1 = 1'b0;   // narrow_xy_ena delayed by one clock
+    reg narrow_xy_ena_dly2 = 1'b0;   // narrow_xy_ena delayed by two clocks
+    
+    always @(posedge clk)
+        // Derives the DSP clock enables from delayed copies of narrow_xy_ena.
+        if (rst) begin
+            //
+            narrow_xy_ena_dly1 <= 1'b0;
+            narrow_xy_ena_dly2 <= 1'b0;
+            //
+            dsp_xy_ce_a    <= 1'b0;
+            dsp_xy_ce_b    <= 1'b0;
+            dsp_xy_ce_m    <= 1'b0;
+            dsp_xy_ce_p    <= 1'b0;
+            dsp_xy_ce_mode <= 1'b0;
+            //
+        end else begin
+            //
+            narrow_xy_ena_dly1 <= narrow_xy_ena;
+            narrow_xy_ena_dly2 <= narrow_xy_ena_dly1; 
+            // ce_a covers both delay taps; ce_b follows the second tap only
+            dsp_xy_ce_a    <= narrow_xy_ena_dly1 | narrow_xy_ena_dly2;
+            dsp_xy_ce_b    <= narrow_xy_ena_dly2;
+            dsp_xy_ce_m    <= dsp_xy_ce_b_dly;    // ce_m and ce_mode both follow the delayed ce_b
+            dsp_xy_ce_p    <= dsp_xy_ce_m;        // ce_p trails ce_m by one clock
+            dsp_xy_ce_mode <= dsp_xy_ce_b_dly;
+            //
+        end    
+        
+    //
+    // DSP Feed Logic
+    //
+    reg dsp_merge_xy_b;   // NOTE(review): no reset and no initialiser, unlike the other control flops -- confirm X until the first write is acceptable
+    
+    always @(posedge clk)
+        // Sticky select: set by the square COL_0 trigger, cleared by the triangle COL_0 trigger, held otherwise.
+        case (fsm_state)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG:   dsp_merge_xy_b <= 1'b1;
+            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG: dsp_merge_xy_b <= 1'b0;
+        endcase
+
+    //
+    // On-the-fly Carry Recombination
+    //
+    wire [17:0] rd_narrow_x_dout_carry = rd_narrow_x_dout + {{16{1'b0}}, dsp_xy_b_carry};   // narrow word plus the 2-bit carry held from the previous word
+    wire [17:0] rd_narrow_y_dout_carry = rd_narrow_y_dout + {{16{1'b0}}, dsp_xy_b_carry};
+    wire [17:0] rd_narrow_xy_dout_carry_mux = ladder_mode ? rd_narrow_y_dout_carry : rd_narrow_x_dout_carry;   // ladder_mode picks the Y word, otherwise X
+  
+    always @(posedge clk)
+        // Registers the B operands for both DSP arrays whenever narrow_xy_ena_dly2 is set.
+        if (narrow_xy_ena_dly2) begin // rewrite
+            // merge off: X and Y get their own words, bits [17:16] are dropped and the carry is cleared
+            if (!dsp_merge_xy_b) begin
+                dsp_x_b <= rd_narrow_x_dout[15:0];
+                dsp_y_b <= rd_narrow_y_dout[15:0];
+                dsp_xy_b_carry <= 2'b00;
+            end else begin
+                dsp_x_b <= rd_narrow_xy_dout_carry_mux[15:0];   // merge on: both arrays receive the same muxed operand
+                dsp_y_b <= rd_narrow_xy_dout_carry_mux[15:0];
+                dsp_xy_b_carry <= rd_narrow_xy_dout_carry_mux[17:16];   // bits above 16 become the carry into the next word
+            end                 
+            //
+        end else begin
+            // idle: operands are don't-care and the carry chain is cleared
+            dsp_x_b <= {16{1'bX}};
+            dsp_y_b <= {16{1'bX}};
+            //
+            dsp_xy_b_carry <= 2'b00;
+            //
+        end
+
+        
+    reg  [9   -1:0] dsp_xy_mode_z_adv1 = {9{1'b1}};   // mode word one clock ahead of dsp_xy_mode_z
+    reg  [9   -1:0] dsp_xy_mode_z_adv2 = {9{1'b1}};   // two clocks ahead
+    reg  [9   -1:0] dsp_xy_mode_z_adv3 = {9{1'b1}};   // three clocks ahead
+    reg  [9   -1:0] dsp_xy_mode_z_adv4 = {9{1'b1}};   // four clocks ahead; loaded from the FSM-keyed case below
+        
+         function  [NUM_MULTS:0] calc_mac_mode_z_square;   // mode_z mask for the square phase
+        input [        4:0] col_index_value;                // column index to compare against
+        input [        7:0] narrow_xy_addr_value;           // narrow address: [7:3] is compared, [2:0] selects the bit
+        begin
+            if (narrow_xy_addr_value[7:3] == col_index_value)   // address lies in this column: clear exactly one of the low 8 bits
+                case (narrow_xy_addr_value[2:0])
+                    3'b000: calc_mac_mode_z_square = {1'b1, 8'b11111110};
+                    3'b001: calc_mac_mode_z_square = {1'b1, 8'b11111101};
+                    3'b010: calc_mac_mode_z_square = {1'b1, 8'b11111011};
+                    3'b011: calc_mac_mode_z_square = {1'b1, 8'b11110111};
+                    3'b100: calc_mac_mode_z_square = {1'b1, 8'b11101111};
+                    3'b101: calc_mac_mode_z_square = {1'b1, 8'b11011111};
+                    3'b110: calc_mac_mode_z_square = {1'b1, 8'b10111111};
+                    3'b111: calc_mac_mode_z_square = {1'b1, 8'b01111111};
+                endcase   // NOTE(review): the 8-bit literals presumably require NUM_MULTS == 8 -- confirm
+            else
+                calc_mac_mode_z_square = {1'b1, {NUM_MULTS{1'b1}}};   // any other column: all ones
+        end
+    endfunction
+    
+    function  [NUM_MULTS:0] calc_mac_mode_z_rectangle;   // mode_z mask for the rectangle phase (same table as the square variant)
+        input [        4:0] col_index_value;              // column index to compare against
+        input [        7:0] narrow_xy_addr_value;         // narrow address: [7:3] is compared, [2:0] selects the bit
+        begin
+            if (narrow_xy_addr_value[7:3] == col_index_value)   // address lies in this column: clear exactly one of the low 8 bits
+                case (narrow_xy_addr_value[2:0])
+                    3'b000: calc_mac_mode_z_rectangle = {1'b1, 8'b11111110};
+                    3'b001: calc_mac_mode_z_rectangle = {1'b1, 8'b11111101};
+                    3'b010: calc_mac_mode_z_rectangle = {1'b1, 8'b11111011};
+                    3'b011: calc_mac_mode_z_rectangle = {1'b1, 8'b11110111};
+                    3'b100: calc_mac_mode_z_rectangle = {1'b1, 8'b11101111};
+                    3'b101: calc_mac_mode_z_rectangle = {1'b1, 8'b11011111};
+                    3'b110: calc_mac_mode_z_rectangle = {1'b1, 8'b10111111};
+                    3'b111: calc_mac_mode_z_rectangle = {1'b1, 8'b01111111};
+                endcase   // NOTE(review): the 8-bit literals presumably require NUM_MULTS == 8 -- confirm
+            else
+                calc_mac_mode_z_rectangle = {1'b1, {NUM_MULTS{1'b1}}};   // any other column: all ones
+        end
+    endfunction
+        
+    always @(posedge clk)
+        // Head of the mode_z delay line: all zeros on every *_TRIG state, a per-phase mask during *_BUSY, all ones otherwise.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG:       dsp_xy_mode_z_adv4 <= {9{1'b0}};
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:       dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, narrow_xy_addr_dly);
+            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:     dsp_xy_mode_z_adv4 <= {9{1'b0}};    // so easy
+            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:     dsp_xy_mode_z_adv4 <= {9{1'b1}};
+            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:     dsp_xy_mode_z_adv4 <= {9{1'b0}};    // so easy
+            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:     dsp_xy_mode_z_adv4 <= calc_mac_mode_z_rectangle(col_index_prev, narrow_xy_addr_dly);
+            default:                                dsp_xy_mode_z_adv4 <= {9{1'b1}};
+        endcase
+
+    always @(posedge clk) begin
+        // Four-stage delay line adv4 -> adv3 -> adv2 -> adv1 -> dsp_xy_mode_z,
+        // written as one concatenated shift (every register is 9 bits wide).
+        {dsp_xy_mode_z, dsp_xy_mode_z_adv1, dsp_xy_mode_z_adv2, dsp_xy_mode_z_adv3} <=
+            {dsp_xy_mode_z_adv1, dsp_xy_mode_z_adv2, dsp_xy_mode_z_adv3, dsp_xy_mode_z_adv4};
+        //
+    end
+        
+
+
+    
+    
+    //
+    // Recombinator
+    //
+    reg  rcmb_ena = 1'b0;   // start strobe for the recombinator
+    wire rcmb_rdy;
+
+    modexpng_recombinator_block recombinator_block   // takes the DSP P outputs and drives this module's rcmb_* outputs
+    (
+        .clk                            (clk),       // NOTE(review): no reset is passed to this instance -- confirm intended
+        
+        .ena                            (rcmb_ena),
+        .rdy                            (rcmb_rdy),
+        
+        .fsm_state_next                 (fsm_state_next),
+        
+        .word_index_last                (word_index_last),
+        
+        .dsp_xy_ce_p                    (dsp_xy_ce_p),
+        .dsp_x_p                        (dsp_x_p),
+        .dsp_y_p                        (dsp_y_p),
+        
+        .col_index                      (col_index),
+        .col_index_last                 (col_index_last),
+        
+        .rd_narrow_xy_addr                 (narrow_xy_addr),
+        .rd_narrow_xy_bank                 (narrow_xy_bank),
+        
+        .rcmb_wide_xy_bank          (rcmb_wide_xy_bank),
+        .rcmb_wide_xy_addr          (rcmb_wide_xy_addr),
+        .rcmb_wide_x_dout           (rcmb_wide_x_dout),
+        .rcmb_wide_y_dout           (rcmb_wide_y_dout),
+        .rcmb_wide_xy_valid         (rcmb_wide_xy_valid),
+        
+        .rcmb_narrow_xy_bank        (rcmb_narrow_xy_bank),
+        .rcmb_narrow_xy_addr        (rcmb_narrow_xy_addr),
+        .rcmb_narrow_x_dout         (rcmb_narrow_x_dout),
+        .rcmb_narrow_y_dout         (rcmb_narrow_y_dout),
+        .rcmb_narrow_xy_valid       (rcmb_narrow_xy_valid),
+        
+        .rdct_narrow_xy_bank        (rcmb_xy_bank),      // NOTE(review): rdct_narrow_* ports are wired to the rcmb_xy_* outputs -- naming differs, confirm
+        .rdct_narrow_xy_addr        (rcmb_xy_addr),
+        .rdct_narrow_x_dout         (rcmb_x_dout),
+        .rdct_narrow_y_dout         (rcmb_y_dout),
+        .rdct_narrow_xy_valid       (rcmb_xy_valid)      // also steers the wide aux read port during the rectangle phase
+
+    );
+    
+    
+    //
+    // Recombinator Enable Logic
+    //    
+    always @(posedge clk)
+        // synchronous reset; otherwise set only when dsp_xy_ce_a is high while ce_b, ce_m and ce_p are all low
+        if (rst) rcmb_ena <= 1'b0;
+        else     rcmb_ena <= dsp_xy_ce_a && !dsp_xy_ce_b && !dsp_xy_ce_m && !dsp_xy_ce_p;
+
+        
+    //
+    // Handy Completion Flags
+    //    
+    wire square_done    = square_surely_done_flop;    // used by the MULT_SQUARE_*_BUSY states
+    wire triangle_done  = !col_is_last ? triangle_surely_done_flop : triangle_tardy_done_flop;    // last column uses the "tardy" flop instead
+    wire rectangle_done = rectangle_tardy_done_flop;    // used by the MULT_RECTANGLE_*_BUSY states
+    
+
+    //
+    // FSM Transition Logic
+    //
+    assign fsm_state_after_mult_square    = col_is_last ? FSM_STATE_MULT_SQUARE_HOLDOFF   : FSM_STATE_MULT_SQUARE_COL_N_INIT;
+    assign fsm_state_after_mult_triangle  = col_is_last ? FSM_STATE_MULT_TRIANGLE_HOLDOFF : FSM_STATE_MULT_TRIANGLE_COL_N_INIT;
+    assign fsm_state_after_mult_rectangle = col_is_last ? FSM_STATE_MULT_RECTANGLE_HOLDOFF : FSM_STATE_MULT_RECTANGLE_COL_N_INIT;
+
+    always @* begin
+        // default assignment; every branch of the case below (including its own default) overrides it
+        fsm_state_next = FSM_STATE_IDLE;
+        //
+        case (fsm_state)
+            FSM_STATE_IDLE:                   fsm_state_next = ena                   ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE;
+                        
+            FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_0_TRIG ;
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = square_done ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+            
+            FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_N_TRIG ;
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = square_done ? fsm_state_after_mult_square    : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+            
+            FSM_STATE_MULT_SQUARE_HOLDOFF:    fsm_state_next =                         rcmb_rdy ? FSM_STATE_MULT_TRIANGLE_COL_0_INIT : FSM_STATE_MULT_SQUARE_HOLDOFF;
+
+            FSM_STATE_MULT_TRIANGLE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_0_TRIG ;
+            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_0_BUSY ;
+            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY: fsm_state_next = triangle_done ? FSM_STATE_MULT_TRIANGLE_COL_N_INIT : FSM_STATE_MULT_TRIANGLE_COL_0_BUSY;     
+            
+            FSM_STATE_MULT_TRIANGLE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_N_TRIG ;
+            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_TRIANGLE_COL_N_BUSY ;
+            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: fsm_state_next = triangle_done ? fsm_state_after_mult_triangle : FSM_STATE_MULT_TRIANGLE_COL_N_BUSY;
+            
+            FSM_STATE_MULT_TRIANGLE_HOLDOFF:    fsm_state_next =                         rcmb_rdy ? FSM_STATE_MULT_RECTANGLE_COL_0_INIT : FSM_STATE_MULT_TRIANGLE_HOLDOFF;
+
+            FSM_STATE_MULT_RECTANGLE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_0_TRIG ;
+            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_0_BUSY ;
+            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY: fsm_state_next = rectangle_done ? FSM_STATE_MULT_RECTANGLE_COL_N_INIT : FSM_STATE_MULT_RECTANGLE_COL_0_BUSY;     
+            
+            FSM_STATE_MULT_RECTANGLE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_N_TRIG ;
+            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_RECTANGLE_COL_N_BUSY ;
+            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: fsm_state_next = rectangle_done ? fsm_state_after_mult_rectangle : FSM_STATE_MULT_RECTANGLE_COL_N_BUSY;
+            
+            FSM_STATE_MULT_RECTANGLE_HOLDOFF:    fsm_state_next =                         rcmb_rdy ? FSM_STATE_STOP : FSM_STATE_MULT_RECTANGLE_HOLDOFF;
+            // FSM_STATE_STOP has no case item of its own; it returns to IDLE through the default below
+            default:                          fsm_state_next =                         FSM_STATE_IDLE                   ;
+
+        endcase
+        //
+    end
+
+
+    //
+    // Reductor Control Logic
+    //
+    reg rdct_ena_reg = 1'b0;    // registered source that drives rdct_ena
+
+    assign rdct_ena = rdct_ena_reg;
+    // Set on the clock after fsm_state is RECTANGLE_COL_0_INIT; synchronous rst clear, as for rcmb_ena.
+    always @(posedge clk)
+        if (rst) rdct_ena_reg <= 1'b0;
+        else case (fsm_state)
+           FSM_STATE_MULT_RECTANGLE_COL_0_INIT: rdct_ena_reg <= 1'b1;
+           default:                             rdct_ena_reg <= 1'b0;
+        endcase
+    
+    
+    
+endmodule
diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm_old.vh
similarity index 100%
rename from rtl/modexpng_mmm_fsm.vh
rename to rtl/modexpng_mmm_fsm_old.vh
diff --git a/rtl/modexpng_mmm_pad.v b/rtl/modexpng_mmm_pad.v
deleted file mode 100644
index a2a21ff..0000000
--- a/rtl/modexpng_mmm_pad.v
+++ /dev/null
@@ -1,153 +0,0 @@
-module modexpng_mmm_pad
-(
-    clk, rst_n,
-    fsm_state,
-    load_xy_addr_lsb,
-    pad_x_rd_addr, pad_y_rd_addr,
-    pad_x_rd_ena,  pad_y_rd_ena,
-    pad_x_rd_dout, pad_y_rd_dout,
-    load_x_din,    load_y_din
-);
-
-
-    //
-    // Includes
-    //
-    `include "modexpng_parameters.vh"
-    //`include "modexpng_parameters_x8.vh"
-    `include "modexpng_mmm_fsm.vh"
-
-
-    //
-    // Parameters
-    //
-    parameter INDEX_WIDTH = 6;
-    
-    
-    //
-    // Ports
-    //
-    input                        clk;
-    input                        rst_n;
-    input  [FSM_STATE_WIDTH-1:0] fsm_state;
-
-    input [INDEX_WIDTH-1:0] load_xy_addr_lsb;
-    
-    input [WORD_WIDTH-1:0] load_x_din;
-    input [WORD_WIDTH-1:0] load_y_din;
-    
-    input [INDEX_WIDTH-1:0] pad_x_rd_addr;
-    input [INDEX_WIDTH-1:0] pad_y_rd_addr;
-    
-    input                   pad_x_rd_ena;
-    input                   pad_y_rd_ena;
-    
-    output [WORD_WIDTH-1:0] pad_x_rd_dout;
-    output [WORD_WIDTH-1:0] pad_y_rd_dout;
-    
- 
-    //
-    // Registers
-    //
-    reg [INDEX_WIDTH-1:0] pad_x_wr_addr;
-    reg [INDEX_WIDTH-1:0] pad_y_wr_addr;
-    reg                   pad_x_wr_ena;
-    reg                   pad_y_wr_ena;
-    reg [ WORD_WIDTH-1:0] pad_x_wr_din;
-    reg [ WORD_WIDTH-1:0] pad_y_wr_din;
-        
-    bram_1wo_1ro_readfirst_ce #
-    (
-        .MEM_WIDTH      (WORD_WIDTH),
-        .MEM_ADDR_BITS  (INDEX_WIDTH)
-    )
-    pad_x
-    (
-        .clk        (clk),
-
-        .a_addr     (pad_x_wr_addr),
-        .a_en       (pad_x_wr_ena),
-        .a_wr       (pad_x_wr_ena),
-        .a_in       (pad_x_wr_din),
-        .a_out      (), // unused
-
-        .b_addr     (pad_x_rd_addr),
-        .b_en       (pad_x_rd_ena),
-        .b_out      (pad_x_rd_dout)
-    );    
-
-    bram_1wo_1ro_readfirst_ce #
-    (
-        .MEM_WIDTH      (WORD_WIDTH),
-        .MEM_ADDR_BITS  (INDEX_WIDTH)
-    )
-    pad_y
-    (
-        .clk        (clk),
-
-        .a_addr     (pad_y_wr_addr),
-        .a_en       (pad_y_wr_ena),
-        .a_wr       (pad_y_wr_ena),
-        .a_in       (pad_y_wr_din),
-        .a_out      (), // unused
-
-        .b_addr     (pad_y_rd_addr),
-        .b_en       (pad_y_rd_ena),
-        .b_out      (pad_y_rd_dout)
-    );    
-  
-
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3: begin
-                pad_x_wr_addr <= load_xy_addr_lsb;
-                pad_y_wr_addr <= load_xy_addr_lsb;
-            end
-            //
-            default: begin
-                pad_x_wr_addr <= {INDEX_WIDTH{1'bX}};
-                pad_y_wr_addr <= {INDEX_WIDTH{1'bX}};
-            end
-            //
-        endcase
-
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3: begin
-                pad_x_wr_din <= load_x_din;
-                pad_y_wr_din <= load_y_din;
-            end
-            //
-            default: begin
-                pad_x_wr_din <= load_x_din;
-                pad_y_wr_din <= load_y_din;
-            end
-            //
-        endcase
-        
-  
-    always @(posedge clk or negedge rst_n)
-        //
-        if (!rst_n) begin
-            pad_x_wr_ena <= 1'b0;
-            pad_y_wr_ena <= 1'b0;
-        end else case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3: begin
-                pad_x_wr_ena <= 1'b1;
-                pad_y_wr_ena <= 1'b1;
-            end
-            //
-            default: begin
-                pad_x_wr_ena <= 1'b0;
-                pad_y_wr_ena <= 1'b0;
-            end
-            //
-        endcase
-
-
-endmodule
diff --git a/rtl/modexpng_mmm_transporter.v b/rtl/modexpng_mmm_transporter.v
deleted file mode 100644
index a8f309a..0000000
--- a/rtl/modexpng_mmm_transporter.v
+++ /dev/null
@@ -1,157 +0,0 @@
-module modexpng_mmm_transporter
-(
-    clk,
-    ena,
-    index_last,
-    fsm_state,
-    fsm_state_next,
-    load_phase,
-    load_xy_addr,
-    load_xy_addr_vld,
-    load_xy_req,
-    load_addr_zero,
-    load_t1t2_addr_done,
-    load_nn_coeff_addr_done
-);
-
-
-    //
-    // Includes
-    //
-    //`include "modexpng_parameters.vh"
-    //`include "modexpng_parameters_x8.vh"
-    `include "modexpng_mmm_fsm.vh"
-
-
-    //
-    // Parameters
-    //
-    parameter INDEX_WIDTH = 6;
-
-
-    //
-    // Ports
-    //
-    input                        clk;
-    input                        ena;
-    input  [    INDEX_WIDTH-1:0] index_last;
-    input  [FSM_STATE_WIDTH-1:0] fsm_state;
-    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;
-    output                       load_phase;
-    output [      INDEX_WIDTH:0] load_xy_addr;
-    output                       load_xy_addr_vld;
-    output                       load_xy_req;
-    output                       load_addr_zero;
-    output                       load_t1t2_addr_done;
-    output                       load_nn_coeff_addr_done;
-    
-
-    //
-    // Load Address Generator
-    //
-    reg                 load_phase_reg;
-    reg [INDEX_WIDTH:0] load_xy_addr_reg;
-    reg                 load_xy_addr_vld_reg;
-    reg                 load_xy_req_reg;
-
-    
-    //
-    // Mapping
-    //
-    assign load_phase       = load_phase_reg;
-    assign load_xy_addr     = load_xy_addr_reg;
-    assign load_xy_addr_vld = load_xy_addr_vld_reg;
-    assign load_xy_req      = load_xy_req_reg;
-
-    
-    //
-    // Handy Quantities
-    //
-    wire [INDEX_WIDTH:0] load_xy_addr_zero = {{INDEX_WIDTH{1'b0}}, 1'b0};
-    wire [INDEX_WIDTH:0] load_xy_addr_next = load_xy_addr_reg + 1'b1;
-    wire [INDEX_WIDTH:0] load_xy_addr_xxx = {{INDEX_WIDTH{1'bX}}, 1'bX};
-    
-    
-    //
-    // More Handy Quantities
-    //
-    reg [INDEX_WIDTH:0] load_t1t2_addr_last;
-    reg [INDEX_WIDTH:0] load_nn_coeff_addr_last;
-
-    
-    //
-    // Flags
-    //
-    assign load_addr_zero          = load_xy_addr_reg == load_xy_addr_zero;
-    assign load_t1t2_addr_done     = load_xy_addr_reg == load_t1t2_addr_last;
-    assign load_nn_coeff_addr_done = load_xy_addr_reg == load_nn_coeff_addr_last;
-    
-    
-    //
-    // Last Index Latch
-    //
-    always @(posedge clk)
-        //
-        if (ena && (fsm_state == FSM_STATE_IDLE)) begin
-            load_t1t2_addr_last     <= {1'b0, index_last};
-            load_nn_coeff_addr_last <= {1'b0, index_last} + 1'b1;
-        end
-    
-
-    //
-    // Update Load Phase
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_LOAD_T1T2_1,
-            FSM_STATE_LOAD_T1T2_2,
-            FSM_STATE_LOAD_T1T2_3:      load_phase_reg <= 1'b0;
-            FSM_STATE_LOAD_NN_COEFF_1,
-            FSM_STATE_LOAD_NN_COEFF_2,
-            FSM_STATE_LOAD_NN_COEFF_3:  load_phase_reg <= 1'b1;
-            default:                    load_phase_reg <= 1'bX;
-        endcase
-    
-    
-    //
-    // Update Load Address
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_LOAD_T1T2_1:     load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_T1T2_3) ? load_xy_addr_next : load_xy_addr_zero;
-            FSM_STATE_LOAD_T1T2_2,
-            FSM_STATE_LOAD_T1T2_3:     load_xy_addr_reg <= load_xy_addr_reg;
-            FSM_STATE_LOAD_NN_COEFF_1: load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_NN_COEFF_3) ? load_xy_addr_next : load_xy_addr_zero;
-            FSM_STATE_LOAD_NN_COEFF_2,
-            FSM_STATE_LOAD_NN_COEFF_3: load_xy_addr_reg <= load_xy_addr_reg;
-            default                    load_xy_addr_reg <= load_xy_addr_xxx;
-        endcase
-
-    
-    //
-    // Update Address Valid Flag
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_LOAD_T1T2_1,
-            FSM_STATE_LOAD_NN_COEFF_1: load_xy_addr_vld_reg <= 1'b1;
-            default                    load_xy_addr_vld_reg <= 1'b0;
-        endcase
-
-        
-    //
-    // Update Load Request Flag
-    //
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_LOAD_T1T2_2,
-            FSM_STATE_LOAD_NN_COEFF_2: load_xy_req_reg <= 1'b1;
-            default                    load_xy_req_reg <= 1'b0;
-        endcase
-
- 
-endmodule
diff --git a/rtl/modexpng_mmm_x8_dual.v b/rtl/modexpng_mmm_x8_dual.v
deleted file mode 100644
index 99a37fa..0000000
--- a/rtl/modexpng_mmm_x8_dual.v
+++ /dev/null
@@ -1,550 +0,0 @@
-module modexpng_mmm_x8_dual
-(
-    clk, rst_n,
-    ena, rdy,
-    mode, transfer,
-    index_last,
-    x_din,      y_din,      x_dout,      y_dout,
-    x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
-    x_din_ena,  y_din_ena,  x_dout_ena,  y_dout_ena, x_din_reg_ena, y_din_reg_ena,
-    x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
-    load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
-    load_x_din, load_y_din
-);
-
-
-    //
-    // Includes
-    //
-    `include "modexpng_parameters.vh"
-    `include "modexpng_parameters_x8.vh"
-    `include "modexpng_mmm_fsm.vh"
-
-
-    //
-    // Parameters
-    //
-    parameter INDEX_WIDTH = 6;
-    
-    
-    //
-    // Ports
-    //
-    input  clk;
-    input  rst_n;
-    
-    input  ena;
-    output rdy;
-        
-    input  mode;        // multiply:    0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
-                        // load/unload: 0 = load, 1 = unload
-    input  transfer;    // 0 = multiply, 1 = load/unload
-        
-    input  [INDEX_WIDTH-1:0] index_last;
-
-    input  [NUM_MULTS*WORD_WIDTH-1:0] x_din;
-    input  [NUM_MULTS*WORD_WIDTH-1:0] y_din;
-    output [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
-    output [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
-    
-    output [INDEX_WIDTH-4:0] x_din_addr;
-    output [INDEX_WIDTH-4:0] y_din_addr;
-    output [INDEX_WIDTH-4:0] x_dout_addr;
-    output [INDEX_WIDTH-4:0] y_dout_addr;
-    
-    output [        1-1:0] x_din_ena;
-    output [        1-1:0] y_din_ena;
-    output [NUM_MULTS-1:0] x_dout_ena;
-    output [NUM_MULTS-1:0] y_dout_ena;
-    output [        1-1:0] x_din_reg_ena;
-    output [        1-1:0] y_din_reg_ena;
-    
-    output [3-1:0] x_din_bank;
-    output [3-1:0] y_din_bank;
-    output [3-1:0] x_dout_bank;
-    output [3-1:0] y_dout_bank;
-    
-    output                  load_phase;         // 0 = T1, T2; 1 = N, N_COEFF
-    output [ INDEX_WIDTH:0] load_xy_addr;       // address
-    output                  load_xy_addr_vld;   // address valid
-    output                  load_xy_req;        // data request
-    
-    input  [WORD_WIDTH-1:0] load_x_din;         // data input
-    input  [WORD_WIDTH-1:0] load_y_din;         // data input
-
-
-    //
-    // FSM State and Next States
-    //
-    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
-    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
-    reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
-    reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
-    
-
-    //
-    // FSM Idle Next State
-    //
-    always @*
-        //
-        case ({transfer, mode})
-            2'b00,
-            2'b01: fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
-            2'b10: fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1;
-            2'b11: fsm_state_after_idle = FSM_STATE_IDLE; //unload?
-        endcase
-
-
-    //
-    // Column Counter
-    //
-    wire [    INDEX_WIDTH-4:0] col_index;
-    wire                       col_index_done;
-    wire [    INDEX_WIDTH-4:0] col_index_zero;
-    wire [    INDEX_WIDTH-4:0] col_index_next;
-    wire [    INDEX_WIDTH-4:0] col_index_prev;
-
-    modexpng_mmm_col_index #
-    (
-        .INDEX_WIDTH(INDEX_WIDTH)
-    )
-    mmm_col_index
-    (
-        .clk            (clk),
-        .index_last     (index_last),
-        .fsm_state_next (fsm_state_next),
-        .col_index      (col_index),
-        .col_index_done (col_index_done),
-        .col_index_zero (col_index_zero),
-        .col_index_next (col_index_next),
-        .col_index_prev (col_index_prev)
-    );
-
-
-    //
-    // Load Address Generator
-    //
-    wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0];
-    wire load_addr_zero;
-    wire load_t1t2_addr_done;
-    wire load_nn_coeff_addr_done;
-
-    modexpng_mmm_transporter #
-    (
-        .INDEX_WIDTH(INDEX_WIDTH)
-    )
-    transporter
-    (
-        .clk                        (clk),
-        .ena                        (ena),
-        .index_last                 (index_last),
-        .fsm_state                  (fsm_state),
-        .fsm_state_next             (fsm_state_next),
-        .load_phase                 (load_phase),
-        .load_xy_addr               (load_xy_addr),
-        .load_xy_addr_vld           (load_xy_addr_vld),
-        .load_xy_req                (load_xy_req),
-        .load_addr_zero             (load_addr_zero),    
-        .load_t1t2_addr_done        (load_t1t2_addr_done),
-        .load_nn_coeff_addr_done    (load_nn_coeff_addr_done)
-    );
- 
- 
-    //
-    // X, Y Address
-    //
-    wire [INDEX_WIDTH-1:0] x_din_addr_cnt;
-    wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last;
-    wire [          3-1:0] x_din_addr_cnt_lower_prev;
-    wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev;
-
-    modexpng_mmm_din_addr #
-    (
-        .INDEX_WIDTH(INDEX_WIDTH)
-    )
-    din_addr_x
-    (
-        .clk                        (clk),
-        .rst_n                      (rst_n),
-        .index_last                 (index_last),
-        .fsm_state_next             (fsm_state_next),
-        .col_index_zero             (col_index_zero),
-        .col_index_next             (col_index_next),
-        .din_addr                   (x_din_addr),
-        .din_bank                   (x_din_bank),
-        .din_ena                    (x_din_ena),
-        .din_reg_ena                (x_din_reg_ena),
-        .din_addr_cnt               (x_din_addr_cnt),
-        .din_addr_cnt_last          (x_din_addr_cnt_last),
-        .din_addr_cnt_lower_prev    (x_din_addr_cnt_lower_prev),
-        .din_addr_cnt_upper_prev    (x_din_addr_cnt_upper_prev)
-    );
-    
-    modexpng_mmm_dout_addr #
-    (
-        .INDEX_WIDTH(INDEX_WIDTH)
-    )
-    dout_addr_xy
-    (
-        .clk                        (clk),
-        .rst_n                      (rst_n),
-        .fsm_state                  (fsm_state),
-        .load_xy_addr               (load_xy_addr),
-        .load_addr_zero             (load_addr_zero),
-        .load_nn_coeff_addr_done    (load_nn_coeff_addr_done),
-        .x_dout_addr                (x_dout_addr),
-        .y_dout_addr                (y_dout_addr),
-        .x_dout_ena                 (x_dout_ena),
-        .y_dout_ena                 (y_dout_ena),
-        .x_dout_bank                (x_dout_bank),
-        .y_dout_bank                (y_dout_bank)
-    );          
-  
-
-    //
-    // Helper Memories ("Scratchpad")
-    //    
-    reg  [INDEX_WIDTH-1:0] pad_xy_rd_addr;
-    reg                    pad_xy_rd_ena = 1'b0;
-    wire [ WORD_WIDTH-1:0] pad_x_rd_dout;
-    wire [ WORD_WIDTH-1:0] pad_y_rd_dout;
-    
-    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}};
-    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1;
-    
-    modexpng_mmm_pad pad
-    (
-        .clk                (clk),
-        .rst_n              (rst_n),
-        .fsm_state          (fsm_state),
-        .load_xy_addr_lsb   (load_xy_addr_lsb),
-        .load_x_din         (load_x_din),
-        .load_y_din         (load_y_din),
-        .pad_x_rd_addr      (pad_xy_rd_addr),
-        .pad_y_rd_addr      (pad_xy_rd_addr),
-        .pad_x_rd_ena       (pad_xy_rd_ena),
-        .pad_y_rd_ena       (pad_xy_rd_ena),
-        .pad_x_rd_dout      (pad_x_rd_dout),
-        .pad_y_rd_dout      (pad_y_rd_dout)
-    );
-    
-    
-    always @(posedge clk or negedge rst_n)
-        //
-        if (!rst_n) begin
-            pad_xy_rd_ena <= 1'b0;
-        end else case (fsm_state_next)
-        
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
-                pad_xy_rd_ena <= 1'b1;
-                
-            default:
-                pad_xy_rd_ena <= 1'b0;
-        
-        endcase
-        
-    always @(posedge clk)
-        //
-        case (fsm_state_next)
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG:
-                pad_xy_rd_addr <= pad_xy_rd_addr_zero;
-                
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
-                pad_xy_rd_addr <= pad_xy_rd_addr_next;
-                
-            default:
-                pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};
-        
-        endcase
-  
-  
-    
-    
-    //
-    // Flags
-    //
-
-    wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;
-            
-    always @*
-        //
-        fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;;
-    
-    
-    //
-    // MAC Arrays
-    //
-    reg                                mac_x_ce = 1'b0;
-    reg                                mac_x_ce_aux = 1'b0;
-    reg  [NUM_MULTS              -1:0] mac_x_clr;
-    reg                                mac_x_clr_aux;
-    reg  [NUM_MULTS              -2:0] mac_x_casc_a;
-    reg                                mac_x_casc_a_aux;
-    wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
-    reg  [        1 * WORD_WIDTH -1:0] mac_x_a_aux;
-    //wire [        1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1];
-    reg  [        1 * WORD_WIDTH -1:0] mac_x_b;
-    wire [NUM_MULTS * MAC_WIDTH  -1:0] mac_x_p;
-    wire [        1 * MAC_WIDTH  -1:0] mac_x_p_aux;
-
-    reg                                mac_y_ce = 1'b0;
-    reg                                mac_y_ce_aux = 1'b0;
-    reg  [NUM_MULTS              -1:0] mac_y_clr;
-    reg                                mac_y_clr_aux;
-    reg  [NUM_MULTS              -2:0] mac_y_casc_a;
-    reg                                mac_y_casc_a_aux;
-    wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
-    reg  [        1 * WORD_WIDTH -1:0] mac_y_a_aux;
-    //wire [        1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1];
-    reg  [        1 * WORD_WIDTH -1:0] mac_y_b;
-    wire [NUM_MULTS * MAC_WIDTH  -1:0] mac_y_p;
-    wire [        1 * MAC_WIDTH  -1:0] mac_y_p_aux;
-    
-    modexpng_mac_array mac_array_x
-    (
-        .clk        (clk),
-        .ce         (mac_x_ce),
-        .ce_aux     (mac_x_ce_aux),
-        .clr        (mac_x_clr),
-        .clr_aux    (mac_x_clr_aux),
-        .casc_a     (mac_x_casc_a),
-        .casc_a_aux (mac_x_casc_a_aux),
-        .a_in       (mac_x_a),
-        .a_in_aux   (mac_x_a_aux),
-        .b_in       (mac_x_b),
-        .p_out      (mac_x_p),
-        .p_out_aux  (mac_x_p_aux)
-    );
-
-    modexpng_mac_array mac_array_y
-    (
-        .clk        (clk),
-        .ce         (mac_y_ce),
-        .ce_aux     (mac_y_ce_aux),
-        .clr        (mac_y_clr),
-        .clr_aux    (mac_y_clr_aux),
-        .casc_a     (mac_y_casc_a),
-        .casc_a_aux (mac_y_casc_a_aux),
-        .a_in       (mac_y_a),
-        .a_in_aux   (mac_y_a_aux),
-        .b_in       (mac_y_b),
-        .p_out      (mac_y_p),
-        .p_out_aux  (mac_y_p_aux)
-    );
-
-    genvar gen_z;
-   
-    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
-        begin : gen_xy_din
-            //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
-            //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
-            //gen_xy_dout
-            assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
-            
-            //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
-            //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
-        end
-    endgenerate
-    
-
-    //
-    // MAC Clock Enable Logic
-    //
-    reg mac_xy_ce_adv = 1'b0;
-    
-    always @(posedge clk or negedge rst_n)
-        //
-        if (rst_n == 1'b0) mac_xy_ce_adv <= 1'b0;
-        else case (fsm_state)
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_ce_adv <= 1'b1;
-            default:                          mac_xy_ce_adv <= 1'b0;
-        endcase
-    
-    always @(posedge clk or negedge rst_n)
-        //
-        if (rst_n == 1'b0) {mac_y_ce, mac_x_ce} <= 2'b00;
-        else {mac_y_ce, mac_x_ce} <= {2{mac_xy_ce_adv}};
-
-
-    //
-    // MAC Clear Logic
-    //
-    wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
-        calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev);
-    
-    reg [NUM_MULTS-1:0] mac_xy_clr_adv;
-    
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_clr_adv <= calc_mac_x_clear_square_value;
-            default:                          mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
-        endcase
-        
-    always @(posedge clk)
-        //
-        {mac_y_clr, mac_x_clr} <= {2{mac_xy_clr_adv}};
-
-
-    //
-    // MAC Cascade Logic
-    //
-    reg  [NUM_MULTS-2:0] mac_xy_casc_a_adv;
-
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
-            default:                          mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
-        endcase
-
-    always @(posedge clk)
-        //
-        {mac_y_casc_a, mac_x_casc_a} <= {2{mac_xy_casc_a_adv}};
-
-
-
-    //
-    // DOUT Mapping
-    //
-    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
-        begin : gen_xy_dout
-            assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
-            assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
-        end
-    endgenerate
-    
-    
-    //
-    // DOUT
-    //
-    reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
-    reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];
-    
-    
-    
-
-    integer int_z;
-    always @(posedge clk)
-        //
-        case (fsm_state)
-            //
-            FSM_STATE_LOAD_T1T2_3,
-            FSM_STATE_LOAD_NN_COEFF_3:
-                for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
-                    x_dout_reg[int_z] <= load_x_din;
-                    y_dout_reg[int_z] <= load_y_din;
-                end
-            //
-            default:
-                for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
-                    x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
-                    y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
-                end
-            //
-        endcase
-    
-
-
-    //
-    // FSM Process
-    //
-    always @(posedge clk or negedge rst_n)
-        //
-        if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
-        else               fsm_state <= fsm_state_next;
-
-
-    //
-    // FSM Transition Logic
-    //
-    always @* begin
-        //
-        fsm_state_next = FSM_STATE_IDLE;
-        //
-        case (fsm_state)
-            FSM_STATE_IDLE:                   fsm_state_next = ena                       ? fsm_state_after_idle             : FSM_STATE_IDLE;
-            
-            FSM_STATE_LOAD_T1T2_1:            fsm_state_next = FSM_STATE_LOAD_T1T2_2     ;
-            FSM_STATE_LOAD_T1T2_2:            fsm_state_next = FSM_STATE_LOAD_T1T2_3     ;
-            FSM_STATE_LOAD_T1T2_3:            fsm_state_next = load_t1t2_addr_done       ? FSM_STATE_LOAD_NN_COEFF_1        : FSM_STATE_LOAD_T1T2_1;
-            
-            FSM_STATE_LOAD_NN_COEFF_1:        fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2 ;
-            FSM_STATE_LOAD_NN_COEFF_2:        fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3 ;
-            FSM_STATE_LOAD_NN_COEFF_3:        fsm_state_next = load_nn_coeff_addr_done   ? FSM_STATE_STOP                   : FSM_STATE_LOAD_NN_COEFF_1;
-            
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next =                             FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done     ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next =                             FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done     ? fsm_state_after_mult_square      : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
-            
-            /*
-            FSM_STATE_TRIANGLE_COL_0_TRIG:  fsm_state_next =                        FSM_STATE_TRIANGLE_COL_0_BUSY  ;
-            FSM_STATE_TRIANGLE_COL_0_BUSY:  fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG  : FSM_STATE_TRIANGLE_COL_0_BUSY;
-            FSM_STATE_TRIANGLE_COL_N_TRIG:  fsm_state_next =                        FSM_STATE_TRIANGLE_COL_N_BUSY  ;
-            FSM_STATE_TRIANGLE_COL_N_BUSY:  fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle       : FSM_STATE_TRIANGLE_COL_N_BUSY;
-            
-            FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next =                        FSM_STATE_RECTANGLE_COL_0_BUSY ;
-            FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY;
-            FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next =                        FSM_STATE_RECTANGLE_COL_N_BUSY ; 
-            FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_rectangle      : FSM_STATE_RECTANGLE_COL_N_BUSY;
-            */
-            
-            FSM_STATE_STOP:                 fsm_state_next =                        FSM_STATE_IDLE                 ;
-
-        endcase
-        //
-    end
-
-
-    //
-    // Ready Output
-    //
-    reg rdy_reg = 1'b1;
-    assign rdy = rdy_reg;
-
-    always @(posedge clk or negedge rst_n)
-        //
-        if (rst_n == 1'b0)           rdy_reg <= 1'b1;
-        else case (fsm_state)
-            FSM_STATE_IDLE: if (ena) rdy_reg <= 1'b0;
-            FSM_STATE_STOP:          rdy_reg <= 1'b1;
-        endcase
-
-    function  [  NUM_MULTS-1:0] calc_mac_clear_square;
-        input [INDEX_WIDTH-4:0] col_index_delayed;
-        input [          3-1:0] x_din_addr_cnt_lower_delayed;
-        input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed;
-        begin
-            if (x_din_addr_cnt_upper_delayed == col_index_delayed)
-                case (x_din_addr_cnt_lower_delayed)
-                    3'b000: calc_mac_clear_square = 8'b00000001;
-                    3'b001: calc_mac_clear_square = 8'b00000010;
-                    3'b010: calc_mac_clear_square = 8'b00000100;
-                    3'b011: calc_mac_clear_square = 8'b00001000;
-                    3'b100: calc_mac_clear_square = 8'b00010000;
-                    3'b101: calc_mac_clear_square = 8'b00100000;
-                    3'b110: calc_mac_clear_square = 8'b01000000;
-                    3'b111: calc_mac_clear_square = 8'b10000000;
-                endcase
-            else
-                calc_mac_clear_square = {NUM_MULTS{1'b0}};
-        end
-    endfunction
-
- 
-endmodule
diff --git a/rtl/modexpng_parameters.vh b/rtl/modexpng_parameters.vh
deleted file mode 100644
index 77b57f3..0000000
--- a/rtl/modexpng_parameters.vh
+++ /dev/null
@@ -1,39 +0,0 @@
-//localparam WORD_WIDTH  = 17;
-//localparam MAC_WIDTH   = 47;
-
-//localparam BANK_ADDR_WIDTH = 3; // TODO: Replace everywhere!
-
-localparam [2:0] BANK_FAT_T1T2      = 3'd0; // BANK_FAT_*: 3-bit bank selectors, codes 0..7
-localparam [2:0] BANK_FAT_ABL       = 3'd1;
-localparam [2:0] BANK_FAT_ABH       = 3'd2;
-localparam [2:0] BANK_FAT_N         = 3'd3;
-localparam [2:0] BANK_FAT_ML        = 3'd4; // not needed
-localparam [2:0] BANK_FAT_MH        = 3'd5; // not needed
-localparam [2:0] BANK_FAT_EXT       = 3'd6; // 0 -> MH'
-localparam [2:0] BANK_FAT_UNUSED    = 3'd7; // not needed
-
-localparam [1:0] BANK_SLIM_T1T2     = 2'd0; // BANK_SLIM_*: 2-bit bank selectors, codes 0..3
-localparam [1:0] BANK_SLIM_N_COEFF  = 2'd1;
-localparam [1:0] BANK_SLIM_Q        = 2'd2;
-localparam [1:0] BANK_SLIM_EXT      = 2'd3; // 0 -> N_COEFF', 1 -> Q'
-
-
-//localparam BANK_Y_T2      = 3'd0;
-//localparam BANK_XY_T1T2   = 3'd0;
-
-//localparam BANK_XY_AB_LSB = 3'd1;
-//localparam BANK_XY_AB_MSB = 3'd2;
-
-//localparam BANK_X_N       = 3'd3;
-//localparam BANK_Y_N_COEFF = 3'd3;
-
-//localparam BANK_XY_M      = 3'd4;
-
-//localparam BANK_XY_Q_LSB  = 3'd5;
-//localparam BANK_XY_Q_MSB  = 3'd6;
-
-//localparam BANK_XY_AUX    = 3'd7;
-
-//localparam BANK_XY_ANY    = 3'bXXX;
-
-//localparam BANK_XY_AUX_ADDR_N_COEFF = 0;
diff --git a/rtl/modexpng_parameters_old.vh b/rtl/modexpng_parameters_old.vh
new file mode 100644
index 0000000..d30b751
--- /dev/null
+++ b/rtl/modexpng_parameters_old.vh
@@ -0,0 +1,40 @@
+
+//localparam WORD_WIDTH  = 17;
+//localparam MAC_WIDTH   = 47;
+
+localparam BANK_ADDR_WIDTH = 2; // TODO: Replace everywhere!
+
+localparam [1:0] BANK_WIDE_T1T2      = 2'd0; // BANK_WIDE_*: 2-bit selectors, all four codes defined
+localparam [1:0] BANK_WIDE_ABL       = 2'd1;
+localparam [1:0] BANK_WIDE_ABH       = 2'd2;
+localparam [1:0] BANK_WIDE_N         = 2'd3;
+
+localparam [1:0] BANK_RCMB_ML        = 2'd0; // BANK_RCMB_*: only codes 0..2 are defined
+localparam [1:0] BANK_RCMB_MH        = 2'd1;
+localparam [1:0] BANK_RCMB_EXT       = 2'd2; // 0 -> MH'
+
+localparam [1:0] BANK_NARROW_T1T2     = 2'd0; // BANK_NARROW_*: 2-bit selectors, all four codes defined
+localparam [1:0] BANK_NARROW_N_COEFF  = 2'd1;
+localparam [1:0] BANK_NARROW_Q        = 2'd2;
+localparam [1:0] BANK_NARROW_EXT      = 2'd3; // 0 -> N_COEFF', 1 -> Q'
+
+
+//localparam BANK_Y_T2      = 3'd0;
+//localparam BANK_XY_T1T2   = 3'd0;
+
+//localparam BANK_XY_AB_LSB = 3'd1;
+//localparam BANK_XY_AB_MSB = 3'd2;
+
+//localparam BANK_X_N       = 3'd3;
+//localparam BANK_Y_N_COEFF = 3'd3;
+
+//localparam BANK_XY_M      = 3'd4;
+
+//localparam BANK_XY_Q_LSB  = 3'd5;
+//localparam BANK_XY_Q_MSB  = 3'd6;
+
+//localparam BANK_XY_AUX    = 3'd7;
+
+//localparam BANK_XY_ANY    = 3'bXXX;
+
+//localparam BANK_XY_AUX_ADDR_N_COEFF = 0;
diff --git a/rtl/modexpng_parameters_x8.vh b/rtl/modexpng_parameters_x8_old.vh
similarity index 100%
rename from rtl/modexpng_parameters_x8.vh
rename to rtl/modexpng_parameters_x8_old.vh
diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v
deleted file mode 100644
index 957ba8e..0000000
--- a/rtl/modexpng_part_recombinator.v
+++ /dev/null
@@ -1,1128 +0,0 @@
-module modexpng_part_recombinator
-(
-    clk,
-    rdy,
-    fsm_state_next,
-    index_last,
-    dsp_x_ce_p, dsp_y_ce_p,
-    ena_x,   ena_y,
-    dsp_x_p, dsp_y_p,
-    col_index, col_index_last,
-    slim_bram_xy_addr, slim_bram_xy_bank,
-    rcmb_fat_bram_xy_bank,  rcmb_fat_bram_xy_addr,  rcmb_fat_bram_x_dout,  rcmb_fat_bram_y_dout,  rcmb_fat_bram_xy_dout_valid,
-    rcmb_slim_bram_xy_bank, rcmb_slim_bram_xy_addr, rcmb_slim_bram_x_dout, rcmb_slim_bram_y_dout, rcmb_slim_bram_xy_dout_valid
-);
-
-
-    //
-    // Headers
-    //
-    `include "../rtl/modexpng_mmm_fsm.vh"
-    `include "../rtl/modexpng_parameters.vh"
-    `include "../rtl/modexpng_parameters_x8.vh"
-
-
-    input                        clk;             // single clock for every register in this module
-    output                       rdy;             // driven by rdy_reg
-    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;  // selects the mode and the flag functions to use
-    input [7:0]                  index_last;      // compared with slim_bram_xy_addr in the *_msb functions
-    input                        dsp_x_ce_p;      // delayed by one cycle, then gates capture of dsp_x_p
-    input                        dsp_y_ce_p;      // delayed by one cycle (dsp_y_ce_p_dly1)
-    input                        ena_x;           // ena_x & ena_y together start a new operation:
-    input                        ena_y;           // rdy drops, clr flags are set, rcmb_mode is loaded
-    input  [9*47-1:0] dsp_x_p;                    // nine 47-bit words, unpacked into dsp_x_p_split
-    input  [9*47-1:0] dsp_y_p;                    // nine 47-bit words, unpacked into dsp_y_p_split
-    input  [     4:0] col_index;                  // compared with slim_bram_xy_addr[7:3]
-    input  [     4:0] col_index_last;             // used by the purge / bitmap_msb functions
-    input  [     7:0] slim_bram_xy_addr;          // [7:3] is matched to the column, [2:0] picks a slot
-    input  [     1:0] slim_bram_xy_bank;          // tested against BANK_SLIM_EXT
-
-    output [     2:0] rcmb_fat_bram_xy_bank;        // rcmb_fat_* outputs mirror the fat_bram_* registers
-    output [     7:0] rcmb_fat_bram_xy_addr;
-    output [    17:0] rcmb_fat_bram_x_dout;
-    output [    17:0] rcmb_fat_bram_y_dout;
-    output            rcmb_fat_bram_xy_dout_valid;
-
-    output [     2:0] rcmb_slim_bram_xy_bank;       // rcmb_slim_* outputs mirror the slim_bram_* registers
-    output [     7:0] rcmb_slim_bram_xy_addr;
-    output [    17:0] rcmb_slim_bram_x_dout;
-    output [    17:0] rcmb_slim_bram_y_dout;
-    output            rcmb_slim_bram_xy_dout_valid;
-
-
-    //
-    // Latches
-    //
-    reg  [1*47-1:0] dsp_x_p_latch[0:8];   // nine 47-bit holding registers for the x products
-    reg  [1*47-1:0] dsp_y_p_latch[0:8];   // same for y
-
-
-    //
-    // Mapping
-    // (unpacks the flat dsp_*_p buses into arrays of 47-bit words: word z is bits [47*z +: 47])
-    wire [46:0] dsp_x_p_split[0:8];
-    wire [46:0] dsp_y_p_split[0:8];
-    
-    genvar z;
-    generate for (z=0; z<(NUM_MULTS+1); z=z+1)   // NOTE(review): arrays hold 9 words, presumably NUM_MULTS == 8 -- confirm
-        begin : gen_dsp_xy_p_split
-            assign dsp_x_p_split[z] = dsp_x_p[47*z+:47];
-            assign dsp_y_p_split[z] = dsp_y_p[47*z+:47];
-        end
-    endgenerate
-
-
-    //
-    // Delays
-    //
-    reg dsp_x_ce_p_dly1 = 1'b0;
-    reg dsp_y_ce_p_dly1 = 1'b0;
-
-    // One-cycle delayed copies of the two DSP product clock enables.
-    always @(posedge clk) begin
-        dsp_x_ce_p_dly1 <= dsp_x_ce_p;
-        dsp_y_ce_p_dly1 <= dsp_y_ce_p;
-    end
-
-
-    //
-    // Registers
-    //
-    
-    // valid / aux (working flags, loaded from advance stage 1)
-    reg       x_valid_lsb = 1'b0;
-    reg       y_valid_lsb = 1'b0;
-    reg       x_aux_lsb   = 1'b0;
-    reg       y_aux_lsb   = 1'b0;
-    reg       x_valid_msb = 1'b0;
-    reg       y_valid_msb = 1'b0;
-    
-    // bitmap
-    reg [7:0] x_bitmap_lsb = {8{1'b0}};
-    reg [7:0] y_bitmap_lsb = {8{1'b0}};
-    reg [7:0] x_bitmap_msb = {8{1'b0}};
-    reg [7:0] y_bitmap_msb = {8{1'b0}};
-    
-    // index
-    reg [2:0] x_index_lsb = 3'dX;
-    reg [2:0] y_index_lsb = 3'dX;
-    
-    // purge
-    reg       x_purge_lsb = 1'b0;
-    reg       y_purge_lsb = 1'b0;
-    reg       x_purge_msb = 1'b0;
-    reg       y_purge_msb = 1'b0;
-    
-    // valid - latch
-    reg       x_valid_latch_lsb = 1'b0;
-    reg       y_valid_latch_lsb = 1'b0;
-    
-    // aux - latch
-    reg       x_aux_latch_lsb = 1'b0;
-    reg       y_aux_latch_lsb = 1'b0;
-    
-    // bitmap - latch
-    reg [7:0] x_bitmap_latch_lsb = {8{1'b0}};
-    reg [7:0] y_bitmap_latch_lsb = {8{1'b0}};
-    reg [7:0] x_bitmap_latch_msb = {8{1'b0}};
-    reg [7:0] y_bitmap_latch_msb = {8{1'b0}};
-
-    // index - latch
-    reg [2:0] x_index_latch_lsb = 3'dX;
-    reg [2:0] y_index_latch_lsb = 3'dX;
-    
-    // purge - latch
-    reg       x_purge_latch_lsb = 1'b0;
-    reg       y_purge_latch_lsb = 1'b0;
-    reg       x_purge_latch_msb = 1'b0;
-    reg       y_purge_latch_msb = 1'b0;
-
-    // advance pipeline, stages 1..6: stage 6 is the entry, stage 1 feeds the working flags
-    reg       xy_valid_lsb_adv[1:6];
-    reg       xy_valid_msb_adv[1:6];
-    reg       xy_aux_lsb_adv[1:6];
-    reg [7:0] xy_bitmap_lsb_adv[1:6];
-    reg [7:0] xy_bitmap_msb_adv[1:6];
-    reg [2:0] xy_index_lsb_adv[1:6];
-    reg [2:0] xy_index_msb_adv[1:6];
-    reg       xy_purge_lsb_adv[1:6];
-    reg       xy_purge_msb_adv[1:6];
-    
-    reg [1:0] rcmb_mode;   // 0 = none, 1 = square, 2 = triangle, 3 = rectangle
-       
-    always @(posedge clk)
-       // rcmb_mode is only updated in the cycle where both enables are high
-       if (ena_x && ena_y)
-           // mode code is derived from the state the FSM is about to enter
-           case (fsm_state_next)
-               FSM_STATE_MULT_SQUARE_COL_0_BUSY:        rcmb_mode <= 2'd1;
-               FSM_STATE_MULT_TRIANGLE_COL_0_BUSY:      rcmb_mode <= 2'd2;
-               FSM_STATE_MULT_RECTANGLE_COL_0_BUSY:     rcmb_mode <= 2'd3;
-               default:                                 rcmb_mode <= 2'd0;
-           endcase
-
-               
-    integer i;                                  // shared loop index for this initial block and the clocked for-loops
-    initial for (i=1; i<=6; i=i+1) begin        // cover every stage of the [1:6] arrays, entry stage 6 included
-        xy_valid_lsb_adv[i] = 1'b0;             // flags start inactive so no X is shifted towards stage 1
-        xy_valid_msb_adv[i] = 1'b0;
-        xy_aux_lsb_adv[i] = 1'b0;
-        xy_bitmap_lsb_adv[i] = {8{1'b0}};
-        xy_bitmap_msb_adv[i] = {8{1'b0}};
-        xy_index_lsb_adv[i] = 3'dX;             // indices are deliberately left X until computed
-        xy_index_msb_adv[i] = 3'dX;
-        xy_purge_lsb_adv[i] = 1'b0;
-        xy_purge_msb_adv[i] = 1'b0;
-    end
-    
-    function        calc_square_valid_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // 1 when address bits [7:3] equal the column index, otherwise 0
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                calc_square_valid_lsb = 1'b1;
-            else
-                calc_square_valid_lsb = 1'b0;
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-    
-    function        calc_triangle_valid_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // same test as the square variant: address bits [7:3] must equal the column index
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                calc_triangle_valid_lsb = 1'b1;
-            else
-                calc_triangle_valid_lsb = 1'b0;
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-
-    function        calc_triangle_aux_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [1:0] slim_bram_xy_bank_value;
-        begin
-            // aux flag depends on the bank only: 1 when the bank is BANK_SLIM_EXT
-            if (slim_bram_xy_bank_value == BANK_SLIM_EXT)
-                calc_triangle_aux_lsb = 1'b1;
-            else
-                calc_triangle_aux_lsb = 1'b0;
-            // disabled address-based variant, kept as found:
-            //if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                //calc_triangle_aux_lsb = 1'b1;
-            //else
-                //calc_triangle_aux_lsb = 1'b0;
-            // the index and address inputs are therefore unused in the active code
-        end
-    endfunction
-    
-    function        calc_rectangle_valid_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [1:0] slim_bram_xy_bank_value;
-        begin
-            // column match AND bank other than BANK_SLIM_EXT gives 1, anything else 0
-            if (slim_bram_xy_addr_value[7:3] == col_index_value) 
-                calc_rectangle_valid_lsb = slim_bram_xy_bank_value != BANK_SLIM_EXT;
-            else
-                calc_rectangle_valid_lsb = 1'b0;
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-    
-    function  [7:0] calc_square_bitmap_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // on a column match return a one-hot mask selected by address bits [2:0]
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                //
-                case (slim_bram_xy_addr_value[2:0])
-                    3'b000: calc_square_bitmap_lsb = 8'b00000001;
-                    3'b001: calc_square_bitmap_lsb = 8'b00000010;
-                    3'b010: calc_square_bitmap_lsb = 8'b00000100;
-                    3'b011: calc_square_bitmap_lsb = 8'b00001000;
-                    3'b100: calc_square_bitmap_lsb = 8'b00010000;
-                    3'b101: calc_square_bitmap_lsb = 8'b00100000;
-                    3'b110: calc_square_bitmap_lsb = 8'b01000000;
-                    3'b111: calc_square_bitmap_lsb = 8'b10000000;
-                endcase
-                // no match: empty mask
-            else
-                calc_square_bitmap_lsb = {8{1'b0}};
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-    
-    function  [7:0] calc_triangle_bitmap_lsb;
-           input [4:0] col_index_value;
-           input [4:0] col_index_last_value;
-           input [7:0] slim_bram_xy_addr_value;
-           begin
-               // on a column match return a one-hot mask selected by address bits [2:0]
-               if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                   //
-                   case (slim_bram_xy_addr_value[2:0])
-                       3'b000: calc_triangle_bitmap_lsb = 8'b00000001;
-                       3'b001: calc_triangle_bitmap_lsb = 8'b00000010;
-                       3'b010: calc_triangle_bitmap_lsb = 8'b00000100;
-                       3'b011: calc_triangle_bitmap_lsb = 8'b00001000;
-                       3'b100: calc_triangle_bitmap_lsb = 8'b00010000;
-                       3'b101: calc_triangle_bitmap_lsb = 8'b00100000;
-                       3'b110: calc_triangle_bitmap_lsb = 8'b01000000;
-                       3'b111: calc_triangle_bitmap_lsb = 8'b10000000;
-                   endcase
-                   // no match: empty mask
-               else
-                   calc_triangle_bitmap_lsb = {8{1'b0}};
-               // col_index_last_value is accepted but not used here
-           end
-       endfunction
-
-    function  [7:0] calc_rectangle_bitmap_lsb;
-           input [4:0] col_index_value;
-           input [4:0] col_index_last_value;
-           input [7:0] slim_bram_xy_addr_value;
-           input [1:0] slim_bram_xy_bank_value;
-           begin
-               // one-hot mask from address bits [2:0], but only for a column match outside BANK_SLIM_EXT
-               if ((slim_bram_xy_addr_value[7:3] == col_index_value) && (slim_bram_xy_bank_value != BANK_SLIM_EXT))
-                   //
-                   case (slim_bram_xy_addr_value[2:0])
-                       3'b000: calc_rectangle_bitmap_lsb = 8'b00000001;
-                       3'b001: calc_rectangle_bitmap_lsb = 8'b00000010;
-                       3'b010: calc_rectangle_bitmap_lsb = 8'b00000100;
-                       3'b011: calc_rectangle_bitmap_lsb = 8'b00001000;
-                       3'b100: calc_rectangle_bitmap_lsb = 8'b00010000;
-                       3'b101: calc_rectangle_bitmap_lsb = 8'b00100000;
-                       3'b110: calc_rectangle_bitmap_lsb = 8'b01000000;
-                       3'b111: calc_rectangle_bitmap_lsb = 8'b10000000;
-                   endcase
-                   // otherwise: empty mask
-               else
-                   calc_rectangle_bitmap_lsb = {8{1'b0}};
-               // col_index_last_value is accepted but not used here
-           end
-       endfunction
-       
-    function  [2:0] calc_square_index_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // on a column match the result equals address bits [2:0]
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                //
-                case (slim_bram_xy_addr_value[2:0])
-                    3'b000: calc_square_index_lsb = 3'd0;
-                    3'b001: calc_square_index_lsb = 3'd1;
-                    3'b010: calc_square_index_lsb = 3'd2;
-                    3'b011: calc_square_index_lsb = 3'd3;
-                    3'b100: calc_square_index_lsb = 3'd4;
-                    3'b101: calc_square_index_lsb = 3'd5;
-                    3'b110: calc_square_index_lsb = 3'd6;
-                    3'b111: calc_square_index_lsb = 3'd7;
-                endcase
-                // no match: the index is explicitly X
-            else
-                calc_square_index_lsb = 3'dX;
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-
-    function  [2:0] calc_triangle_index_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // on a column match the result equals address bits [2:0]
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                //
-                case (slim_bram_xy_addr_value[2:0])
-                    3'b000: calc_triangle_index_lsb = 3'd0;
-                    3'b001: calc_triangle_index_lsb = 3'd1;
-                    3'b010: calc_triangle_index_lsb = 3'd2;
-                    3'b011: calc_triangle_index_lsb = 3'd3;
-                    3'b100: calc_triangle_index_lsb = 3'd4;
-                    3'b101: calc_triangle_index_lsb = 3'd5;
-                    3'b110: calc_triangle_index_lsb = 3'd6;
-                    3'b111: calc_triangle_index_lsb = 3'd7;
-                endcase
-                // no match: the index is explicitly X
-            else
-                calc_triangle_index_lsb = 3'dX;
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-
-    function  [2:0] calc_rectangle_index_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [1:0] slim_bram_xy_bank_value;
-        begin
-            // address bits [2:0], but only for a column match outside BANK_SLIM_EXT
-            if ((slim_bram_xy_addr_value[7:3] == col_index_value) && (slim_bram_xy_bank_value != BANK_SLIM_EXT))
-                //
-                case (slim_bram_xy_addr_value[2:0])
-                    3'b000: calc_rectangle_index_lsb = 3'd0;
-                    3'b001: calc_rectangle_index_lsb = 3'd1;
-                    3'b010: calc_rectangle_index_lsb = 3'd2;
-                    3'b011: calc_rectangle_index_lsb = 3'd3;
-                    3'b100: calc_rectangle_index_lsb = 3'd4;
-                    3'b101: calc_rectangle_index_lsb = 3'd5;
-                    3'b110: calc_rectangle_index_lsb = 3'd6;
-                    3'b111: calc_rectangle_index_lsb = 3'd7;
-                endcase
-                // otherwise: the index is explicitly X
-            else
-                calc_rectangle_index_lsb = 3'dX;
-            // col_index_last_value is accepted but not used here
-        end
-    endfunction
-    
-    function        calc_square_purge_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // 1 only when address bits [7:3] equal both the current and the last column index
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                calc_square_purge_lsb = slim_bram_xy_addr_value[7:3] == col_index_last_value;
-            else
-                calc_square_purge_lsb = 1'b0;
-            //
-        end
-    endfunction
-
-    function        calc_rectangle_purge_lsb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        begin
-            // 1 only when address bits [7:3] equal both the current and the last column index
-            if (slim_bram_xy_addr_value[7:3] == col_index_value)
-                calc_rectangle_purge_lsb = slim_bram_xy_addr_value[7:3] == col_index_last_value;
-            else
-                calc_rectangle_purge_lsb = 1'b0;
-            // unlike the other rectangle functions, the bank is not an input here
-        end
-    endfunction
-
-    function        calc_square_valid_msb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [7:0] index_last_value;
-        begin
-            // 1 when the full 8-bit address equals index_last_value, otherwise 0
-            if (slim_bram_xy_addr_value == index_last_value)
-                calc_square_valid_msb = 1'b1;
-            else
-                calc_square_valid_msb = 1'b0;
-            // both column index inputs are accepted but not used here
-        end
-    endfunction
-
-    function        calc_rectangle_valid_msb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [1:0] slim_bram_xy_bank_value;
-        input [7:0] index_last_value;
-        begin
-            // 1 only for address 1 in bank BANK_SLIM_EXT
-            if ((slim_bram_xy_addr_value == 8'd1) && (slim_bram_xy_bank_value == BANK_SLIM_EXT))
-                calc_rectangle_valid_msb = 1'b1;
-            else
-                calc_rectangle_valid_msb = 1'b0;
-            // the column index and index_last inputs are accepted but not used here
-        end
-    endfunction
-    
-    function  [7:0] calc_square_bitmap_msb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [7:0] index_last_value;
-        begin
-            // at address index_last_value: bits 6:0 are all set, bit 7 is set unless this is the last column
-            if (slim_bram_xy_addr_value == index_last_value) begin
-                calc_square_bitmap_msb[7] = col_index_value != col_index_last_value;
-                calc_square_bitmap_msb[6:0] = 7'b1111111;
-            end else
-                calc_square_bitmap_msb[7:0] = 8'b00000000;
-            // any other address yields an empty mask
-        end
-    endfunction
-
-    function  [7:0] calc_rectangle_bitmap_msb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [1:0] slim_bram_xy_bank_value;
-        input [7:0] index_last_value;
-        begin
-            // all eight bits set for address 1 in bank BANK_SLIM_EXT, empty mask otherwise
-            if ((slim_bram_xy_addr_value == 8'd1) && (slim_bram_xy_bank_value == BANK_SLIM_EXT)) begin
-                calc_rectangle_bitmap_msb[7:0] = 8'b11111111;
-            end else
-                calc_rectangle_bitmap_msb[7:0] = 8'b00000000;
-            // the column index and index_last inputs are accepted but not used here
-        end
-    endfunction
-
-    function        calc_square_purge_msb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [7:0] index_last_value;
-        begin
-            // 1 when the address equals index_last_value and the current column is the last one
-            if (slim_bram_xy_addr_value == index_last_value)
-                calc_square_purge_msb = col_index_value == col_index_last_value;
-            else
-                calc_square_purge_msb = 1'b0;
-            //
-        end
-    endfunction
-
-    function        calc_rectangle_purge_msb;
-        input [4:0] col_index_value;
-        input [4:0] col_index_last_value;
-        input [7:0] slim_bram_xy_addr_value;
-        input [1:0] slim_bram_xy_bank_value;
-        input [7:0] index_last_value;
-        begin
-            // 1 for address 1 in bank BANK_SLIM_EXT while the current column is the last one
-            if ((slim_bram_xy_addr_value == 8'd1) && (slim_bram_xy_bank_value == BANK_SLIM_EXT))
-                calc_rectangle_purge_msb = col_index_value == col_index_last_value;
-            else
-                calc_rectangle_purge_msb = 1'b0;
-            // index_last_value is accepted but not used here
-        end
-    endfunction
-
-    
-    reg         recomb_lsb_ce = 1'b0;            // follows x_valid_latch_lsb by one cycle
-    reg         recomb_lsb_ce_aux;               // follows x_aux_latch_lsb by one cycle
-    reg  [ 2:0] recomb_lsb_ce_purge = 3'b000;    // right-shifting run of extra enables, bit 0 is the active one
-    wire        recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_aux | recomb_lsb_ce_purge[0];
-    reg         recomb_lsb_clr;                  // set on ena_x & ena_y, released after the first recomb_lsb_ce
-
-    reg  [46:0] recomb_lsb_din;                  // 47-bit word fed to the LSB recombinator
-    wire [15:0] recomb_lsb_dout;                 // 16-bit result of the LSB recombinator
-
-    reg         recomb_msb_ce = 1'b0;            // follows x_bitmap_latch_msb[0] by one cycle
-    reg  [ 1:0] recomb_msb_ce_purge = 2'b00;     // right-shifting run of extra enables, bit 0 is the active one
-    wire        recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0];
-    reg         recomb_msb_clr;                  // set on ena_x & ena_y, released after the first recomb_msb_ce
-    
-    reg  [46:0] recomb_msb_din;                  // 47-bit word fed to the MSB recombinator
-    wire [15:0] recomb_msb_dout;                 // 16-bit result of the MSB recombinator
-    
-    modexpng_recombinator_block recomb_x_lsb     // LSB path; only x-side instances appear in this part of the file
-    (
-        .clk    (clk),
-        .ce     (recomb_lsb_ce_combined),        // valid, aux or purge enable
-        .clr    (recomb_lsb_clr),
-        .din    (recomb_lsb_din),
-        .dout   (recomb_lsb_dout)
-    );
-
-    modexpng_recombinator_block recomb_x_msb     // MSB path
-    (
-        .clk    (clk),
-        .ce     (recomb_msb_ce_combined),        // bitmap-driven or purge enable
-        .clr    (recomb_msb_clr),
-        .din    (recomb_msb_din),
-        .dout   (recomb_msb_dout)
-    );
-
-    always @(posedge clk) begin
-        // clock enables follow the latched valid / aux / bitmap flags by one cycle
-        recomb_lsb_ce <= x_valid_latch_lsb;
-        recomb_lsb_ce_aux <= x_aux_latch_lsb;
-        recomb_msb_ce <= x_bitmap_latch_msb[0];
-        // LSB purge: load 3'b111, then shift right, so bit 0 stays high for three cycles
-        if (x_purge_latch_lsb)
-            recomb_lsb_ce_purge <= 3'b111;
-        else
-            recomb_lsb_ce_purge <= {1'b0, recomb_lsb_ce_purge[2:1]};
-        // MSB purge: armed when bit 0 of the MSB bitmap is set and bit 1 is clear; bit 0 then stays high for two cycles
-        if (x_purge_latch_msb && x_bitmap_latch_msb[0] && !x_bitmap_latch_msb[1])
-            recomb_msb_ce_purge <= 2'b11;   // non-blocking: this reg is read by logic sampled in other clocked blocks
-        else
-            recomb_msb_ce_purge <= {1'b0, recomb_msb_ce_purge[1]};
-        //
-    end
-
-
-    always @(posedge clk)
-        // both clr flags are raised at start and each is dropped once its clock enable has been seen
-        if (ena_x & ena_y) begin
-            recomb_lsb_clr <= 1'b1;
-            recomb_msb_clr <= 1'b1;
-        end else begin
-            if (recomb_lsb_ce) recomb_lsb_clr <= 1'b0;
-            if (recomb_msb_ce) recomb_msb_clr <= 1'b0;
-        end
-
-    always @(posedge clk)
-        // LSB input mux: indexed product slot when valid, slot 8 for aux, zero otherwise
-        if (x_valid_latch_lsb)
-            recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb];
-        else if (x_aux_latch_lsb)
-            recomb_lsb_din <= dsp_x_p_latch[8];
-        else
-            recomb_lsb_din <= {47{1'b0}};
-
-    always @(posedge clk)
-        // MSB input: always taken from slot 0 while bit 0 of the MSB bitmap latch is set, zero otherwise
-        if (x_bitmap_latch_msb[0])
-            recomb_msb_din <= dsp_x_p_latch[0];
-        else
-            recomb_msb_din <= {47{1'b0}};
-
-
-    always @(posedge clk)
-        // compute the entry stage (index 6) of the advance pipeline from the state the FSM is about to enter
-        case (fsm_state_next)
-            // square states
-            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
-            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
-            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
-                // LSB flags; aux is never set in this mode
-                xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr);
-                xy_aux_lsb_adv   [6] <= 1'b0;
-                xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr);
-                xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr);
-                xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
-                // MSB flags
-                xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
-                xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, index_last);
-                xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
-                //
-            end
-            // triangle states
-            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: begin
-                // LSB flags; aux comes from the bank, purge is never set in this mode
-                xy_valid_lsb_adv [6] <= calc_triangle_valid_lsb (col_index, col_index_last, slim_bram_xy_addr); /// bank
-                xy_aux_lsb_adv   [6] <= calc_triangle_aux_lsb   (col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank);
-                xy_bitmap_lsb_adv[6] <= calc_triangle_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr); //! bank
-                xy_index_lsb_adv [6] <= calc_triangle_index_lsb (col_index, col_index_last, slim_bram_xy_addr); // ! bank!!!
-                xy_purge_lsb_adv [6] <= 1'b0;
-                // MSB flags are all inactive in this mode
-                xy_valid_msb_adv [6] <= 1'b0;
-                xy_bitmap_msb_adv[6] <= {8{1'b0}};
-                xy_purge_msb_adv [6] <= 1'b0;
-                //
-            end
-            // rectangle states
-            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
-            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
-            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: begin
-                // LSB flags; valid/bitmap/index also look at the bank, aux is never set
-                xy_valid_lsb_adv [6] <= calc_rectangle_valid_lsb (col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank);
-                xy_aux_lsb_adv   [6] <= 1'b0;
-                xy_bitmap_lsb_adv[6] <= calc_rectangle_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank);
-                xy_index_lsb_adv [6] <= calc_rectangle_index_lsb (col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank);
-                xy_purge_lsb_adv [6] <= calc_rectangle_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
-                // MSB flags
-                xy_valid_msb_adv [6] <= calc_rectangle_valid_msb (col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank, index_last);
-                xy_bitmap_msb_adv[6] <= calc_rectangle_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank, index_last);
-                xy_purge_msb_adv [6] <= calc_rectangle_purge_msb (col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank, index_last);
-                //
-            end
-            // every other state feeds inactive flags into the pipeline
-            default: begin
-                //
-                xy_valid_lsb_adv [6] <= 1'b0;
-                xy_aux_lsb_adv   [6] <= 1'b0;
-                xy_bitmap_lsb_adv[6] <= {8{1'b0}};
-                xy_index_lsb_adv [6] <= 3'dX;
-                xy_purge_lsb_adv [6] <= 1'b0;
-                //
-                xy_valid_msb_adv [6] <= 1'b0;
-                xy_bitmap_msb_adv[6] <= {8{1'b0}};
-                xy_purge_msb_adv [6] <= 1'b0;
-                //
-            end
-            // NOTE(review): xy_index_msb_adv[6] is not assigned in any branch of this case
-        endcase
-
-
-    always @(posedge clk) begin
-        // stage 1 of the advance pipeline drives both the x and the y working LSB flags
-        {y_valid_lsb,  x_valid_lsb}  <= {2{xy_valid_lsb_adv [1]}};
-        {y_aux_lsb,    x_aux_lsb}    <= {2{xy_aux_lsb_adv   [1]}};
-        {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}};
-        {y_index_lsb,  x_index_lsb}  <= {2{xy_index_lsb_adv [1]}};
-        {y_purge_lsb,  x_purge_lsb}  <= {2{xy_purge_lsb_adv [1]}};
-        // the *_latch_lsb copies trail the working flags by one more cycle
-        {y_valid_latch_lsb,  x_valid_latch_lsb}  <= {y_valid_lsb,  x_valid_lsb};
-        {y_aux_latch_lsb,    x_aux_latch_lsb}    <= {y_aux_lsb,    x_aux_lsb};
-        {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb};
-        {y_index_latch_lsb,  x_index_latch_lsb}  <= {y_index_lsb,  x_index_lsb};
-        {y_purge_latch_lsb,  x_purge_latch_lsb}  <= {y_purge_lsb,  x_purge_lsb};
-        // working MSB flags, also from stage 1
-        {y_valid_msb,  x_valid_msb}  <= {2{xy_valid_msb_adv[1]}};
-        {y_bitmap_msb, x_bitmap_msb} <= {2{xy_bitmap_msb_adv[1]}};
-        {y_purge_msb,  x_purge_msb}  <= {2{xy_purge_msb_adv[1]}};
-        // x MSB bitmap latch: load on x_valid_msb, otherwise shift right by one bit per cycle
-        if (x_valid_msb) begin
-            x_bitmap_latch_msb <= x_bitmap_msb;
-            x_purge_latch_msb  <= x_purge_msb;
-        end else begin
-            x_bitmap_latch_msb <= {1'b0, x_bitmap_latch_msb[7:1]};
-        end
-        // NOTE(review): the y-side MSB latches are not updated in this block
-        // advance pipeline: stage i takes stage i+1 for i = 1..5
-        for (i=1; i<6; i=i+1) begin
-            xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
-            xy_aux_lsb_adv   [i] <= xy_aux_lsb_adv   [i+1];
-            xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
-            xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
-            xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
-            //
-            xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
-            xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
-            xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
-        end
-        // xy_index_msb_adv is not shifted by the loop above
-    end
-
-    always @(posedge clk)
-        // slots 0..7 either shift down by one (priority) or capture fresh products
-        if (x_bitmap_latch_msb[1])   // only shift 7 times
-            // shift: slot i takes slot i+1, slot 7 is refilled with X
-            for (i=0; i<8; i=i+1)            
-                if (i < 7)
-                    dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
-                else
-                    dsp_x_p_latch[i] <= {47{1'bX}};
-            //
-        else if (dsp_x_ce_p_dly1) begin
-            // capture, one cycle after dsp_x_ce_p: only slots flagged in a bitmap are loaded
-            for (i=0; i<8; i=i+1)
-                // LSB bitmap first, MSB bitmap only while x_valid_msb is set
-                if (x_bitmap_lsb[i])
-                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
-                else if (x_valid_msb && x_bitmap_msb[i])
-                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
-            // slot 8 is captured only when the aux flag is set
-            if (x_aux_lsb)
-                dsp_x_p_latch[8] <= dsp_x_p_split[8];
-            //
-        end
-
-    reg recomb_x_msb_dout_valid = 1'b0;
-    reg recomb_x_lsb_dout_valid = 1'b0;
-
-    // Output-valid strobes: the combined clock enables delayed by one cycle.
-    always @(posedge clk)
-        {recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid} <=
-            {recomb_msb_ce_combined, recomb_lsb_ce_combined};
-        
-
-    reg [ 2:0] fat_bram_xy_bank_reg;               // fat_bram_* registers drive the rcmb_fat_* outputs
-    reg [ 7:0] fat_bram_xy_addr_reg;
-    reg [17:0] fat_bram_x_dout_reg;
-    reg [17:0] fat_bram_y_dout_reg;
-    reg        fat_bram_xy_dout_valid_reg = 1'b0;
-
-    reg [ 2:0] slim_bram_xy_bank_reg;              // slim_bram_* registers drive the rcmb_slim_* outputs
-    reg [ 7:0] slim_bram_xy_addr_reg;
-    reg [17:0] slim_bram_x_dout_reg;
-    reg [17:0] slim_bram_y_dout_reg;
-    reg        slim_bram_xy_dout_valid_reg = 1'b0;
-
-    reg [ 7:0] bram_xy_cnt_lsb;
-    reg [ 7:0] bram_xy_cnt_msb;
-    
-    reg        bram_xy_cnt_lsb_wrapped;
-    reg        bram_xy_cnt_msb_wrapped;
-
-    reg [15:0] recomb_msb_dout_carry_0;            // two-deep history, written by shift_recomb_msb_dout_carry
-    reg [15:0] recomb_msb_dout_carry_1;
-    
-    reg [15:0] recomb_msb_dout_delay_0;            // three-deep delay lines, written by advance_recomb_msb_dout_delay
-    reg [15:0] recomb_msb_dout_delay_1;
-    reg [15:0] recomb_msb_dout_delay_2;
-    
-    reg [ 7:0] recomb_msb_cnt_delay_0 = 8'd0;
-    reg [ 7:0] recomb_msb_cnt_delay_1 = 8'd0;
-    reg [ 7:0] recomb_msb_cnt_delay_2 = 8'd0;
-
-    reg        recomb_msb_flag_delay_0;
-    reg        recomb_msb_flag_delay_1;
-    reg        recomb_msb_flag_delay_2;
-
-    assign rcmb_fat_bram_xy_bank       = fat_bram_xy_bank_reg;   // outputs are plain copies of the registers
-    assign rcmb_fat_bram_xy_addr       = fat_bram_xy_addr_reg;
-    assign rcmb_fat_bram_x_dout        = fat_bram_x_dout_reg;
-    assign rcmb_fat_bram_y_dout        = fat_bram_y_dout_reg;
-    assign rcmb_fat_bram_xy_dout_valid = fat_bram_xy_dout_valid_reg;
-
-    assign rcmb_slim_bram_xy_bank       = slim_bram_xy_bank_reg;  // same for the slim side
-    assign rcmb_slim_bram_xy_addr       = slim_bram_xy_addr_reg;
-    assign rcmb_slim_bram_x_dout        = slim_bram_x_dout_reg;
-    assign rcmb_slim_bram_y_dout        = slim_bram_y_dout_reg;
-    assign rcmb_slim_bram_xy_dout_valid = slim_bram_xy_dout_valid_reg;
-    
-    reg rdy_reg = 1'b1;
-    reg rdy_adv = 1'b1;
-    
-    assign rdy = rdy_reg;
-    
-    
-    always @(posedge clk)
-        //
-        if (ena_x & ena_y)
-            rdy_reg <= 1'b0;
-        else
-            rdy_reg <= rdy_adv;
-
-            
-    task advance_recomb_msb_dout_delay;
-        input [15:0] dout;
-        input [ 7:0] cnt;
-        input        flag;
-        begin
-            //
-            recomb_msb_dout_delay_0 <= dout;
-            recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
-            recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
-            //
-            recomb_msb_cnt_delay_0 <= cnt;
-            recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
-            recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
-            //
-            recomb_msb_flag_delay_0 <= flag;
-            recomb_msb_flag_delay_1 <= recomb_msb_flag_delay_0;
-            recomb_msb_flag_delay_2 <= recomb_msb_flag_delay_1;
-            //
-        end
-    endtask
-         
-    task shift_recomb_msb_dout_carry;
-        input [15:0] dout;
-        begin
-            recomb_msb_dout_carry_0 <= dout;
-            recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
-        end
-    endtask
-    
-    task _update_fat_bram_regs;
-        input [ 2:0] bank;
-        input [ 7:0] addr;
-        input [17:0] dout_x;
-        input [17:0] dout_y;
-        input        valid;
-        begin
-            fat_bram_xy_bank_reg       <= bank;
-            fat_bram_xy_addr_reg       <= addr;
-            fat_bram_x_dout_reg        <= dout_x;
-            fat_bram_y_dout_reg        <= dout_y;
-            fat_bram_xy_dout_valid_reg <= valid;
-        end
-    endtask
-    
-    task _update_slim_bram_regs;
-        input [ 2:0] bank;
-        input [ 7:0] addr;
-        input [17:0] dout_x;
-        input [17:0] dout_y;
-        input        valid;
-        begin
-            slim_bram_xy_bank_reg       <= bank;
-            slim_bram_xy_addr_reg       <= addr;
-            slim_bram_x_dout_reg        <= dout_x;
-            slim_bram_y_dout_reg        <= dout_y;
-            slim_bram_xy_dout_valid_reg <= valid;
-        end
-    endtask
-            
-    task set_fat_bram_regs;
-        input [ 2:0] bank;
-        input [ 7:0] addr;
-        input [17:0] dout_x;
-        input [17:0] dout_y;
-        begin
-            _update_fat_bram_regs(bank, addr, dout_x, dout_y, 1'b1);
-        end
-    endtask
-    
-    task set_slim_bram_regs;
-        input [ 2:0] bank;
-        input [ 7:0] addr;
-        input [17:0] dout_x;
-        input [17:0] dout_y;
-        begin
-            _update_slim_bram_regs(bank, addr, dout_x, dout_y, 1'b1);
-        end
-    endtask
-    
-    task clear_fat_bram_regs;
-        begin
-            _update_fat_bram_regs(3'bXXX, 8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0);
-        end
-    endtask
-
-    task clear_slim_bram_regs;
-        begin
-            _update_slim_bram_regs(3'bXXX, 8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0);
-        end
-    endtask
-    
-    task _set_bram_cnt_lsb;
-        input [7:0] cnt;
-        input       wrapped;
-        begin
-            bram_xy_cnt_lsb <= cnt;
-            bram_xy_cnt_lsb_wrapped <= wrapped;
-        end
-    endtask
-    
-    task _set_bram_cnt_msb;
-        input [7:0] cnt;
-        input       wrapped;
-        begin
-            bram_xy_cnt_msb <= cnt;
-            bram_xy_cnt_msb_wrapped <= wrapped;
-        end
-    endtask    
-
-    task inc_bram_cnt_lsb;
-        begin
-            if (bram_xy_cnt_lsb == index_last)
-                _set_bram_cnt_lsb(8'd0, 1'b1);
-            else
-                _set_bram_cnt_lsb(bram_xy_cnt_lsb + 1'b1, bram_xy_cnt_lsb_wrapped);
-        end
-    endtask
-    
-    task inc_bram_cnt_msb;
-        begin
-            if (bram_xy_cnt_msb == index_last)
-                _set_bram_cnt_msb(8'd0, 1'b1);
-            else
-                _set_bram_cnt_msb(bram_xy_cnt_msb + 1'b1, bram_xy_cnt_msb_wrapped);
-        end
-    endtask
-    
-    task clr_bram_cnt_lsb;
-        begin
-            _set_bram_cnt_lsb(8'd0, 1'b0);
-        end
-    endtask
-    
-    task clr_bram_cnt_msb;
-        begin
-            _set_bram_cnt_msb(8'd0, 1'b0);
-        end
-    endtask
-    
-    
-   
-    
-
-    wire [1:0] rcmb_xy_dout_valid = {recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid}; 
-    
-    always @(posedge clk)
-        //
-        if (ena_x & ena_y) begin
-            clr_bram_cnt_lsb();
-            clr_bram_cnt_msb();
-        end else begin  // if not ready???
-            //
-            case (rcmb_mode)
-                2'd1: recombine_square();
-                2'd2: recombine_triangle();
-                2'd3: recombine_rectangle();
-            endcase
-            //
-        end
-           
-    task recombine_square;
-        //
-        begin
-            //
-            case (rcmb_xy_dout_valid)
-                //
-                2'b01: inc_bram_cnt_lsb(); 
-                2'b10: inc_bram_cnt_msb();
-                2'b11: begin
-                    inc_bram_cnt_lsb();
-                    inc_bram_cnt_msb();
-                end
-                //
-            endcase            
-            //
-            case (rcmb_xy_dout_valid)
-                //
-                2'b00:  if (recomb_msb_flag_delay_2)  set_fat_bram_regs(BANK_FAT_ABH, recomb_msb_cnt_delay_2, {2'b00, recomb_msb_dout_delay_2}, {18{1'bX}});
-                        else                                clear_fat_bram_regs();
-                  2'b01:                                      set_fat_bram_regs(BANK_FAT_ABL, bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); 
-                  2'b10:  if (bram_xy_cnt_msb < 8'd2)         clear_fat_bram_regs();                        
-                        else                                set_fat_bram_regs(BANK_FAT_ABH, bram_xy_cnt_msb, {2'b00, recomb_msb_dout}, {18{1'bX}});                        
-                2'b11:  if (bram_xy_cnt_lsb_wrapped)   set_fat_bram_regs(BANK_FAT_ABH, bram_xy_cnt_lsb, {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}}, {18{1'bX}}); 
-                        else                                set_fat_bram_regs(BANK_FAT_ABL, bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}});
-                default:    clear_fat_bram_regs();  // DEBUG!!!
-                //
-            endcase            
-            //
-            case (rcmb_xy_dout_valid)
-                //
-                2'b00:  if (recomb_msb_flag_delay_2)  advance_recomb_msb_dout_delay(16'hXXXX, 8'd0, 1'b0);
-                2'b10:  if (bram_xy_cnt_msb < 8'd2)         shift_recomb_msb_dout_carry(recomb_msb_dout);
-//                //
-                2'b11:  begin                          advance_recomb_msb_dout_delay(recomb_msb_dout, bram_xy_cnt_msb, 1'b1);
-                        if (bram_xy_cnt_lsb_wrapped)   shift_recomb_msb_dout_carry({16{1'bX}});
-                        end
-                //
-            endcase
-            //        
-        end
-        //
-    endtask
-    
-    
-    task recombine_triangle;
-        //
-        begin
-            //
-            case (rcmb_xy_dout_valid)
-                //
-                2'b01: inc_bram_cnt_lsb(); 
-               //
-            endcase            
-            //
-            case (rcmb_xy_dout_valid)
-                //
-                2'b00:  clear_slim_bram_regs();
-                2'b01:  if (!bram_xy_cnt_lsb_wrapped) set_slim_bram_regs(BANK_SLIM_Q,   bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); 
-                        else                         set_slim_bram_regs(BANK_SLIM_EXT, 8'd1, {2'b00, recomb_lsb_dout}, {18{1'bX}});
-                2'b10:  clear_slim_bram_regs();
-                2'b11:  clear_slim_bram_regs();
-                //
-            endcase
-            //        
-        end
-        //
-    endtask
-
-
-    task recombine_rectangle;
-        //
-        begin
-            //
-            case (rcmb_xy_dout_valid)
-                //
-                2'b01: inc_bram_cnt_lsb(); 
-                2'b10: inc_bram_cnt_msb();
-                2'b11: begin
-                    inc_bram_cnt_lsb();
-                    inc_bram_cnt_msb();
-                end
-                //
-            endcase
-//            //
-            case (rcmb_xy_dout_valid)
-//                //
-                2'b00:  if (recomb_msb_flag_delay_2)  set_fat_bram_regs(BANK_FAT_MH, recomb_msb_cnt_delay_2, {2'b00, recomb_msb_dout_delay_2}, {18{1'bX}});
-                        else                                clear_fat_bram_regs();
-                2'b01:                                      set_fat_bram_regs(BANK_FAT_ML, bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); 
-                2'b10:  if (!bram_xy_cnt_msb_wrapped) begin 
-                            if (bram_xy_cnt_msb < 8'd2)         clear_fat_bram_regs();                        
-                            else                                set_fat_bram_regs(BANK_FAT_MH, bram_xy_cnt_msb, {2'b00, recomb_msb_dout}, {18{1'bX}});
-                        end else
-                                                                set_fat_bram_regs(BANK_FAT_EXT, 8'd0, {2'b00, recomb_msb_dout}, {18{1'bX}});
-                            
-                2'b11:  set_fat_bram_regs(BANK_FAT_MH, bram_xy_cnt_lsb, {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}}, {18{1'bX}}); 
-//                //
-            endcase            
-//            //
-            case (rcmb_xy_dout_valid)
-//                //
-                2'b00:  if (recomb_msb_flag_delay_2)  advance_recomb_msb_dout_delay(16'hXXXX, 8'd0, 1'b0);
-                2'b10:  begin 
-                            if ((bram_xy_cnt_msb < 8'd2) && !bram_xy_cnt_msb_wrapped)         shift_recomb_msb_dout_carry(recomb_msb_dout);
-                            if (bram_xy_cnt_msb_wrapped) advance_recomb_msb_dout_delay(16'hXXXX, 8'd0, 1'b0);
-                        end
-//                //
-                2'b11:  begin  advance_recomb_msb_dout_delay(recomb_msb_dout, bram_xy_cnt_msb, 1'b1);
-                                   shift_recomb_msb_dout_carry({16{1'bX}});
-                        end
-//                //
-            endcase
-            //
-        end
-        //
-    endtask
-    
-    
-    always @(posedge clk)
-        //
-        if (ena_x & ena_y) begin
-            rdy_adv <= 1'b0;
-        end else if (!rdy_reg) begin
-            //
-            case (rcmb_mode)
-                //
-                2'd1:   case (rcmb_xy_dout_valid)
-                            //
-                            2'b00: begin
-                                //
-                                if (recomb_msb_flag_delay_2) begin
-                                    //
-                                    rdy_adv <= ~recomb_msb_flag_delay_1;
-                                    //
-                                end
-                                //
-                            end
-                            //
-                        endcase
-                //
-                2'd2:   case (rcmb_xy_dout_valid)
-                            //
-                            2'b01: rdy_adv <= bram_xy_cnt_lsb_wrapped;                                //
-                            //
-                        endcase
-                //
-                2'd3: case (rcmb_xy_dout_valid)
-                                            //
-                                            2'b00: begin
-                                                //
-                                                if (recomb_msb_flag_delay_2) begin
-                                                    //
-                                                    rdy_adv <= ~recomb_msb_flag_delay_1;
-                                                    //
-                                                end
-                                                //
-                                            end
-                                            //
-                                        endcase
-                //
-            endcase
-            //        
-        end
-
-
-    
-        // add ready for mode=3
-endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
index efe0ac5..d6b1ad1 100644
--- a/rtl/modexpng_recombinator_block.v
+++ b/rtl/modexpng_recombinator_block.v
@@ -1,35 +1,1225 @@
 module modexpng_recombinator_block
 (
-    clk,
-    ce, clr,
-    din, dout
+    clk, rst,
+    ena, rdy,
+    fsm_state_next,
+    word_index_last,
+    dsp_xy_ce_p,
+    dsp_x_p, dsp_y_p,
+    col_index, col_index_last,
+    rd_narrow_xy_addr, rd_narrow_xy_bank,
+    rcmb_wide_xy_bank,   rcmb_wide_xy_addr,   rcmb_wide_x_dout,   rcmb_wide_y_dout,   rcmb_wide_xy_valid,
+    rcmb_narrow_xy_bank, rcmb_narrow_xy_addr, rcmb_narrow_x_dout, rcmb_narrow_y_dout, rcmb_narrow_xy_valid,
+    rdct_narrow_xy_bank, rdct_narrow_xy_addr, rdct_narrow_x_dout, rdct_narrow_y_dout, rdct_narrow_xy_valid
 );
 
-    input         clk;
-    input         ce;
-    input         clr;
-    input  [46:0] din;
-    output [15:0] dout;
 
-    reg [14:0] z;
-    reg [16:0] y;
-    reg [17:0] x;
-    //reg [15:0] w;
+    //
+    // Headers
+    //
+    `include "../rtl_1/modexpng_mmm_fsm_old.vh"
+    `include "../rtl_1/modexpng_parameters_old.vh"
+    `include "../rtl_1/modexpng_parameters_x8_old.vh"
 
-    //assign dout = w;
-    assign dout = x[15:0];
+
+    input                        clk;
+    input                        rst;
+    input                        ena;
+    output                       rdy;
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;
+    input [7:0]                  word_index_last;
+    input                        dsp_xy_ce_p;
+    input  [9*47-1:0] dsp_x_p;
+    input  [9*47-1:0] dsp_y_p;
+    input  [     4:0] col_index;
+    input  [     4:0] col_index_last;
+    
+    input  [     7:0] rd_narrow_xy_addr;
+    input  [     1:0] rd_narrow_xy_bank;
+
+    output [     1:0] rcmb_wide_xy_bank;
+    output [     7:0] rcmb_wide_xy_addr;
+    output [    17:0] rcmb_wide_x_dout;
+    output [    17:0] rcmb_wide_y_dout;
+    output            rcmb_wide_xy_valid;
+
+    output [     1:0] rcmb_narrow_xy_bank;
+    output [     7:0] rcmb_narrow_xy_addr;
+    output [    17:0] rcmb_narrow_x_dout;
+    output [    17:0] rcmb_narrow_y_dout;
+    output            rcmb_narrow_xy_valid;
+
+    output [     1:0] rdct_narrow_xy_bank;
+    output [     7:0] rdct_narrow_xy_addr;
+    output [    17:0] rdct_narrow_x_dout;
+    output [    17:0] rdct_narrow_y_dout;
+    output            rdct_narrow_xy_valid;
+
+
+    //
+    // Latches
+    //
+    reg  [1*47-1:0] dsp_x_p_latch[0:8];
+    reg  [1*47-1:0] dsp_y_p_latch[0:8];
+
+
+    //
+    // Mapping
+    //
+    wire [46:0] dsp_x_p_split[0:8];
+    wire [46:0] dsp_y_p_split[0:8];
+    
+    genvar z;  // generate-loop index, one iteration per 47-bit slice
+    generate for (z=0; z<(NUM_MULTS+1); z=z+1)  // NOTE(review): the split arrays are declared [0:8], so NUM_MULTS is presumably 8 - confirm in the parameters header
+        begin : gen_dsp_xy_p_split
+            assign dsp_x_p_split[z] = dsp_x_p[47*z+:47];  // bits [47*z +: 47] of the flat dsp_x_p bus
+            assign dsp_y_p_split[z] = dsp_y_p[47*z+:47];  // bits [47*z +: 47] of the flat dsp_y_p bus
+        end
+    endgenerate
+
+
+    //
+    // Delays
+    //
+    reg dsp_xy_ce_p_dly1 = 1'b0;
+
+    always @(posedge clk)
+        // one-cycle delayed copy of dsp_xy_ce_p; rst is sampled on the clock edge and forces the copy low
+        if (rst) dsp_xy_ce_p_dly1 <= 1'b0;
+        else     dsp_xy_ce_p_dly1 <= dsp_xy_ce_p;
+
+
+    //
+    // Registers
+    //
+    
+    // valid
+    reg       xy_valid_lsb = 1'b0;
+    reg       xy_aux_lsb   = 1'b0;
+    reg       xy_valid_msb = 1'b0;
+    
+    // bitmap
+    reg [7:0] xy_bitmap_lsb = {8{1'b0}};
+    reg [7:0] xy_bitmap_msb = {8{1'b0}};
+    
+    // index
+    reg [2:0] xy_index_lsb = 3'dX;
+    
+    // purge
+    reg       xy_purge_lsb = 1'b0;
+    reg       xy_purge_msb = 1'b0;
+    
+    // valid - latch
+    reg       xy_valid_latch_lsb = 1'b0;
+    
+    // aux - latch
+    reg       xy_aux_latch_lsb = 1'b0;
+    
+    // bitmap - latch
+    reg [7:0] xy_bitmap_latch_lsb = {8{1'b0}};
+    reg [7:0] xy_bitmap_latch_msb = {8{1'b0}};
+
+    // index - latch
+    reg [2:0] xy_index_latch_lsb = 3'dX;
+    
+    // purge - latch
+    reg       xy_purge_latch_lsb = 1'b0;
+    reg       xy_purge_latch_msb = 1'b0;
+
+    // 
+    reg       xy_valid_lsb_adv[1:6];
+    reg       xy_valid_msb_adv[1:6];
+    reg       xy_aux_lsb_adv[1:6];
+    reg [7:0] xy_bitmap_lsb_adv[1:6];
+    reg [7:0] xy_bitmap_msb_adv[1:6];
+    reg [2:0] xy_index_lsb_adv[1:6];
+    reg [2:0] xy_index_msb_adv[1:6];
+    reg       xy_purge_lsb_adv[1:6];
+    reg       xy_purge_msb_adv[1:6];
+    
+    reg [1:0] rcmb_mode;
+       
+    always @(posedge clk)
+       // rcmb_mode is updated only on clock edges where ena is high; otherwise it keeps its value
+       if (ena)
+           // encoding: 1 = square, 2 = triangle, 3 = rectangle, 0 = any other next state
+           case (fsm_state_next)
+               FSM_STATE_MULT_SQUARE_COL_0_BUSY:        rcmb_mode <= 2'd1;
+               FSM_STATE_MULT_TRIANGLE_COL_0_BUSY:      rcmb_mode <= 2'd2;
+               FSM_STATE_MULT_RECTANGLE_COL_0_BUSY:     rcmb_mode <= 2'd3;
+               default:                                 rcmb_mode <= 2'd0;
+           endcase
+
+               
+    integer i;  // loop index for the power-up initialisation below
+    initial for (i=1; i<=6; i=i+1) begin  // the *_adv arrays are declared [1:6]; include element 6 so no flag starts as X
+        xy_valid_lsb_adv[i] = 1'b0;
+        xy_valid_msb_adv[i] = 1'b0;
+        xy_aux_lsb_adv[i] = 1'b0;
+        xy_bitmap_lsb_adv[i] = {8{1'b0}};
+        xy_bitmap_msb_adv[i] = {8{1'b0}};
+        xy_index_lsb_adv[i] = 3'dX;  // index entries are deliberately left as don't-care
+        xy_index_msb_adv[i] = 3'dX;
+        xy_purge_lsb_adv[i] = 1'b0;
+        xy_purge_msb_adv[i] = 1'b0;
+    end
+    
+    function        calc_square_triangle_valid_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // returns 1 when address bits [7:3] equal the column index, else 0; col_index_last_value and narrow_xy_bank_value are not used
+            if (narrow_xy_addr_value[7:3] == col_index_value)
+                calc_square_triangle_valid_lsb = 1'b1;
+            else
+                calc_square_triangle_valid_lsb = 1'b0;
+            //
+        end
+    endfunction
+
+    function        calc_square_valid_lsb;  // square-mode alias: forwards all four arguments to calc_square_triangle_valid_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_square_valid_lsb = calc_square_triangle_valid_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);   
+        end
+    endfunction
+
+    function        calc_triangle_valid_lsb;  // triangle-mode alias: forwards all four arguments to calc_square_triangle_valid_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_triangle_valid_lsb = calc_square_triangle_valid_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);   
+        end
+    endfunction
+    
+    function        calc_rectangle_valid_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // returns 1 only when address bits [7:3] equal the column index AND the bank is not BANK_NARROW_EXT; col_index_last_value is not used
+            if (narrow_xy_addr_value[7:3] == col_index_value) 
+                calc_rectangle_valid_lsb = narrow_xy_bank_value != BANK_NARROW_EXT;
+            else
+                calc_rectangle_valid_lsb = 1'b0;
+            //
+        end
+    endfunction
+
+    function        calc_triangle_aux_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // returns 1 exactly when the bank is BANK_NARROW_EXT; both column indices and the address are not used
+            if (narrow_xy_bank_value == BANK_NARROW_EXT)
+                calc_triangle_aux_lsb = 1'b1;
+            else
+                calc_triangle_aux_lsb = 1'b0;
+            //
+        end
+    endfunction
+    
+    function  [7:0] calc_square_triangle_bitmap_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // when address bits [7:3] equal the column index, returns a one-hot byte; col_index_last_value and the bank are not used
+            if (narrow_xy_addr_value[7:3] == col_index_value)
+                // address bits [2:0] select which single bit of the byte is set
+                case (narrow_xy_addr_value[2:0])
+                    3'b000: calc_square_triangle_bitmap_lsb = 8'b00000001;
+                    3'b001: calc_square_triangle_bitmap_lsb = 8'b00000010;
+                    3'b010: calc_square_triangle_bitmap_lsb = 8'b00000100;
+                    3'b011: calc_square_triangle_bitmap_lsb = 8'b00001000;
+                    3'b100: calc_square_triangle_bitmap_lsb = 8'b00010000;
+                    3'b101: calc_square_triangle_bitmap_lsb = 8'b00100000;
+                    3'b110: calc_square_triangle_bitmap_lsb = 8'b01000000;
+                    3'b111: calc_square_triangle_bitmap_lsb = 8'b10000000;
+                endcase
+                //
+            else
+                calc_square_triangle_bitmap_lsb = {8{1'b0}};  // address outside the current column: empty bitmap
+            //
+        end
+    endfunction
+
+    function  [7:0] calc_square_bitmap_lsb;  // square-mode alias: forwards all four arguments to calc_square_triangle_bitmap_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_square_bitmap_lsb = calc_square_triangle_bitmap_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);
+        end
+    endfunction
+
+    function  [7:0] calc_triangle_bitmap_lsb;  // triangle-mode alias: forwards all four arguments to calc_square_triangle_bitmap_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_triangle_bitmap_lsb = calc_square_triangle_bitmap_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);
+        end
+    endfunction
+
+    function  [7:0] calc_rectangle_bitmap_lsb;
+       input [4:0] col_index_value;
+       input [4:0] col_index_last_value;
+       input [1:0] narrow_xy_bank_value;
+       input [7:0] narrow_xy_addr_value;
+       begin
+           // one-hot byte when address bits [7:3] equal the column index and the bank is not BANK_NARROW_EXT; col_index_last_value is not used
+           if ((narrow_xy_addr_value[7:3] == col_index_value) && (narrow_xy_bank_value != BANK_NARROW_EXT))
+               // address bits [2:0] select which single bit of the byte is set
+               case (narrow_xy_addr_value[2:0])
+                   3'b000: calc_rectangle_bitmap_lsb = 8'b00000001;
+                   3'b001: calc_rectangle_bitmap_lsb = 8'b00000010;
+                   3'b010: calc_rectangle_bitmap_lsb = 8'b00000100;
+                   3'b011: calc_rectangle_bitmap_lsb = 8'b00001000;
+                   3'b100: calc_rectangle_bitmap_lsb = 8'b00010000;
+                   3'b101: calc_rectangle_bitmap_lsb = 8'b00100000;
+                   3'b110: calc_rectangle_bitmap_lsb = 8'b01000000;
+                   3'b111: calc_rectangle_bitmap_lsb = 8'b10000000;
+               endcase
+               //
+           else
+               calc_rectangle_bitmap_lsb = {8{1'b0}};  // other column or EXT bank: empty bitmap
+           //
+        end
+    endfunction
+       
+       /*
+        * These can be simplified (the difference between square/triangle and
+        * rectangle is that the bank is checked or not). A universal function would
+        * accept a parameter that tells it whether it should check the bank or not.
+        * Let's do it later, too early to optimize now, it seems.
+        *
+        *
+        */
+       
+    function  [2:0] calc_square_triangle_index_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // when address bits [7:3] equal the column index, returns address bits [2:0]; col_index_last_value and the bank are not used
+            if (narrow_xy_addr_value[7:3] == col_index_value)
+                // explicit decode of address bits [2:0] into the 3-bit index
+                case (narrow_xy_addr_value[2:0])
+                    3'b000: calc_square_triangle_index_lsb = 3'd0;
+                    3'b001: calc_square_triangle_index_lsb = 3'd1;
+                    3'b010: calc_square_triangle_index_lsb = 3'd2;
+                    3'b011: calc_square_triangle_index_lsb = 3'd3;
+                    3'b100: calc_square_triangle_index_lsb = 3'd4;
+                    3'b101: calc_square_triangle_index_lsb = 3'd5;
+                    3'b110: calc_square_triangle_index_lsb = 3'd6;
+                    3'b111: calc_square_triangle_index_lsb = 3'd7;
+                endcase
+                //
+            else
+                calc_square_triangle_index_lsb = 3'dX;  // address outside the current column: index is don't-care
+            //
+        end
+    endfunction
+
+    function  [2:0] calc_square_index_lsb;  // square-mode alias: forwards all four arguments to calc_square_triangle_index_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_square_index_lsb = calc_square_triangle_index_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);
+        end
+    endfunction
+
+    function  [2:0] calc_triangle_index_lsb;  // triangle-mode alias: forwards all four arguments to calc_square_triangle_index_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_triangle_index_lsb = calc_square_triangle_index_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);
+        end
+    endfunction
+
+    function  [2:0] calc_rectangle_index_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // returns address bits [2:0] when bits [7:3] equal the column index and the bank is not BANK_NARROW_EXT; col_index_last_value is not used
+            if ((narrow_xy_addr_value[7:3] == col_index_value) && (narrow_xy_bank_value != BANK_NARROW_EXT))
+                // explicit decode of address bits [2:0] into the 3-bit index
+                case (narrow_xy_addr_value[2:0])
+                    3'b000: calc_rectangle_index_lsb = 3'd0;
+                    3'b001: calc_rectangle_index_lsb = 3'd1;
+                    3'b010: calc_rectangle_index_lsb = 3'd2;
+                    3'b011: calc_rectangle_index_lsb = 3'd3;
+                    3'b100: calc_rectangle_index_lsb = 3'd4;
+                    3'b101: calc_rectangle_index_lsb = 3'd5;
+                    3'b110: calc_rectangle_index_lsb = 3'd6;
+                    3'b111: calc_rectangle_index_lsb = 3'd7;
+                endcase
+                //
+            else
+                calc_rectangle_index_lsb = 3'dX;  // other column or EXT bank: index is don't-care
+            //
+        end
+    endfunction
+    
+    function        calc_square_rectangle_purge_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            // returns 1 when address bits [7:3] equal both the column index and the last column index, else 0; the bank is not used
+            if (narrow_xy_addr_value[7:3] == col_index_value)
+                calc_square_rectangle_purge_lsb = narrow_xy_addr_value[7:3] == col_index_last_value;
+            else
+                calc_square_rectangle_purge_lsb = 1'b0;
+            //
+        end
+    endfunction
+
+    function        calc_square_purge_lsb;  // square-mode alias: forwards all four arguments to calc_square_rectangle_purge_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_square_purge_lsb = calc_square_rectangle_purge_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);
+        end
+    endfunction
+
+    function        calc_rectangle_purge_lsb;  // rectangle-mode alias: forwards all four arguments to calc_square_rectangle_purge_lsb
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        begin
+            calc_rectangle_purge_lsb = calc_square_rectangle_purge_lsb(col_index_value, col_index_last_value, narrow_xy_bank_value, narrow_xy_addr_value);
+        end
+    endfunction
+
+    function        calc_square_valid_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // returns 1 when the full 8-bit address equals index_last_value, else 0; the column indices and the bank are not used
+            if (narrow_xy_addr_value == index_last_value)
+                calc_square_valid_msb = 1'b1;
+            else
+                calc_square_valid_msb = 1'b0;
+            //
+        end
+    endfunction
+
+    function        calc_rectangle_valid_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // returns 1 when the address is 1 and the bank is BANK_NARROW_EXT, else 0; the column indices and index_last_value are not used
+            if ((narrow_xy_addr_value == 8'd1) && (narrow_xy_bank_value == BANK_NARROW_EXT))
+                calc_rectangle_valid_msb = 1'b1;
+            else
+                calc_rectangle_valid_msb = 1'b0;
+            //
+        end
+    endfunction
+    
+    function  [7:0] calc_square_bitmap_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // non-zero only when the address equals index_last_value; the bank is not used
+            if (narrow_xy_addr_value == index_last_value) begin
+                calc_square_bitmap_msb[7] = col_index_value != col_index_last_value;  // top bit is cleared in the last column
+                calc_square_bitmap_msb[6:0] = 7'b1111111;                             // lower seven bits are always set
+            end else
+                calc_square_bitmap_msb[7:0] = 8'b00000000;
+            //
+        end
+    endfunction
+
+    function  [7:0] calc_rectangle_bitmap_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // all ones when the address is 1 and the bank is BANK_NARROW_EXT, otherwise all zeros; the column indices and index_last_value are not used
+            if ((narrow_xy_addr_value == 8'd1) && (narrow_xy_bank_value == BANK_NARROW_EXT)) begin
+                calc_rectangle_bitmap_msb[7:0] = 8'b11111111;
+            end else
+                calc_rectangle_bitmap_msb[7:0] = 8'b00000000;
+            //
+        end
+    endfunction
+
+    function        calc_square_purge_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // returns 1 when the address equals index_last_value and the column index equals the last column index, else 0; the bank is not used
+            if (narrow_xy_addr_value == index_last_value)
+                calc_square_purge_msb = col_index_value == col_index_last_value;
+            else
+                calc_square_purge_msb = 1'b0;
+            //
+        end
+    endfunction
+
+    function        calc_rectangle_purge_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [1:0] narrow_xy_bank_value;
+        input [7:0] narrow_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // returns 1 when the address is 1, the bank is BANK_NARROW_EXT and the column index equals the last column index, else 0; index_last_value is not used
+            if ((narrow_xy_addr_value == 8'd1) && (narrow_xy_bank_value == BANK_NARROW_EXT))
+                calc_rectangle_purge_msb = col_index_value == col_index_last_value;
+            else
+                calc_rectangle_purge_msb = 1'b0;
+            //
+        end
+    endfunction
+
+    
+    reg         rcmb_xy_lsb_ce = 1'b0;
+    reg         rcmb_xy_lsb_ce_aux;
+    reg  [ 2:0] rcmb_xy_lsb_ce_purge = 3'b000;
+    wire        rcmb_xy_lsb_ce_combined = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0];
+    reg         rcmb_xy_lsb_clr;
+
+    reg  [46:0] rcmb_x_lsb_din;
+    reg  [46:0] rcmb_y_lsb_din;
+    wire [15:0] rcmb_x_lsb_dout;
+    wire [15:0] rcmb_y_lsb_dout;
+
+    reg         rcmb_xy_msb_ce = 1'b0;
+    reg  [ 1:0] rcmb_xy_msb_ce_purge = 2'b00;
+    wire        rcmb_xy_msb_ce_combined = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0];
+    reg         rcmb_xy_msb_clr;
+    
+    reg  [46:0] rcmb_x_msb_din;
+    reg  [46:0] rcmb_y_msb_din;
+    wire [15:0] rcmb_x_msb_dout;
+    wire [15:0] rcmb_y_msb_dout;
+    
+    modexpng_recombinator_cell recomb_x_lsb
+    (
+        .clk    (clk),
+        .ce     (rcmb_xy_lsb_ce_combined),
+        .clr    (rcmb_xy_lsb_clr),
+        .din    (rcmb_x_lsb_din),
+        .dout   (rcmb_x_lsb_dout)
+    );
+    modexpng_recombinator_cell recomb_y_lsb
+    (
+        .clk    (clk),
+        .ce     (rcmb_xy_lsb_ce_combined),
+        .clr    (rcmb_xy_lsb_clr),
+        .din    (rcmb_y_lsb_din),
+        .dout   (rcmb_y_lsb_dout)
+    );
+
+    modexpng_recombinator_cell recomb_x_msb
+    (
+        .clk    (clk),
+        .ce     (rcmb_xy_msb_ce_combined),
+        .clr    (rcmb_xy_msb_clr),
+        .din    (rcmb_x_msb_din),
+        .dout   (rcmb_x_msb_dout)
+    );
+    
+    modexpng_recombinator_cell recomb_y_msb
+    (
+        .clk    (clk),
+        .ce     (rcmb_xy_msb_ce_combined),
+        .clr    (rcmb_xy_msb_clr),
+        .din    (rcmb_y_msb_din),
+        .dout   (rcmb_y_msb_dout)
+    );
+
+    //
+    // Recombinator clock enables. Each CE flag is the matching latch flag
+    // delayed by one cycle. Bit [0] of each purge shift register is OR-ed into
+    // the combined CE, so loading all-ones keeps the cells enabled for extra
+    // cycles after the load (three for the LSB pair, two for the MSB pair)
+    // while the register drains towards zero.
+    //
+    always @(posedge clk) begin
+        //
+        rcmb_xy_lsb_ce <= xy_valid_latch_lsb;
+        rcmb_xy_lsb_ce_aux <= xy_aux_latch_lsb;
+        rcmb_xy_msb_ce <= xy_bitmap_latch_msb[0];
+        //
+        if (xy_purge_latch_lsb)
+            rcmb_xy_lsb_ce_purge <= 3'b111;
+        else
+            rcmb_xy_lsb_ce_purge <= {1'b0, rcmb_xy_lsb_ce_purge[2:1]};
+        //
+        // MSB purge is armed only on the last set bit of the MSB bitmap
+        // (bit 0 set, bit 1 clear). Must be a non-blocking assignment like the
+        // else branch: mixing '=' and '<=' on one register in a clocked block
+        // is a race hazard and is rejected by synthesis tools.
+        if (xy_purge_latch_msb && xy_bitmap_latch_msb[0] && !xy_bitmap_latch_msb[1])
+            rcmb_xy_msb_ce_purge <= 2'b11;
+        else
+            rcmb_xy_msb_ce_purge <= {1'b0, rcmb_xy_msb_ce_purge[1]};
+        //
+    end
+
+
+    // Clear flags for the recombinator cells: both are raised while ena is
+    // high and each one is dropped independently once its own (non-purge,
+    // non-aux) clock enable has been seen.
+    always @(posedge clk) begin
+        // LSB pair
+        if (ena)                 rcmb_xy_lsb_clr <= 1'b1;
+        else if (rcmb_xy_lsb_ce) rcmb_xy_lsb_clr <= 1'b0;
+        // MSB pair
+        if (ena)                 rcmb_xy_msb_clr <= 1'b1;
+        else if (rcmb_xy_msb_ce) rcmb_xy_msb_clr <= 1'b0;
+    end
+
+    // LSB recombinator input select (registered). Priority: the latched DSP
+    // product picked by xy_index_latch_lsb when the valid flag is set, then the
+    // product at index 8 when the aux flag is set, otherwise all zeroes. The
+    // same latch flags are registered into rcmb_xy_lsb_ce / rcmb_xy_lsb_ce_aux,
+    // so data and clock enable are presented to the cells in the same cycle.
+    always @(posedge clk)
+        //
+        if (xy_valid_latch_lsb) begin
+            rcmb_x_lsb_din <= dsp_x_p_latch[xy_index_latch_lsb];
+            rcmb_y_lsb_din <= dsp_y_p_latch[xy_index_latch_lsb];
+        end else if (xy_aux_latch_lsb) begin
+            rcmb_x_lsb_din <= dsp_x_p_latch[8];
+            rcmb_y_lsb_din <= dsp_y_p_latch[8];
+        end else begin
+            rcmb_x_lsb_din <= {47{1'b0}};
+            rcmb_y_lsb_din <= {47{1'b0}};
+        end
+
+    // MSB recombinator input select (registered). The MSB path always reads
+    // entry 0 of the latched DSP products while bit 0 of the MSB bitmap latch
+    // is set (the latch array is shifted down elsewhere so successive products
+    // arrive at entry 0); zeroes are fed otherwise.
+    always @(posedge clk)
+        //
+        if (xy_bitmap_latch_msb[0]) begin
+            rcmb_x_msb_din <= dsp_x_p_latch[0];
+            rcmb_y_msb_din <= dsp_y_p_latch[0];
+        end else begin
+            rcmb_x_msb_din <= {47{1'b0}};
+            rcmb_y_msb_din <= {47{1'b0}};
+        end
+
+
+    //
+    // Stage 6 (entry point) of the control pipeline, derived from the upcoming
+    // FSM state. Square and rectangle states drive both the LSB and the MSB
+    // flag sets (aux is forced low). Triangle states drive only the LSB set,
+    // including aux, and hold the LSB purge flag and all MSB flags low. Any
+    // other state clears every flag and leaves the LSB index as don't-care.
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+                //
+                xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_aux_lsb_adv   [6] <= 1'b0;
+                xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                //
+                xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr, word_index_last);
+                xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr, word_index_last);
+                xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr, word_index_last);
+                //
+            end
+            //
+            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
+            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
+            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
+            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: begin
+                //
+                xy_valid_lsb_adv [6] <= calc_triangle_valid_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_aux_lsb_adv   [6] <= calc_triangle_aux_lsb   (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_bitmap_lsb_adv[6] <= calc_triangle_bitmap_lsb(col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_index_lsb_adv [6] <= calc_triangle_index_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_purge_lsb_adv [6] <= 1'b0;
+                //
+                xy_valid_msb_adv [6] <= 1'b0;
+                xy_bitmap_msb_adv[6] <= {8{1'b0}};
+                xy_purge_msb_adv [6] <= 1'b0;
+                //
+            end
+            //
+            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
+            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
+            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
+            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: begin
+                //
+                xy_valid_lsb_adv [6] <= calc_rectangle_valid_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_aux_lsb_adv   [6] <= 1'b0;
+                xy_bitmap_lsb_adv[6] <= calc_rectangle_bitmap_lsb(col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_index_lsb_adv [6] <= calc_rectangle_index_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                xy_purge_lsb_adv [6] <= calc_rectangle_purge_lsb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr);
+                //
+                xy_valid_msb_adv [6] <= calc_rectangle_valid_msb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr, word_index_last);
+                xy_bitmap_msb_adv[6] <= calc_rectangle_bitmap_msb(col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr, word_index_last);
+                xy_purge_msb_adv [6] <= calc_rectangle_purge_msb (col_index, col_index_last, rd_narrow_xy_bank, rd_narrow_xy_addr, word_index_last);
+                //
+            end
+            //
+            default: begin
+                //
+                xy_valid_lsb_adv [6] <= 1'b0;
+                xy_aux_lsb_adv   [6] <= 1'b0;
+                xy_bitmap_lsb_adv[6] <= {8{1'b0}};
+                xy_index_lsb_adv [6] <= 3'dX;
+                xy_purge_lsb_adv [6] <= 1'b0;
+                //
+                xy_valid_msb_adv [6] <= 1'b0;
+                xy_bitmap_msb_adv[6] <= {8{1'b0}};
+                xy_purge_msb_adv [6] <= 1'b0;
+                //
+            end
+            //
+        endcase
+
+
+    //
+    // Control pipeline. The *_adv flags move one stage per clock from index 6
+    // down to index 1, then into the xy_*_lsb / xy_*_msb registers and finally
+    // into the *_latch_* registers that drive the recombinator inputs.
+    //
+    always @(posedge clk) begin
+        //
+        xy_valid_lsb  <= xy_valid_lsb_adv [1];
+        xy_aux_lsb    <= xy_aux_lsb_adv   [1];
+        xy_bitmap_lsb <= xy_bitmap_lsb_adv[1];
+        xy_index_lsb  <= xy_index_lsb_adv [1];
+        xy_purge_lsb  <= xy_purge_lsb_adv [1];
+        //
+        xy_valid_latch_lsb  <= xy_valid_lsb;
+        xy_aux_latch_lsb    <= xy_aux_lsb;
+        xy_bitmap_latch_lsb <= xy_bitmap_lsb;
+        xy_index_latch_lsb  <= xy_index_lsb;
+        xy_purge_latch_lsb  <= xy_purge_lsb;
+        //
+        xy_valid_msb  <= xy_valid_msb_adv[1];
+        xy_bitmap_msb <= xy_bitmap_msb_adv[1];
+        xy_purge_msb  <= xy_purge_msb_adv[1];
+        //
+        // The MSB bitmap latch is loaded when xy_valid_msb is set and is
+        // otherwise shifted right by one (zero fill) every cycle; the MSB
+        // purge latch is only updated on a load and holds its value in between.
+        if (xy_valid_msb) begin
+            xy_bitmap_latch_msb <= xy_bitmap_msb;
+            xy_purge_latch_msb  <= xy_purge_msb;
+        end else begin
+            xy_bitmap_latch_msb <= {1'b0, xy_bitmap_latch_msb[7:1]};
+        end
+        //
+        // stages 1..5 each take the value of the next higher stage
+        //
+        for (i=1; i<6; i=i+1) begin
+            xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
+            xy_aux_lsb_adv   [i] <= xy_aux_lsb_adv   [i+1];
+            xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
+            xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
+            xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
+            //
+            xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
+            xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
+            xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
+        end
+        //
+    end
+
+    //
+    // Latched DSP products. Two mutually exclusive actions, shift first:
+    //  - while bit 1 of the MSB bitmap latch is set, entries 0..6 take the
+    //    value of the next higher entry and entry 7 becomes don't-care;
+    //  - otherwise, when dsp_xy_ce_p_dly1 is set, entry i (0..7) captures
+    //    dsp_*_p_split[i] if its LSB bitmap bit is set, or if xy_valid_msb is
+    //    set together with its MSB bitmap bit; entry 8 captures on xy_aux_lsb.
+    // Entries that are not selected keep their previous value.
+    //
+    always @(posedge clk)
+        //
+        if (xy_bitmap_latch_msb[1])   // only shift 7 times
+            //
+            for (i=0; i<8; i=i+1)
+                //            
+                if (i < 7) begin
+                    dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
+                    dsp_y_p_latch[i] <= dsp_y_p_latch[i+1];
+                end else begin
+                    dsp_x_p_latch[i] <= {47{1'bX}};
+                    dsp_y_p_latch[i] <= {47{1'bX}};
+                end
+            //
+        else if (dsp_xy_ce_p_dly1) begin
+            //
+            for (i=0; i<8; i=i+1)
+                //
+                if (xy_bitmap_lsb[i]) begin
+                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
+                    dsp_y_p_latch[i] <= dsp_y_p_split[i];
+                end else if (xy_valid_msb && xy_bitmap_msb[i]) begin
+                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
+                    dsp_y_p_latch[i] <= dsp_y_p_split[i];
+                end
+            //
+            if (xy_aux_lsb) begin
+                dsp_x_p_latch[8] <= dsp_x_p_split[8];
+                dsp_y_p_latch[8] <= dsp_y_p_split[8];
+            end
+            //
+        end
+
+    reg rcmb_xy_lsb_valid = 1'b0;
+    reg rcmb_xy_msb_valid = 1'b0;
+
+    // Output-valid strobes of the recombinator cells: the combined clock
+    // enables delayed by one cycle, with synchronous reset to zero.
+    always @(posedge clk)
+        //
+        if (rst) {rcmb_xy_msb_valid, rcmb_xy_lsb_valid} <= 2'b00;
+        else     {rcmb_xy_msb_valid, rcmb_xy_lsb_valid} <= {rcmb_xy_msb_ce_combined, rcmb_xy_lsb_ce_combined};
+
+
+    reg [ 1:0] wide_xy_bank;
+    reg [ 7:0] wide_xy_addr;
+    reg [17:0] wide_x_dout;
+    reg [17:0] wide_y_dout;
+    reg        wide_xy_valid = 1'b0;
+
+    reg [ 1:0] narrow_xy_bank;
+    reg [ 7:0] narrow_xy_addr;
+    reg [17:0] narrow_x_dout;
+    reg [17:0] narrow_y_dout;
+    reg        narrow_xy_valid = 1'b0;
+
+    reg [ 1:0] rdct_xy_bank;
+    reg [ 7:0] rdct_xy_addr;
+    reg [17:0] rdct_x_dout;
+    reg [17:0] rdct_y_dout;
+    reg        rdct_xy_valid = 1'b0;
+
+    reg [ 7:0] cnt_lsb;
+    reg [ 7:0] cnt_msb;
+    
+    reg        cnt_lsb_wrapped;
+    reg        cnt_msb_wrapped;
+
+    reg [31:0] rcmb_xy_msb_carry_0;
+    reg [31:0] rcmb_xy_msb_carry_1;
+    
+    reg [31:0] rcmb_xy_msb_delay_0;
+    reg [31:0] rcmb_xy_msb_delay_1;
+    reg [31:0] rcmb_xy_msb_delay_2;
+    
+    reg [ 7:0] rcmb_msb_cnt_delay_0 = 8'd0;
+    reg [ 7:0] rcmb_msb_cnt_delay_1 = 8'd0;
+    reg [ 7:0] rcmb_msb_cnt_delay_2 = 8'd0;
+
+    reg        rcmb_msb_flag_delay_0;
+    reg        rcmb_msb_flag_delay_1;
+    reg        rcmb_msb_flag_delay_2;
+    
+    assign rcmb_wide_xy_bank  = wide_xy_bank;
+    assign rcmb_wide_xy_addr  = wide_xy_addr;
+    assign rcmb_wide_x_dout   = wide_x_dout;
+    assign rcmb_wide_y_dout   = wide_y_dout;
+    assign rcmb_wide_xy_valid = wide_xy_valid;
+
+    assign rcmb_narrow_xy_bank  = narrow_xy_bank;
+    assign rcmb_narrow_xy_addr  = narrow_xy_addr;
+    assign rcmb_narrow_x_dout   = narrow_x_dout;
+    assign rcmb_narrow_y_dout   = narrow_y_dout;
+    assign rcmb_narrow_xy_valid = narrow_xy_valid;
+
+    assign rdct_narrow_xy_bank  = rdct_xy_bank;
+    assign rdct_narrow_xy_addr  = rdct_xy_addr;
+    assign rdct_narrow_x_dout   = rdct_x_dout;
+    assign rdct_narrow_y_dout   = rdct_y_dout;
+    assign rdct_narrow_xy_valid = rdct_xy_valid;
+    
+    reg rdy_reg = 1'b1;
+    reg rdy_adv = 1'b1;
+    
+    assign rdy = rdy_reg;
     
-    wire [14:0] din_z = din[46:32]; // TODO: maybe determine more precise bound here
-    wire [15:0] din_y = din[31:16];
-    wire [15:0] din_x = din[15: 0];
     
     always @(posedge clk)
         //
-        if (ce) begin
-            z <= din_z;
-            y <= clr ? {1'b0, din_y}  : {1'b0, din_y} + {2'b00, z};
-            x <= clr ? {2'b00, din_x} : {2'b00, din_x} + {1'b0, y} + {{16{1'b0}}, x[17:16]};
-            //w <= clr ? {16{1'bX}}     : x[15:0];        
+        if (ena) rdy_reg <= 1'b0;
+        else     rdy_reg <= rdy_adv;
+            
+    // Pushes one entry into the three-stage MSB delay line and moves the two
+    // older entries one stage further. Each entry consists of a data word
+    // ({dout_y, dout_x}), the word counter value it belongs to (cnt) and a
+    // flag marking the entry as occupied (flag); stage 2 is the oldest.
+    task advance_rcmb_msb_delay;
+        input [15:0] dout_x;
+        input [15:0] dout_y;
+        input [ 7:0] cnt;
+        input        flag;
+        begin
+            //
+            rcmb_xy_msb_delay_0 <= {dout_y, dout_x};
+            rcmb_xy_msb_delay_1 <= rcmb_xy_msb_delay_0;
+            rcmb_xy_msb_delay_2 <= rcmb_xy_msb_delay_1;
+            //
+            rcmb_msb_cnt_delay_0 <= cnt;
+            rcmb_msb_cnt_delay_1 <= rcmb_msb_cnt_delay_0;
+            rcmb_msb_cnt_delay_2 <= rcmb_msb_cnt_delay_1;
+            //
+            rcmb_msb_flag_delay_0 <= flag;
+            rcmb_msb_flag_delay_1 <= rcmb_msb_flag_delay_0;
+            rcmb_msb_flag_delay_2 <= rcmb_msb_flag_delay_1;
+            //
+        end
+    endtask
+         
+    // Pushes {dout_y, dout_x} into the two-entry MSB carry register chain;
+    // the previous entry 0 moves to entry 1 and the previous entry 1 is lost.
+    task shift_rcmb_msb_carry;
+        input [15:0] dout_x;
+        input [15:0] dout_y;
+        begin
+            rcmb_xy_msb_carry_0 <= {dout_y, dout_x};
+            rcmb_xy_msb_carry_1 <= rcmb_xy_msb_carry_0;
+        end
+    endtask
+    
+    task _update_wide;
+        input [ 1:0] bank;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        input        valid;
+        begin
+            wide_xy_bank       <= bank;
+            wide_xy_addr       <= addr;
+            wide_x_dout        <= dout_x;
+            wide_y_dout        <= dout_y;
+            wide_xy_valid <= valid;
+        end
+    endtask
+    
+    task _update_narrow;
+        input [ 1:0] bank;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        input        valid;
+        begin
+            narrow_xy_bank       <= bank;
+            narrow_xy_addr       <= addr;
+            narrow_x_dout        <= dout_x;
+            narrow_y_dout        <= dout_y;
+            narrow_xy_valid <= valid;
         end
+    endtask
+
+    task _update_rdct;
+        input [ 1:0] bank;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        input        valid;
+        begin
+            rdct_xy_bank       <= bank;
+            rdct_xy_addr       <= addr;
+            rdct_x_dout        <= dout_x;
+            rdct_y_dout        <= dout_y;
+            rdct_xy_valid <= valid;
+        end
+    endtask
+            
+    task set_wide;
+        input [ 1:0] bank;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        begin
+            _update_wide(bank, addr, dout_x, dout_y, 1'b1);
+        end
+    endtask
+    
+    task set_narrow;
+        input [ 1:0] bank;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        begin
+            _update_narrow(bank, addr, dout_x, dout_y, 1'b1);
+        end
+    endtask
+    
+    task set_rdct;
+        input [ 1:0] bank;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        begin
+            _update_rdct(bank, addr, dout_x, dout_y, 1'b1);
+        end
+    endtask
+    
+    task clear_wide;
+        begin
+            _update_wide(2'bXX, 8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0);
+        end
+    endtask
+
+    task clear_narrow;
+        begin
+            _update_narrow(2'bXX, 8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0);
+        end
+    endtask
+
+    task clear_rdct;
+        begin
+            _update_rdct(2'bXX, 8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0);
+        end
+    endtask
+    
+    task _set_cnt_lsb;
+        input [7:0] cnt;
+        input       wrapped;
+        begin
+            cnt_lsb <= cnt;
+            cnt_lsb_wrapped <= wrapped;
+        end
+    endtask
+    
+    task _set_cnt_msb;
+        input [7:0] cnt;
+        input       wrapped;
+        begin
+            cnt_msb <= cnt;
+            cnt_msb_wrapped <= wrapped;
+        end
+    endtask    
+
+    // Advances the LSB word counter. On reaching word_index_last the counter
+    // restarts from zero and the sticky "wrapped" flag is raised; otherwise
+    // the counter increments and the flag keeps its current value.
+    task inc_cnt_lsb;
+        begin
+            if (cnt_lsb == word_index_last) begin
+                cnt_lsb         <= 8'd0;
+                cnt_lsb_wrapped <= 1'b1;
+            end else begin
+                cnt_lsb         <= cnt_lsb + 1'b1;
+                cnt_lsb_wrapped <= cnt_lsb_wrapped;
+            end
+        end
+    endtask
+    
+    task inc_cnt_both;
+        begin
+            inc_cnt_lsb;
+            inc_cnt_msb;
+        end
+    endtask
+    
+    // Advances the MSB word counter. On reaching word_index_last the counter
+    // restarts from zero and the sticky "wrapped" flag is raised; otherwise
+    // the counter increments and the flag keeps its current value.
+    task inc_cnt_msb;
+        begin
+            if (cnt_msb == word_index_last) begin
+                cnt_msb         <= 8'd0;
+                cnt_msb_wrapped <= 1'b1;
+            end else begin
+                cnt_msb         <= cnt_msb + 1'b1;
+                cnt_msb_wrapped <= cnt_msb_wrapped;
+            end
+        end
+    endtask
+    
+    task clr_cnt_lsb;
+        begin
+            _set_cnt_lsb(8'd0, 1'b0);
+        end
+    endtask
+    
+    task clr_cnt_msb;
+        begin
+            _set_cnt_msb(8'd0, 1'b0);
+        end
+    endtask
+    
+   
+
+    wire [1:0] rcmb_xy_valid = {rcmb_xy_msb_valid, rcmb_xy_lsb_valid}; 
+    
+    // Recombination sequencer. ena resets both word counters and their wrap
+    // flags; while the block is not ready, the task selected by rcmb_mode runs
+    // every clock. There is no default branch, so mode 2'd0 does nothing.
+    always @(posedge clk)
+        //
+        if (ena) begin
+            clr_cnt_lsb();
+            clr_cnt_msb();
+        end else if (!rdy)
+            //
+            case (rcmb_mode)
+                2'd1: recombine_square();
+                2'd2: recombine_triangle();
+                2'd3: recombine_rectangle();
+            endcase
+           
+    wire [17:0] rcmb_x_lsb_dout_pad = {2'b00, rcmb_x_lsb_dout};
+    wire [17:0] rcmb_y_lsb_dout_pad = {2'b00, rcmb_y_lsb_dout};
+
+    wire [17:0] rcmb_x_msb_dout_pad = {2'b00, rcmb_x_msb_dout};
+    wire [17:0] rcmb_y_msb_dout_pad = {2'b00, rcmb_y_msb_dout};
+    
+    wire [17:0] rcmb_x_msb_delay_2_pad = {2'b00, rcmb_xy_msb_delay_2[15: 0]};
+    wire [17:0] rcmb_y_msb_delay_2_pad = {2'b00, rcmb_xy_msb_delay_2[31:16]};
+
+    wire [17:0] rcmb_x_lsb_msb_carry_1_pad = {1'b0, {1'b0, rcmb_x_lsb_dout} + {1'b0, rcmb_xy_msb_carry_1[15: 0]}};
+    wire [17:0] rcmb_y_lsb_msb_carry_1_pad = {1'b0, {1'b0, rcmb_y_lsb_dout} + {1'b0, rcmb_xy_msb_carry_1[31:16]}};
+           
+           
+    // One clock of square-mode recombination, steered by rcmb_xy_valid
+    // ({msb_valid, lsb_valid}). Three independent decisions are taken:
+    // which word counters advance, what (if anything) is written to the wide
+    // bank, and how the MSB carry / delay registers are updated.
+    task recombine_square;
+        //
+        begin
+            //
+            // 1) advance the counter of every path that produced a word
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b01: inc_cnt_lsb; 
+                2'b10: inc_cnt_msb;
+                2'b11: inc_cnt_both;
+                //
+            endcase            
+            //
+            // 2) wide-bank write:
+            //    00 - drain the oldest delay-line entry into ABH if occupied
+            //    01 - LSB word goes to ABL
+            //    10 - MSB words 0 and 1 are not written (they go to the carry
+            //         registers below), later MSB words go to ABH
+            //    11 - LSB word goes to ABL until the LSB counter has wrapped,
+            //         afterwards LSB word plus carry entry 1 goes to ABH
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b00: if (rcmb_msb_flag_delay_2) set_wide(BANK_WIDE_ABH, rcmb_msb_cnt_delay_2, rcmb_x_msb_delay_2_pad, rcmb_y_msb_delay_2_pad);
+                       else                       clear_wide;
+                //
+                2'b01:                            set_wide(BANK_WIDE_ABL, cnt_lsb, rcmb_x_lsb_dout_pad, rcmb_y_lsb_dout_pad);
+                //
+                2'b10: if (cnt_msb < 8'd2)        clear_wide;                        
+                       else                       set_wide(BANK_WIDE_ABH, cnt_msb, rcmb_x_msb_dout_pad, rcmb_y_msb_dout_pad);
+                //
+                2'b11: if (cnt_lsb_wrapped)       set_wide(BANK_WIDE_ABH, cnt_lsb, rcmb_x_lsb_msb_carry_1_pad, rcmb_y_lsb_msb_carry_1_pad); 
+                       else                       set_wide(BANK_WIDE_ABL, cnt_lsb, rcmb_x_lsb_dout_pad,        rcmb_y_lsb_dout_pad);
+                //
+            endcase            
+            //
+            // 3) MSB bookkeeping:
+            //    00 - pop the delay line (push an empty entry) while draining
+            //    10 - store MSB words 0 and 1 in the carry registers
+            //    11 - park the MSB word in the delay line; once the LSB counter
+            //         has wrapped also consume one carry entry
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b00: if (rcmb_msb_flag_delay_2) advance_rcmb_msb_delay(16'hXXXX, 16'hXXXX, 8'd0, 1'b0);
+                2'b10: if (cnt_msb < 8'd2)        shift_rcmb_msb_carry(rcmb_x_msb_dout, rcmb_y_msb_dout);
+                //
+                2'b11: begin                      advance_rcmb_msb_delay(rcmb_x_msb_dout, rcmb_y_msb_dout, cnt_msb, 1'b1);
+                       if (cnt_lsb_wrapped)       shift_rcmb_msb_carry({16{1'bX}}, {16{1'bX}});
+                       end
+                //
+            endcase
+            //        
+        end
+        //
+    endtask
+    
+    
+    // One clock of triangle-mode recombination. Only the LSB path is used:
+    // each LSB word is written to the narrow bank Q at the current LSB count
+    // until the counter has wrapped; any LSB word after that goes to address 1
+    // of the narrow EXT bank. No narrow write is issued for any other value
+    // of rcmb_xy_valid.
+    task recombine_triangle;
+        //
+        begin
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b01: inc_cnt_lsb(); 
+               //
+            endcase            
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b00:                        clear_narrow;
+                2'b01:  if (!cnt_lsb_wrapped) set_narrow(BANK_NARROW_Q,   cnt_lsb, rcmb_x_lsb_dout_pad, rcmb_y_lsb_dout_pad); 
+                        else                  set_narrow(BANK_NARROW_EXT, 8'd1,    rcmb_x_lsb_dout_pad, rcmb_y_lsb_dout_pad);
+                2'b10:                        clear_narrow;
+                2'b11:                        clear_narrow;
+                //
+            endcase
+            //        
+        end
+        //
+    endtask
+
+
+    // One clock of rectangle-mode recombination, steered by rcmb_xy_valid
+    // ({msb_valid, lsb_valid}). Results are sent to the reductor port rather
+    // than to the wide bank. Same three-step structure as the square mode:
+    // counters, output word, MSB carry / delay bookkeeping.
+    task recombine_rectangle;
+        //
+        begin
+            //
+            // 1) advance the counter of every path that produced a word
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b01:  inc_cnt_lsb; 
+                2'b10:  inc_cnt_msb;
+                2'b11:  inc_cnt_both;
+                //
+            endcase
+            //
+            // 2) reductor output:
+            //    00 - drain the oldest delay-line entry into MH if occupied
+            //    01 - LSB word goes to ML
+            //    10 - before the MSB counter wraps: MSB words 0 and 1 are not
+            //         output, later ones go to MH; after the wrap the MSB word
+            //         goes to address 0 of EXT
+            //    11 - LSB word plus carry entry 1 goes to MH
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b00:  if (rcmb_msb_flag_delay_2)  set_rdct(BANK_RCMB_MH, rcmb_msb_cnt_delay_2, rcmb_x_msb_delay_2_pad, rcmb_y_msb_delay_2_pad);
+                        else                        clear_rdct;
+                2'b01:                              set_rdct(BANK_RCMB_ML, cnt_lsb, rcmb_x_lsb_dout_pad, rcmb_y_lsb_dout_pad); 
+                2'b10:  if (!cnt_msb_wrapped) begin 
+                            if (cnt_msb < 8'd2)     clear_rdct;                        
+                            else                    set_rdct(BANK_RCMB_MH, cnt_msb, rcmb_x_msb_dout_pad, rcmb_y_msb_dout_pad);
+                        end else                    set_rdct(BANK_RCMB_EXT, 8'd0, rcmb_x_msb_dout_pad, rcmb_y_msb_dout_pad);
+                            
+                2'b11:  set_rdct(BANK_RCMB_MH, cnt_lsb, rcmb_x_lsb_msb_carry_1_pad, rcmb_y_lsb_msb_carry_1_pad); 
+                //
+            endcase            
+            //
+            // 3) MSB bookkeeping:
+            //    00 - pop the delay line (push an empty entry) while draining
+            //    10 - before the wrap store MSB words 0 and 1 in the carry
+            //         registers; after the wrap push an empty delay-line entry
+            //    11 - park the MSB word in the delay line and consume one
+            //         carry entry
+            //
+            case (rcmb_xy_valid)
+                //
+                2'b00:  if (rcmb_msb_flag_delay_2)  advance_rcmb_msb_delay(16'hXXXX, 16'hXXXX, 8'd0, 1'b0);
+                2'b10:  begin 
+                            if ((cnt_msb < 8'd2) && !cnt_msb_wrapped) shift_rcmb_msb_carry(rcmb_x_msb_dout, rcmb_y_msb_dout);
+                            if (cnt_msb_wrapped) advance_rcmb_msb_delay(16'hXXXX, 16'hXXXX, 8'd0, 1'b0);
+                        end
+                //
+                2'b11:  begin  advance_rcmb_msb_delay(rcmb_x_msb_dout, rcmb_y_msb_dout, cnt_msb, 1'b1);
+                                   shift_rcmb_msb_carry({16{1'bX}}, {16{1'bX}});
+                        end
+                //
+            endcase
+            //
+        end
+        //
+    endtask
+    
+    
+    //
+    // Early "ready" flag; rdy_reg follows it one cycle later. Cleared by ena
+    // and only updated while the block is busy:
+    //  - square / rectangle (modes 1 and 3): while the delay line is being
+    //    drained with no recombinator output valid, ready is raised as soon
+    //    as the entry behind the oldest one is empty;
+    //  - triangle (mode 2): on every LSB word ready mirrors the LSB wrap flag.
+    //
+    always @(posedge clk)
+        //
+        if (ena)
+            rdy_adv <= 1'b0;
+        else if (!rdy_reg)
+            //
+            case (rcmb_mode)
+                //
+                2'd1,
+                2'd3: begin
+                    if (rcmb_xy_valid == 2'b00) begin
+                        if (rcmb_msb_flag_delay_2)
+                            rdy_adv <= ~rcmb_msb_flag_delay_1;
+                    end
+                end
+                //
+                2'd2: begin
+                    if (rcmb_xy_valid == 2'b01)
+                        rdy_adv <= cnt_lsb_wrapped;
+                end
+                //
+            endcase
+
+
     
+        // add ready for mode=3
 endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_cell.v
similarity index 95%
copy from rtl/modexpng_recombinator_block.v
copy to rtl/modexpng_recombinator_cell.v
index efe0ac5..1ecf56a 100644
--- a/rtl/modexpng_recombinator_block.v
+++ b/rtl/modexpng_recombinator_cell.v
@@ -1,4 +1,4 @@
-module modexpng_recombinator_block
+module modexpng_recombinator_cell
 (
     clk,
     ce, clr,
diff --git a/rtl/modexpng_reductor.v b/rtl/modexpng_reductor.v
new file mode 100644
index 0000000..0f5e461
--- /dev/null
+++ b/rtl/modexpng_reductor.v
@@ -0,0 +1,270 @@
+module modexpng_reductor
+(
+    clk, rst,
+    ena, rdy,
+    //fsm_state_next,
+    word_index_last,
+    //dsp_xy_ce_p,
+    //dsp_x_p, dsp_y_p,
+    //col_index, col_index_last,
+    rd_wide_xy_addr_aux, rd_wide_xy_bank_aux, rd_wide_x_dout_aux, rd_wide_y_dout_aux,
+    //rcmb_wide_xy_bank,   rcmb_wide_xy_addr,   rcmb_wide_x_dout,   rcmb_wide_y_dout,   rcmb_wide_xy_valid,
+    rcmb_final_xy_bank, rcmb_final_xy_addr, rcmb_final_x_dout, rcmb_final_y_dout, rcmb_final_xy_valid,
+                        rdct_final_xy_addr, rdct_final_x_dout, rdct_final_y_dout, rdct_final_xy_valid
+);
+
+
+    //
+    // Headers
+    //
+    //`include "../rtl_1/modexpng_mmm_fsm.vh"
+    `include "../rtl_1/modexpng_parameters_old.vh"
+    //`include "../rtl_1/modexpng_parameters_x8.vh"
+
+
+    input                        clk;
+    input                        rst;
+    input                        ena;
+    output                       rdy;
+    /*
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;*/
+    input [7:0]                  word_index_last;/*
+    input                        dsp_xy_ce_p;
+    *//*
+    input  [9*47-1:0] dsp_x_p;
+    input  [9*47-1:0] dsp_y_p;
+    input  [     4:0] col_index;
+    input  [     4:0] col_index_last;
+    *//*
+    input  [     7:0] rd_narrow_xy_addr;
+    input  [     1:0] rd_narrow_xy_bank;
+    */
+    input  [     1:0] rd_wide_xy_bank_aux;
+    input  [     7:0] rd_wide_xy_addr_aux;
+    input  [    17:0] rd_wide_x_dout_aux;
+    input  [    17:0] rd_wide_y_dout_aux;
+    //
+    input  [     1:0] rcmb_final_xy_bank;
+    input  [     7:0] rcmb_final_xy_addr;
+    input  [    17:0] rcmb_final_x_dout;
+    input  [    17:0] rcmb_final_y_dout;
+    input             rcmb_final_xy_valid;
+
+    output [     7:0] rdct_final_xy_addr;
+    output [    17:0] rdct_final_x_dout;
+    output [    17:0] rdct_final_y_dout;
+    output            rdct_final_xy_valid;
+
+
+    //
+    // Ready
+    //
+    reg rdy_reg  = 1'b1;
+    reg busy_now = 1'b0;
+
+    assign rdy = rdy_reg;
+    
+    // Ready handshake: rdy drops on the clock after ena is sampled while
+    // ready, and is raised again once the block is not ready and busy_now is
+    // low. Both conditions cannot hold in the same cycle, since the first
+    // needs rdy high and the second needs it low.
+    // NOTE(review): within this module busy_now is set on ena and is only
+    // cleared by rst, so after the first operation rdy appears to stay low
+    // until reset -- confirm whether a completion condition is still missing.
+    always @(posedge clk)
+        //
+        if (rst) rdy_reg <= 1'b1;
+        else begin
+            if (rdy && ena) rdy_reg <= 1'b0;
+            if (!rdy && !busy_now) rdy_reg <= 1'b1;
+        end
+        
+        
+
+    //
+    // Pipeline (Delay Match)
+    //
+    reg rcmb_xy_valid_dly1 = 1'b0;
+    reg rcmb_xy_valid_dly2 = 1'b0;
+    reg rcmb_xy_valid_dly3 = 1'b0;
+
+    // Bank delay registers: same width as the 2-bit rcmb_final_xy_bank port
+    // they are loaded from (a wider declaration only adds a constant-zero bit
+    // and a width-mismatch warning on every stage).
+    reg [1:0] rcmb_xy_bank_dly1;
+    reg [1:0] rcmb_xy_bank_dly2;
+    reg [1:0] rcmb_xy_bank_dly3;
+
+    reg [7:0] rcmb_xy_addr_dly1;
+    reg [7:0] rcmb_xy_addr_dly2;
+    reg [7:0] rcmb_xy_addr_dly3;
+
+    reg [17:0] rcmb_x_dout_dly1;
+    reg [17:0] rcmb_x_dout_dly2;
+    reg [17:0] rcmb_x_dout_dly3;
+
+    reg [17:0] rcmb_y_dout_dly1;
+    reg [17:0] rcmb_y_dout_dly2;
+    reg [17:0] rcmb_y_dout_dly3;
+    
+    // Three-stage delay of the incoming valid strobe, synchronously cleared
+    // by rst; stage 3 qualifies the carry and reduction logic below.
+    always @(posedge clk)
+        //
+        if (rst)
+            {rcmb_xy_valid_dly3, rcmb_xy_valid_dly2, rcmb_xy_valid_dly1} <= 3'b000;
+        else
+            {rcmb_xy_valid_dly3, rcmb_xy_valid_dly2, rcmb_xy_valid_dly1} <=
+                {rcmb_xy_valid_dly2, rcmb_xy_valid_dly1, rcmb_final_xy_valid};
+    
+    
+    // Three-stage delay of bank, address and data, matching the valid delay
+    // chain. Each stage captures only when the valid flag feeding it is set,
+    // so the registers hold their contents during idle cycles. There is no
+    // reset: the contents are only meaningful together with the valid flags.
+    always @(posedge clk) begin
+        //
+        if (rcmb_final_xy_valid) begin
+            rcmb_xy_bank_dly1 <= rcmb_final_xy_bank;
+            rcmb_xy_addr_dly1 <= rcmb_final_xy_addr;
+            rcmb_x_dout_dly1  <= rcmb_final_x_dout;
+            rcmb_y_dout_dly1  <= rcmb_final_y_dout;
+        end
+        //
+        if (rcmb_xy_valid_dly1) begin
+            rcmb_xy_bank_dly2 <= rcmb_xy_bank_dly1;
+            rcmb_xy_addr_dly2 <= rcmb_xy_addr_dly1;
+            rcmb_x_dout_dly2  <= rcmb_x_dout_dly1;
+            rcmb_y_dout_dly2  <= rcmb_y_dout_dly1;
+        end
+        //
+        if (rcmb_xy_valid_dly2) begin
+            rcmb_xy_bank_dly3 <= rcmb_xy_bank_dly2;
+            rcmb_xy_addr_dly3 <= rcmb_xy_addr_dly2;
+            rcmb_x_dout_dly3  <= rcmb_x_dout_dly2;
+            rcmb_y_dout_dly3  <= rcmb_y_dout_dly2;
+        end
+        //
+    end
+        
+    
+    reg [ 1:0] rcmb_x_lsb_carry;
+    reg [15:0] rcmb_x_lsb_dummy;
+    reg [17:0] rcmb_x_lsb_dout;
+
+    reg [ 1:0] rcmb_y_lsb_carry;
+    reg [15:0] rcmb_y_lsb_dummy;
+    reg [17:0] rcmb_y_lsb_dout;
+
+    //reg [17:0] reductor_fat_bram_x_msb_dout;
+    //reg        reductor_fat_bram_x_msb_dout_valid = 1'b0;
+    //reg [ 7:0] reductor_fat_bram_x_msb_addr;
+
+    //
+    // Carry Computation
+    //
+    // Running carry, cleared by ena. For every ML word, and for the MH word
+    // at address 0, the delayed recombinator word, the aux wide read data and
+    // the previous carry are added; bits [17:16] of that sum become the new
+    // carry and bits [15:0] land in the *_dummy registers, which are not
+    // used anywhere else in this module. All other words leave the carry
+    // untouched.
+    always @(posedge clk)
+        //
+        if (ena) begin
+            rcmb_x_lsb_carry <= 2'b00;
+            rcmb_y_lsb_carry <= 2'b00;
+        end else if (rcmb_xy_valid_dly3)
+            //
+            case (rcmb_xy_bank_dly3)    
+        
+                BANK_RCMB_ML: begin
+                    {rcmb_x_lsb_carry, rcmb_x_lsb_dummy} <= rcmb_x_dout_dly3 + rd_wide_x_dout_aux + rcmb_x_lsb_carry;
+                    {rcmb_y_lsb_carry, rcmb_y_lsb_dummy} <= rcmb_y_dout_dly3 + rd_wide_y_dout_aux + rcmb_y_lsb_carry;
+                end
+                    
+                BANK_RCMB_MH:
+                    if (rcmb_xy_addr_dly3 == 8'd0) begin
+                        {rcmb_x_lsb_carry, rcmb_x_lsb_dummy} <= rcmb_x_dout_dly3 + rd_wide_x_dout_aux + rcmb_x_lsb_carry;
+                        {rcmb_y_lsb_carry, rcmb_y_lsb_dummy} <= rcmb_y_dout_dly3 + rd_wide_y_dout_aux + rcmb_y_lsb_carry;
+                    end
+                    
+            endcase
+
+
+    //
+    // Reduction
+    //
+    reg [     7:0] rdct_xy_addr;
+    reg [    17:0] rdct_x_dout;
+    reg [    17:0] rdct_y_dout;
+    reg            rdct_xy_valid = 1'b0;
+
+    assign rdct_final_xy_addr  = rdct_xy_addr;
+    assign rdct_final_x_dout   = rdct_x_dout;
+    assign rdct_final_y_dout   = rdct_y_dout;
+    assign rdct_final_xy_valid = rdct_xy_valid;
+
+    task _update_rdct;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        input        valid;
+        begin
+            rdct_xy_addr  <= addr;
+            rdct_x_dout   <= dout_x;
+            rdct_y_dout   <= dout_y;
+            rdct_xy_valid <= valid;
+        end
+    endtask
+    
+    task set_rdct;
+        input [ 7:0] addr;
+        input [17:0] dout_x;
+        input [17:0] dout_y;
+        begin
+            _update_rdct(addr, dout_x, dout_y, 1'b1);
+        end
+    endtask
+    
+    task clear_rdct;
+        begin
+            _update_rdct(8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0);
+        end
+    endtask
+    
+    
+    //
+    //
+    //
+    wire [17:0] sum_rdct_x = rcmb_x_dout_dly3 + rd_wide_x_dout_aux;
+    wire [17:0] sum_rdct_y = rcmb_y_dout_dly3 + rd_wide_y_dout_aux;
+    
+    wire [17:0] sum_rdct_x_carry = sum_rdct_x + {16'h0000, rcmb_x_lsb_carry};
+    wire [17:0] sum_rdct_y_carry = sum_rdct_y + {16'h0000, rcmb_y_lsb_carry};
+    
+    
+    //
+    //
+    //
+    // Reduced-word output. The output is invalidated by default every clock;
+    // a set_rdct call later in the same clock overrides that default because
+    // the last non-blocking assignment to a register wins. While busy:
+    //  - MH word at address 1 is stored at address 0 as sum plus carry;
+    //  - MH words at higher addresses are stored one address lower as the
+    //    plain sum;
+    //  - MH word at address 0 produces no output here (it only feeds the
+    //    carry computation);
+    //  - the EXT word is stored unmodified at address word_index_last.
+    always @(posedge clk)
+        //
+        if (rst) clear_rdct;
+        else begin
+            //
+            clear_rdct;
+            //
+            if (busy_now && rcmb_xy_valid_dly3)
+                //
+                case (rcmb_xy_bank_dly3)
+                                    
+                    BANK_RCMB_MH:
+                        if (rcmb_xy_addr_dly3 == 8'd1)
+                            set_rdct(8'd0, sum_rdct_x_carry, sum_rdct_y_carry);
+                        else if (rcmb_xy_addr_dly3 > 8'd1)
+                            set_rdct(rcmb_xy_addr_dly3 - 1'b1, sum_rdct_x, sum_rdct_y);
+                            
+                    BANK_RCMB_EXT:
+                        set_rdct(word_index_last, rcmb_x_dout_dly3, rcmb_y_dout_dly3);
+
+                endcase
+            //
+        end
+
+
+
+    //
+    // Busy
+    //
+    // busy_now is raised when ena is sampled while the block is ready.
+    // NOTE(review): nothing except rst clears busy_now at present (the only
+    // other statement is commented out), so the rdy register, which waits for
+    // busy_now to go low, cannot return high without a reset -- confirm the
+    // intended completion condition.
+    always @(posedge clk)
+        //
+        if (rst) busy_now <= 1'b0;
+        else begin
+            if (rdy && ena) busy_now <= 1'b1;
+            //if (!rdy && !busy_now) rdy <= 1'b1;
+        end
+
+    
+endmodule
diff --git a/rtl/modexpng_storage_block.v b/rtl/modexpng_storage_block.v
new file mode 100644
index 0000000..d6f9fb1
--- /dev/null
+++ b/rtl/modexpng_storage_block.v
@@ -0,0 +1,226 @@
+module modexpng_storage_block
+(
+    clk, rst,
+    // write side shared by every "wide" lane and by the auxiliary memories
+    wr_wide_xy_ena,
+    wr_wide_xy_bank,
+    wr_wide_xy_addr,
+    wr_wide_x_din,
+    wr_wide_y_din,
+    // write side of the "narrow" memories
+    wr_narrow_xy_ena,
+    wr_narrow_xy_bank,
+    wr_narrow_xy_addr,
+    wr_narrow_x_din,
+    wr_narrow_y_din,
+    // read side: NUM_MULTS/2 "wide" lanes plus one auxiliary (_aux) port
+    rd_wide_xy_ena,
+    rd_wide_xy_ena_aux,
+    rd_wide_xy_bank,
+    rd_wide_xy_bank_aux,
+    rd_wide_xy_addr,
+    rd_wide_xy_addr_aux,
+    rd_wide_x_dout,
+    rd_wide_y_dout,
+    rd_wide_x_dout_aux,
+    rd_wide_y_dout_aux,
+    // read side of the "narrow" memories
+    rd_narrow_xy_ena,
+    rd_narrow_xy_bank,
+    rd_narrow_xy_addr,
+    rd_narrow_x_dout,
+    rd_narrow_y_dout
+);
+    // Storage block: X/Y pairs of ip_bram_18k memories, all clocked by clk;
+    // port A of every memory is its write side, port B its read side.
+    //
+    // Headers
+    //
+    `include "../rtl_1/modexpng_parameters_x8_old.vh"
+    // NOTE(review): NUM_MULTS is not declared in this file - presumably it comes from the header above.
+
+    //
+    // Ports
+    //
+    input         clk;
+    input           rst;
+
+    input         wr_wide_xy_ena;
+    input  [ 1:0] wr_wide_xy_bank;
+    input  [ 7:0] wr_wide_xy_addr;
+    input  [17:0] wr_wide_x_din;
+    input  [17:0] wr_wide_y_din;
+    
+    input         wr_narrow_xy_ena;
+    input  [ 1:0] wr_narrow_xy_bank;
+    input  [ 7:0] wr_narrow_xy_addr;
+    input  [17:0] wr_narrow_x_din;
+    input  [17:0] wr_narrow_y_din;
+    // rd_wide_xy_addr / rd_wide_*_dout pack one 8-bit address / one 18-bit word per lane
+    input                     rd_wide_xy_ena;
+    input                     rd_wide_xy_ena_aux;
+    input  [             1:0] rd_wide_xy_bank;
+    input  [             1:0] rd_wide_xy_bank_aux;
+    input  [ 8*NUM_MULTS/2-1:0] rd_wide_xy_addr;
+    input  [           8-1:0] rd_wide_xy_addr_aux;
+    output [18*NUM_MULTS/2-1:0] rd_wide_x_dout;
+    output [18*NUM_MULTS/2-1:0] rd_wide_y_dout;
+    output [          18-1:0] rd_wide_x_dout_aux;
+    output [          18-1:0] rd_wide_y_dout_aux;
+    
+    input                     rd_narrow_xy_ena;
+    input  [             1:0] rd_narrow_xy_bank;
+    input  [ 7:0] rd_narrow_xy_addr;
+    output [18-1:0] rd_narrow_x_dout;
+    output [18-1:0] rd_narrow_y_dout;
+
+    
+    //
+    // Internal Registers
+    // (each read enable delayed by one clock; wired to the .regceb pins below)
+    reg rd_wide_xy_reg_ena     = 1'b0;
+    reg rd_wide_xy_reg_ena_aux = 1'b0;
+    reg rd_narrow_xy_reg_ena   = 1'b0;
+    // NOTE(review): .regceb is presumably the port-B output register clock enable - confirm against the ip_bram_18k core
+    always @(posedge clk) begin
+        // synchronous reset forces the delayed enables low
+        rd_wide_xy_reg_ena     <= rst ? 1'b0 : rd_wide_xy_ena;
+        rd_wide_xy_reg_ena_aux <= rst ? 1'b0 : rd_wide_xy_ena_aux;
+        rd_narrow_xy_reg_ena   <= rst ? 1'b0 : rd_narrow_xy_ena;
+        //
+    end
+
+    
+    //
+    // Helper Signals
+    // (memory offset = {2-bit bank, 8-bit word address})
+    wire [2+8-1:0] wr_wide_xy_offset;
+    wire [2+8-1:0] rd_wide_xy_offset[0:NUM_MULTS/2-1];
+    wire [2+8-1:0] rd_wide_xy_offset_aux;
+    wire [2+8-1:0] wr_narrow_xy_offset;
+    wire [2+8-1:0] rd_narrow_xy_offset;
+
+    assign wr_wide_xy_offset     = {wr_wide_xy_bank,     wr_wide_xy_addr};
+    assign rd_wide_xy_offset_aux = {rd_wide_xy_bank_aux, rd_wide_xy_addr_aux};
+    assign wr_narrow_xy_offset   = {wr_narrow_xy_bank,   wr_narrow_xy_addr};
+    assign rd_narrow_xy_offset   = {rd_narrow_xy_bank,   rd_narrow_xy_addr};
+    
+
+    //
+    // "Wide" Storage
+    // (all lanes are written identically; each lane is read at its own address)
+    genvar z;
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        begin : gen_wide_bram
+            // lane z: shared read bank, own 8-bit slice of rd_wide_xy_addr
+            assign rd_wide_xy_offset[z] = {rd_wide_xy_bank, rd_wide_xy_addr[8*z+:8]};
+            //
+            ip_bram_18k wide_bram_x
+            (
+                .clka   (clk),
+                .clkb   (clk),
+                
+                .ena    (wr_wide_xy_ena),
+                .wea    (wr_wide_xy_ena),
+                .addra  (wr_wide_xy_offset),
+                .dina   (wr_wide_x_din),
+                
+                .enb    (rd_wide_xy_ena),
+                .regceb (rd_wide_xy_reg_ena),
+                .addrb  (rd_wide_xy_offset[z]),
+                .doutb  (rd_wide_x_dout[18*z+:18])
+            );
+            //
+            ip_bram_18k wide_bram_y
+            (
+                .clka   (clk),
+                .clkb   (clk),
+
+                .ena    (wr_wide_xy_ena),
+                .wea    (wr_wide_xy_ena),
+                .addra  (wr_wide_xy_offset),
+                .dina   (wr_wide_y_din),
+            
+                .enb    (rd_wide_xy_ena),
+                .regceb (rd_wide_xy_reg_ena),
+                .addrb  (rd_wide_xy_offset[z]),
+                .doutb  (rd_wide_y_dout[18*z+:18])
+            );
+            //
+        end
+    endgenerate
+
+    
+    //
+    // Auxiliary Storage
+    // (written in parallel with the "wide" lanes, read through the _aux port)
+    ip_bram_18k wide_bram_x_aux
+    (
+        .clka   (clk),
+        .clkb   (clk),
+
+        .ena    (wr_wide_xy_ena),
+        .wea    (wr_wide_xy_ena),
+        .addra  (wr_wide_xy_offset),
+        .dina   (wr_wide_x_din),
+
+        .enb    (rd_wide_xy_ena_aux),
+        .regceb (rd_wide_xy_reg_ena_aux),
+        .addrb  (rd_wide_xy_offset_aux),
+        .doutb  (rd_wide_x_dout_aux)
+    );
+    //
+    ip_bram_18k wide_bram_y_aux
+    (
+        .clka   (clk),
+        .clkb   (clk),
+
+        .ena    (wr_wide_xy_ena),
+        .wea    (wr_wide_xy_ena),
+        .addra  (wr_wide_xy_offset),
+        .dina   (wr_wide_y_din),
+
+        .enb    (rd_wide_xy_ena_aux),
+        .regceb (rd_wide_xy_reg_ena_aux),
+        .addrb  (rd_wide_xy_offset_aux),
+        .doutb  (rd_wide_y_dout_aux)
+    );
+
+            
+    //
+    // "Narrow" Storage
+    // (one X/Y pair with its own write and read ports)
+    ip_bram_18k narrow_bram_x
+    (
+        .clka   (clk),
+        .clkb   (clk),
+
+        .ena    (wr_narrow_xy_ena),
+        .wea    (wr_narrow_xy_ena),
+        .addra  (wr_narrow_xy_offset),
+        .dina   (wr_narrow_x_din),
+    
+        .enb    (rd_narrow_xy_ena),
+        .regceb (rd_narrow_xy_reg_ena),
+        .addrb  (rd_narrow_xy_offset),
+        .doutb  (rd_narrow_x_dout)
+    );
+
+    ip_bram_18k narrow_bram_y
+    (
+        .clka   (clk),
+        .clkb   (clk),
+
+        .ena    (wr_narrow_xy_ena),
+        .wea    (wr_narrow_xy_ena),
+        .addra  (wr_narrow_xy_offset),
+        .dina   (wr_narrow_y_din),
+    
+        .enb    (rd_narrow_xy_ena),
+        .regceb (rd_narrow_xy_reg_ena),
+        .addrb  (rd_narrow_xy_offset),
+        .doutb  (rd_narrow_y_dout)
+    );
+
+
+endmodule
diff --git a/rtl/modexpng_storage_manager.v b/rtl/modexpng_storage_manager.v
new file mode 100644
index 0000000..fa1e4a1
--- /dev/null
+++ b/rtl/modexpng_storage_manager.v
@@ -0,0 +1,200 @@
+module modexpng_storage_manager
+(
+    // clock and synchronous reset
+    input         clk,
+    input         rst,
+
+    // registered write bus towards the "wide" storage
+    output        wr_wide_xy_ena,
+    output [ 1:0] wr_wide_xy_bank,
+    output [ 7:0] wr_wide_xy_addr,
+    output [17:0] wr_wide_x_din,
+    output [17:0] wr_wide_y_din,
+
+    // registered write bus towards the "narrow" storage
+    output        wr_narrow_xy_ena,
+    output [ 1:0] wr_narrow_xy_bank,
+    output [ 7:0] wr_narrow_xy_addr,
+    output [17:0] wr_narrow_x_din,
+    output [17:0] wr_narrow_y_din,
+
+    // "ext" write request, wide channel
+    input         ext_wide_xy_ena,
+    input  [ 1:0] ext_wide_xy_bank,
+    input  [ 7:0] ext_wide_xy_addr,
+    input  [17:0] ext_wide_x_din,
+    input  [17:0] ext_wide_y_din,
+
+    // "ext" write request, narrow channel
+    input         ext_narrow_xy_ena,
+    input  [ 1:0] ext_narrow_xy_bank,
+    input  [ 7:0] ext_narrow_xy_addr,
+    input  [17:0] ext_narrow_x_din,
+    input  [17:0] ext_narrow_y_din,
+
+    // "rcmb" write request, wide channel
+    input         rcmb_wide_xy_ena,
+    input  [ 1:0] rcmb_wide_xy_bank,
+    input  [ 7:0] rcmb_wide_xy_addr,
+    input  [17:0] rcmb_wide_x_din,
+    input  [17:0] rcmb_wide_y_din,
+
+    // "rcmb" write request, narrow channel
+    input         rcmb_narrow_xy_ena,
+    input  [ 1:0] rcmb_narrow_xy_bank,
+    input  [ 7:0] rcmb_narrow_xy_addr,
+    input  [17:0] rcmb_narrow_x_din,
+    input  [17:0] rcmb_narrow_y_din
+);
+
+
+    //
+    // Headers
+    //
+    // NOTE(review): nothing declared in this header is referenced by name in
+    // this module - confirm whether the include is still needed.
+    //
+    `include "../rtl_1/modexpng_parameters_x8_old.vh"
+
+
+    //
+    // Operation
+    //
+    // Merges two write requesters ("ext" and "rcmb") onto one registered
+    // write bus, independently for the "wide" and the "narrow" channel.
+    // On every rising clock edge each channel takes the first matching row:
+    //
+    //   rst high           -> enable low,  payload don't-care (X)
+    //   "ext" enable high  -> enable high, "ext" bank/address/data
+    //   "rcmb" enable high -> enable high, "rcmb" bank/address/data
+    //   otherwise          -> enable low,  payload don't-care (X)
+    //
+    // A "rcmb" request that coincides with an "ext" request on the same
+    // channel is not buffered, it is simply not written.
+    //
+    // All outputs are registered, so a request shows up on the write bus
+    // one clock after it was presented.
+    //
+
+
+    //
+    // Output Registers
+    //
+    // Only the enables carry an initial value, the payload registers are
+    // left uninitialised.
+    //
+    reg        wr_wide_xy_ena_reg = 1'b0;
+    reg [ 1:0] wr_wide_xy_bank_reg;
+    reg [ 7:0] wr_wide_xy_addr_reg;
+    reg [17:0] wr_wide_x_din_reg;
+    reg [17:0] wr_wide_y_din_reg;
+
+    reg        wr_narrow_xy_ena_reg = 1'b0;
+    reg [ 1:0] wr_narrow_xy_bank_reg;
+    reg [ 7:0] wr_narrow_xy_addr_reg;
+    reg [17:0] wr_narrow_x_din_reg;
+    reg [17:0] wr_narrow_y_din_reg;
+
+
+    //
+    // Output Mapping
+    //
+    assign wr_wide_xy_ena  = wr_wide_xy_ena_reg;
+    assign wr_wide_xy_bank = wr_wide_xy_bank_reg;
+    assign wr_wide_xy_addr = wr_wide_xy_addr_reg;
+    assign wr_wide_x_din   = wr_wide_x_din_reg;
+    assign wr_wide_y_din   = wr_wide_y_din_reg;
+
+    assign wr_narrow_xy_ena  = wr_narrow_xy_ena_reg;
+    assign wr_narrow_xy_bank = wr_narrow_xy_bank_reg;
+    assign wr_narrow_xy_addr = wr_narrow_xy_addr_reg;
+    assign wr_narrow_x_din   = wr_narrow_x_din_reg;
+    assign wr_narrow_y_din   = wr_narrow_y_din_reg;
+
+
+    //
+    // "Wide" Channel
+    //
+    always @(posedge clk)
+        //
+        if (rst) begin
+            // reset: no write
+            wr_wide_xy_ena_reg  <= 1'b0;
+            wr_wide_xy_bank_reg <= 2'bXX;
+            wr_wide_xy_addr_reg <= 8'hXX;
+            wr_wide_x_din_reg   <= {18{1'bX}};
+            wr_wide_y_din_reg   <= {18{1'bX}};
+        end
+        else begin
+            //
+            if (ext_wide_xy_ena) begin
+                // "ext" request is served first
+                wr_wide_xy_ena_reg  <= 1'b1;
+                wr_wide_xy_bank_reg <= ext_wide_xy_bank;
+                wr_wide_xy_addr_reg <= ext_wide_xy_addr;
+                wr_wide_x_din_reg   <= ext_wide_x_din;
+                wr_wide_y_din_reg   <= ext_wide_y_din;
+            end
+            else if (rcmb_wide_xy_ena) begin
+                // "rcmb" request, served only while "ext" is idle
+                wr_wide_xy_ena_reg  <= 1'b1;
+                wr_wide_xy_bank_reg <= rcmb_wide_xy_bank;
+                wr_wide_xy_addr_reg <= rcmb_wide_xy_addr;
+                wr_wide_x_din_reg   <= rcmb_wide_x_din;
+                wr_wide_y_din_reg   <= rcmb_wide_y_din;
+            end
+            else begin
+                // nothing requested: no write
+                wr_wide_xy_ena_reg  <= 1'b0;
+                wr_wide_xy_bank_reg <= 2'bXX;
+                wr_wide_xy_addr_reg <= 8'hXX;
+                wr_wide_x_din_reg   <= {18{1'bX}};
+                wr_wide_y_din_reg   <= {18{1'bX}};
+            end
+            //
+        end
+
+
+    //
+    // "Narrow" Channel
+    //
+    always @(posedge clk)
+        //
+        if (rst) begin
+            // reset: no write
+            wr_narrow_xy_ena_reg  <= 1'b0;
+            wr_narrow_xy_bank_reg <= 2'bXX;
+            wr_narrow_xy_addr_reg <= 8'hXX;
+            wr_narrow_x_din_reg   <= {18{1'bX}};
+            wr_narrow_y_din_reg   <= {18{1'bX}};
+        end
+        else begin
+            //
+            if (ext_narrow_xy_ena) begin
+                // "ext" request is served first
+                wr_narrow_xy_ena_reg  <= 1'b1;
+                wr_narrow_xy_bank_reg <= ext_narrow_xy_bank;
+                wr_narrow_xy_addr_reg <= ext_narrow_xy_addr;
+                wr_narrow_x_din_reg   <= ext_narrow_x_din;
+                wr_narrow_y_din_reg   <= ext_narrow_y_din;
+            end
+            else if (rcmb_narrow_xy_ena) begin
+                // "rcmb" request, served only while "ext" is idle
+                wr_narrow_xy_ena_reg  <= 1'b1;
+                wr_narrow_xy_bank_reg <= rcmb_narrow_xy_bank;
+                wr_narrow_xy_addr_reg <= rcmb_narrow_xy_addr;
+                wr_narrow_x_din_reg   <= rcmb_narrow_x_din;
+                wr_narrow_y_din_reg   <= rcmb_narrow_y_din;
+            end
+            else begin
+                // nothing requested: no write
+                wr_narrow_xy_ena_reg  <= 1'b0;
+                wr_narrow_xy_bank_reg <= 2'bXX;
+                wr_narrow_xy_addr_reg <= 8'hXX;
+                wr_narrow_x_din_reg   <= {18{1'bX}};
+                wr_narrow_y_din_reg   <= {18{1'bX}};
+            end
+            //
+        end
+
+endmodule



More information about the Commits mailing list