[Cryptech-Commits] [core/math/modexpng] 24/92: Started working on the pipelined Montgomery modular multiplier. Currently can do the "square" part of the multiplication, i.e. compute the twice larger intermediate product AB = A * B.

git at cryptech.is git at cryptech.is
Sat Mar 14 18:19:03 UTC 2020


This is an automated email from the git hooks/post-receive script.

paul at psgd.org pushed a commit to branch master
in repository core/math/modexpng.

commit 29fb6afd018c601a2e0c7376656d5e37beb565d6
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Oct 1 15:01:43 2019 +0300

    Started working on the pipelined Montgomery modular multiplier. Currently can
    do the "square" part of the multiplication, i.e. compute the twice larger
    intermediate product AB = A * B.
---
 bench/tb_mmm_x8_dual.v            | 327 +++++++++++++++++
 bench/tb_square.v                 | 716 ++++++++++++++++++++++++++++++++++++++
 rtl/dev/temp.txt                  | 384 ++++++++++++++++++++
 rtl/dsp/dsp_array.v               | 111 ++++++
 rtl/dsp/dsp_slice.v               | 125 +++++++
 rtl/modexpng_mac.v                |  54 +++
 rtl/modexpng_mac_array.v          | 116 ++++++
 rtl/modexpng_mem.v                |  93 +++++
 rtl/modexpng_mmm_col_index.v      |  90 +++++
 rtl/modexpng_mmm_din_addr.v       | 167 +++++++++
 rtl/modexpng_mmm_dout_addr.v      | 167 +++++++++
 rtl/modexpng_mmm_fsm.vh           |  24 ++
 rtl/modexpng_mmm_pad.v            | 153 ++++++++
 rtl/modexpng_mmm_transporter.v    | 157 +++++++++
 rtl/modexpng_mmm_x8_dual.v        | 550 +++++++++++++++++++++++++++++
 rtl/modexpng_parameters.vh        |  39 +++
 rtl/modexpng_parameters_x8.vh     |   1 +
 rtl/modexpng_part_recombinator.v  | 623 +++++++++++++++++++++++++++++++++
 rtl/modexpng_recombinator_block.v |  35 ++
 19 files changed, 3932 insertions(+)

diff --git a/bench/tb_mmm_x8_dual.v b/bench/tb_mmm_x8_dual.v
new file mode 100644
index 0000000..aa25900
--- /dev/null
+++ b/bench/tb_mmm_x8_dual.v
@@ -0,0 +1,327 @@
+`timescale 1ns / 1ps
+
+module tb_mmm_x8_dual;
+
+
+    //
+    // Headers
+    //
+    `include "../rtl/modexpng_parameters.vh"
+    `include "../rtl/modexpng_parameters_x8.vh"
+    
+
+    //
+    // Settings
+    //
+    localparam INDEX_WIDTH = 6;
+    
+	wire [INDEX_WIDTH-1:0] index_last = 31;  // 512 bits
+    
+	
+    //
+    // Clock
+    //
+    `define CLK_FREQUENCY_MHZ   100.0
+    `define CLK_PERIOD_NS       (1000.0 / `CLK_FREQUENCY_MHZ)
+    `define CLK_PERIOD_HALF_NS  (0.5 * `CLK_PERIOD_NS)
+    
+	reg clk = 1'b0;
+
+    always begin  // free-running testbench clock: one full period per pass through this block
+        #`CLK_PERIOD_HALF_NS clk = 1'b1;  // rising edge after half a period
+        #`CLK_PERIOD_HALF_NS clk = 1'b0;  // falling edge after the other half
+    end    
+
+    
+    //
+    // Reset
+    //
+    reg  rst   = 1'b1;
+	wire rst_n = ~rst;
+    
+    
+    //
+    // Control
+    //
+    reg  ena = 1'b0;
+    wire rdy;
+
+    reg  mode;
+    reg  transfer;
+
+
+    //
+    // Interface
+    //
+
+    
+    //
+    // Interface - Data Buses
+    //
+	wire [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+	wire [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+	wire [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+	wire [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+
+    
+    //
+    // Interface - Address Buses
+    //
+	wire [INDEX_WIDTH-4:0] x_din_addr;
+	wire [INDEX_WIDTH-4:0] y_din_addr;
+	wire [INDEX_WIDTH-4:0] x_dout_addr;
+	wire [INDEX_WIDTH-4:0] y_dout_addr;
+
+    
+    //
+    // Interface - Enable Buses
+    //
+	wire [        1-1:0] x_din_ena;
+	wire [        1-1:0] y_din_ena;
+	wire [        1-1:0] x_din_reg_ena;
+	wire [        1-1:0] y_din_reg_ena;
+    wire [NUM_MULTS-1:0] x_dout_ena;
+    wire [NUM_MULTS-1:0] y_dout_ena;
+    
+    
+    //
+    // Interface - Bank Buses
+    //
+    wire [3-1:0] x_din_bank;
+    wire [3-1:0] y_din_bank;
+    wire [3-1:0] x_dout_bank;
+    wire [3-1:0] y_dout_bank;
+
+
+    //
+    // Operands
+    //
+    reg [WORD_WIDTH-1:0] T1[0:2**INDEX_WIDTH-1];
+    reg [WORD_WIDTH-1:0] T2[0:2**INDEX_WIDTH-1];
+    reg [WORD_WIDTH-1:0] N[0:2**INDEX_WIDTH-1];
+    reg [WORD_WIDTH-1:0] N_COEFF[0:2**INDEX_WIDTH];
+
+    
+    //
+    // Memories
+    //
+    genvar z;
+    generate for (z=0; z<NUM_MULTS; z=z+1)
+        // One X memory and one Y memory per lane z; lane z owns bits [z*WORD_WIDTH +: WORD_WIDTH] of each data bus.
+        begin : gen_z_mem
+            // X memory: port A takes x_dout (a_en and a_wr both tied to x_dout_ena[z]), port B drives x_din.
+            modexpng_mem /*bram_1wo_1ro_readfirst_ce*/ #
+            (
+                .MEM_WIDTH(WORD_WIDTH),
+                .MEM_ADDR_BITS(INDEX_WIDTH) // - clog2(NUM_MULTS) + clog2(NUM_BANKS)
+            )
+            gen_z_mem_x
+            (
+                .clk        (clk),
+
+                .a_addr     ({x_dout_bank, x_dout_addr}),
+                .a_en       (x_dout_ena[z]),
+                .a_wr       (x_dout_ena[z]),
+                .a_in       (x_dout[z*WORD_WIDTH+:WORD_WIDTH]),
+                .a_out      (), // unused
+
+                .b_addr     ({x_din_bank, x_din_addr}),
+                .b_en       (x_din_ena),
+                .b_reg_en   (x_din_reg_ena),
+                .b_out      (x_din[z*WORD_WIDTH+:WORD_WIDTH])
+            );
+            // Y memory: identical wiring to the X memory, using the y_* signals.
+            modexpng_mem /*bram_1wo_1ro_readfirst_ce*/ #
+            (
+                .MEM_WIDTH(WORD_WIDTH),
+                .MEM_ADDR_BITS(INDEX_WIDTH) // - clog2(NUM_MULTS) + clog2(NUM_BANKS)
+            )
+            gen_z_mem_y
+            (
+                .clk        (clk),
+
+                .a_addr     ({y_dout_bank, y_dout_addr}),
+                .a_en       (y_dout_ena[z]),
+                .a_wr       (y_dout_ena[z]),
+                .a_in       (y_dout[z*WORD_WIDTH+:WORD_WIDTH]),
+                .a_out      (), // unused
+
+                .b_addr     ({y_din_bank, y_din_addr}),
+                .b_en       (y_din_ena),
+                .b_reg_en   (y_din_reg_ena),
+                .b_out      (y_din[z*WORD_WIDTH+:WORD_WIDTH])
+            );
+            // Bank/address buses and the port-B enables are shared by all lanes; only the port-A enable is per lane.
+        end
+        //
+    endgenerate
+    
+
+    // T1 / T2
+    // N  / N_COEFF
+    // AB_LSB
+    // AB_MSB
+    // M
+    // Q_LSB
+    // Q_MSB
+    // ?
+
+
+    //
+    // Operands - Values
+    //
+    initial begin
+        //
+        T1[ 0] = 18'h0b27b; T1[ 1] = 18'h0fc7d; T1[ 2] = 18'h0a214; T1[ 3] = 18'h08d2b;
+        T1[ 4] = 18'h1c80c; T1[ 5] = 18'h145f1; T1[ 6] = 18'h00db6; T1[ 7] = 18'h1cf0f;
+        T1[ 8] = 18'h19386; T1[ 9] = 18'h02ad9; T1[10] = 18'h1a8b5; T1[11] = 18'h1479b;
+        T1[12] = 18'h08b5f; T1[13] = 18'h14806; T1[14] = 18'h0e6f7; T1[15] = 18'h0ce9d;
+        T1[16] = 18'h0cbc2; T1[17] = 18'h16ef1; T1[18] = 18'h0e14e; T1[19] = 18'h1796f;
+        T1[20] = 18'h14901; T1[21] = 18'h06666; T1[22] = 18'h0cb9f; T1[23] = 18'h09ab4;
+        T1[24] = 18'h12ffc; T1[25] = 18'h0a86d; T1[26] = 18'h19d35; T1[27] = 18'h0cda9;
+        T1[28] = 18'h16a19; T1[29] = 18'h09a36; T1[30] = 18'h0b176; T1[31] = 18'h0e0dc;
+        //
+        T2[ 0] = 18'h0b21a; T2[ 1] = 18'h13e71; T2[ 2] = 18'h03459; T2[ 3] = 18'h1063f;
+        T2[ 4] = 18'h18cef; T2[ 5] = 18'h1b8a5; T2[ 6] = 18'h082d1; T2[ 7] = 18'h1b1be;
+        T2[ 8] = 18'h18979; T2[ 9] = 18'h1409a; T2[10] = 18'h1713c; T2[11] = 18'h0cda3;
+        T2[12] = 18'h11c7d; T2[13] = 18'h0c943; T2[14] = 18'h12d7c; T2[15] = 18'h1531e;
+        T2[16] = 18'h0a45a; T2[17] = 18'h1c637; T2[18] = 18'h0906a; T2[19] = 18'h1670e;
+        T2[20] = 18'h12f78; T2[21] = 18'h08ce6; T2[22] = 18'h1c5c7; T2[23] = 18'h1292d;
+        T2[24] = 18'h0fc4b; T2[25] = 18'h064fb; T2[26] = 18'h0cc3c; T2[27] = 18'h19b37;
+        T2[28] = 18'h1b721; T2[29] = 18'h0f424; T2[30] = 18'h0f608; T2[31] = 18'h03e9b;
+        //
+        N[ 0] = 18'h00a9d; N[ 1] = 18'h01175; N[ 2] = 18'h0254f; N[ 3] = 18'h0ee38;
+        N[ 4] = 18'h00a6a; N[ 5] = 18'h0c7bd; N[ 6] = 18'h0ddac; N[ 7] = 18'h069fe;
+        N[ 8] = 18'h0e9d6; N[ 9] = 18'h0b6bf; N[10] = 18'h09230; N[11] = 18'h04fc5;
+        N[12] = 18'h05c9f; N[13] = 18'h09502; N[14] = 18'h0cbc5; N[15] = 18'h03109;
+        N[16] = 18'h08029; N[17] = 18'h0b27c; N[18] = 18'h0eeb8; N[19] = 18'h0c191;
+        N[20] = 18'h0ff86; N[21] = 18'h027ab; N[22] = 18'h07d76; N[23] = 18'h0ff1a;
+        N[24] = 18'h02afc; N[25] = 18'h0b25a; N[26] = 18'h0d3c1; N[27] = 18'h05589;
+        N[28] = 18'h09f7c; N[29] = 18'h0ddd6; N[30] = 18'h0b4fc; N[31] = 18'h0e8e7;
+        //
+        N_COEFF[ 0] = 18'h0344b; N_COEFF[ 1] = 18'h0ca66; N_COEFF[ 2] = 18'h0d9e8; N_COEFF[ 3] = 18'h070d5;
+        N_COEFF[ 4] = 18'h0ce4b; N_COEFF[ 5] = 18'h049b2; N_COEFF[ 6] = 18'h0abb3; N_COEFF[ 7] = 18'h0c3b2;
+        N_COEFF[ 8] = 18'h0ad38; N_COEFF[ 9] = 18'h05672; N_COEFF[10] = 18'h0fd47; N_COEFF[11] = 18'h06671;
+        N_COEFF[12] = 18'h00b7f; N_COEFF[13] = 18'h0fa35; N_COEFF[14] = 18'h0d4ac; N_COEFF[15] = 18'h0f1ca;
+        N_COEFF[16] = 18'h08e0a; N_COEFF[17] = 18'h05858; N_COEFF[18] = 18'h02dc6; N_COEFF[19] = 18'h08cfc;
+        N_COEFF[20] = 18'h01941; N_COEFF[21] = 18'h0f855; N_COEFF[22] = 18'h01e43; N_COEFF[23] = 18'h053f0;
+        N_COEFF[24] = 18'h0a479; N_COEFF[25] = 18'h0ae7e; N_COEFF[26] = 18'h05c66; N_COEFF[27] = 18'h02413;
+        N_COEFF[28] = 18'h0b5f8; N_COEFF[29] = 18'h0eb06; N_COEFF[30] = 18'h0de5b; N_COEFF[31] = 18'h0a751;
+        N_COEFF[32] = 18'h0c1ec;
+        //
+    end
+    
+    
+    //
+    // Load Interface
+    //
+    wire                   load_phase;
+    wire [  INDEX_WIDTH:0] load_xy_addr;
+    wire                   load_xy_addr_vld;
+    wire                   load_xy_req;
+    reg  [ WORD_WIDTH-1:0] load_x_din;
+    reg  [ WORD_WIDTH-1:0] load_y_din;
+    reg  [ WORD_WIDTH-1:0] load_x_pipe;
+    reg  [ WORD_WIDTH-1:0] load_y_pipe;
+
+    always @(posedge clk)
+        // Stage 1: when the address is flagged valid, fetch one word pair; load_phase picks T1/T2 (0) or N/N_COEFF (1).
+        if (load_xy_addr_vld) begin
+        
+            if (!load_phase) begin
+                load_x_pipe <= T1[load_xy_addr];
+                load_y_pipe <= T2[load_xy_addr];
+            end else begin
+                load_x_pipe <= !load_xy_addr[INDEX_WIDTH] ? N[load_xy_addr] : {WORD_WIDTH{1'bX}};  // top address bit set: X is driven instead of indexing N
+                load_y_pipe <= N_COEFF[load_xy_addr];
+            end 
+        end
+
+    always @(posedge clk)
+        // Stage 2: hand the fetched pair to the UUT on request; otherwise drive X (presumably to expose unintended reads).
+        if (load_xy_req)
+            {load_y_din, load_x_din} <= {load_y_pipe, load_x_pipe};
+        else
+            {load_y_din, load_x_din} <= {2*WORD_WIDTH{1'bX}};
+
+        
+    //
+    // UUT
+    //
+    modexpng_mmm_x8_dual #
+    (
+        .INDEX_WIDTH(INDEX_WIDTH)
+    )
+    uut
+    (
+		.clk                (clk),
+		.rst_n              (rst_n),
+        
+		.ena                (ena),
+		.rdy                (rdy),
+        
+        .mode               (mode),
+        .transfer           (transfer),
+
+		.index_last         (index_last),
+
+        .x_din              (x_din),
+        .y_din              (y_din),
+        .x_dout             (x_dout),
+        .y_dout             (y_dout),
+
+        .x_din_addr         (x_din_addr),
+        .y_din_addr         (y_din_addr),
+        .x_dout_addr        (x_dout_addr),
+        .y_dout_addr        (y_dout_addr),
+
+        .x_din_ena          (x_din_ena),
+        .y_din_ena          (y_din_ena),
+        .x_dout_ena         (x_dout_ena),
+        .y_dout_ena         (y_dout_ena),
+        
+        .x_din_reg_ena      (x_din_reg_ena),
+        .y_din_reg_ena      (y_din_reg_ena),
+    
+        .x_din_bank         (x_din_bank),
+        .y_din_bank         (y_din_bank),
+        .x_dout_bank        (x_dout_bank),
+        .y_dout_bank        (y_dout_bank),
+        
+        .load_phase         (load_phase),
+        .load_xy_addr       (load_xy_addr),
+        .load_xy_addr_vld   (load_xy_addr_vld),
+        .load_xy_req        (load_xy_req),
+        .load_x_din         (load_x_din),
+        .load_y_din         (load_y_din)
+	);
+
+
+    //
+    // Script
+    //
+	initial begin
+        #(100.0*`CLK_PERIOD_NS)     rst      = 1'b0;  // release reset after 100 clock periods
+        #(100.0*`CLK_PERIOD_NS)     ena      = 1'b1;  // first run: start a one-period enable pulse
+                                    transfer = 1'b1;  // transfer=1, mode=0 (their meaning is defined by the UUT)
+                                    mode     = 1'b0;
+        #(  1.0*`CLK_PERIOD_NS)     ena      = 1'b0;  // end of the pulse
+                                    transfer = 1'bX;  // control inputs go back to X outside the pulse
+                                    mode     = 1'bX;
+                                    
+        while (!rdy) #`CLK_PERIOD_NS;  // poll rdy once per clock period until it is high
+        
+        #(100.0*`CLK_PERIOD_NS)     ena      = 1'b1;  // second run: same pulse, now with transfer=0
+                                    transfer = 1'b0;
+                                    mode     = 1'b0;
+        #(  1.0*`CLK_PERIOD_NS)     ena      = 1'b0;
+                                    transfer = 1'bX;
+                                    mode     = 1'bX;
+                                    
+        while (!rdy) #`CLK_PERIOD_NS;  // wait for the second run to report ready
+        // NOTE(review): no $finish/$stop follows, so the free-running clock keeps the simulation alive - confirm this is intended.
+	end
+
+      
+endmodule
+
diff --git a/bench/tb_square.v b/bench/tb_square.v
new file mode 100644
index 0000000..61e5d8a
--- /dev/null
+++ b/bench/tb_square.v
@@ -0,0 +1,716 @@
+`timescale 1ns / 1ps
+
+module tb_square;
+
+
+    //
+    // Headers
+    //
+    `include "../rtl/modexpng_parameters.vh"
+    `include "../rtl/modexpng_parameters_x8.vh"
+    `include "../rtl/modexpng_mmm_fsm.vh"
+
+
+    //
+    // Clock
+    //
+    `define CLK_FREQUENCY_MHZ   100.0
+    `define CLK_PERIOD_NS       (1000.0 / `CLK_FREQUENCY_MHZ)
+    `define CLK_PERIOD_HALF_NS  (0.5 * `CLK_PERIOD_NS)
+    
+	reg clk = 1'b0;
+
+    always begin  // free-running testbench clock: one full period per pass through this block
+        #`CLK_PERIOD_HALF_NS clk = 1'b1;  // rising edge after half a period
+        #`CLK_PERIOD_HALF_NS clk = 1'b0;  // falling edge after the other half
+    end
+    
+    
+    //
+    // Reset
+    //
+    reg rst = 1'b1;
+    
+    
+
+    //
+    // T1, T2
+    //
+    reg [17:0] T1[0:31];
+    reg [17:0] T2[0:31];
+    reg [17:0] AB[0:63];
+
+
+    //
+    // Init
+    //
+    initial begin
+        //
+        T1[ 0] = 18'h0f13e; T1[ 1] = 18'h0daf6; T1[ 2] = 18'h0aaa9; T1[ 3] = 18'h0c2c2;
+        T1[ 4] = 18'h0fc5f; T1[ 5] = 18'h12164; T1[ 6] = 18'h14375; T1[ 7] = 18'h15615;
+        T1[ 8] = 18'h0d8e2; T1[ 9] = 18'h0ec15; T1[10] = 18'h17c46; T1[11] = 18'h0c922;
+        T1[12] = 18'h08f00; T1[13] = 18'h152f9; T1[14] = 18'h0b0b6; T1[15] = 18'h0ce87;
+        T1[16] = 18'h178f2; T1[17] = 18'h09efb; T1[18] = 18'h0409d; T1[19] = 18'h11104;
+        T1[20] = 18'h0b4a6; T1[21] = 18'h158a6; T1[22] = 18'h0514e; T1[23] = 18'h0ec55;
+        T1[24] = 18'h11e73; T1[25] = 18'h11ddd; T1[26] = 18'h07bd4; T1[27] = 18'h0638b;
+        T1[28] = 18'h0e805; T1[29] = 18'h11c4f; T1[30] = 18'h0a2eb; T1[31] = 18'h05454;
+        //
+        T2[ 0] = 18'h1a479; T2[ 1] = 18'h102f5; T2[ 2] = 18'h10e72; T2[ 3] = 18'h120b1;
+        T2[ 4] = 18'h169cd; T2[ 5] = 18'h1d0c4; T2[ 6] = 18'h11462; T2[ 7] = 18'h12015;
+        T2[ 8] = 18'h16fca; T2[ 9] = 18'h1044f; T2[10] = 18'h122b4; T2[11] = 18'h10a5a;
+        T2[12] = 18'h12620; T2[13] = 18'h0e01a; T2[14] = 18'h095cd; T2[15] = 18'h1278a;
+        T2[16] = 18'h10763; T2[17] = 18'h09fe7; T2[18] = 18'h0d35c; T2[19] = 18'h10e24;
+        T2[20] = 18'h1527d; T2[21] = 18'h115b3; T2[22] = 18'h05443; T2[23] = 18'h1190a;
+        T2[24] = 18'h0fcc3; T2[25] = 18'h115e2; T2[26] = 18'h0a398; T2[27] = 18'h0608d;
+        T2[28] = 18'h13075; T2[29] = 18'h0d816; T2[30] = 18'h0bb4c; T2[31] = 18'h04e8a;
+        //
+        AB[ 0] = 18'h0be4e; AB[ 1] = 18'h0fed7; AB[ 2] = 18'h09496; AB[ 3] = 18'h07181;
+        AB[ 4] = 18'h0ee73; AB[ 5] = 18'h04692; AB[ 6] = 18'h0141a; AB[ 7] = 18'h0078c;
+        AB[ 8] = 18'h030eb; AB[ 9] = 18'h0217c; AB[10] = 18'h0696f; AB[11] = 18'h0a165;
+        AB[12] = 18'h0b753; AB[13] = 18'h04af9; AB[14] = 18'h0ed7c; AB[15] = 18'h079ce;
+        AB[16] = 18'h0e863; AB[17] = 18'h097df; AB[18] = 18'h07984; AB[19] = 18'h048af;
+        AB[20] = 18'h0197f; AB[21] = 18'h0206a; AB[22] = 18'h027e7; AB[23] = 18'h04b3a;
+        AB[24] = 18'h03312; AB[25] = 18'h03b56; AB[26] = 18'h04487; AB[27] = 18'h0bd6a;
+        AB[28] = 18'h04e4b; AB[29] = 18'h069ca; AB[30] = 18'h0f994; AB[31] = 18'h0dd4e;
+        AB[32] = 18'h1b024; AB[33] = 18'h0127f; AB[34] = 18'h02631; AB[35] = 18'h0186b;
+        AB[36] = 18'h03adb; AB[37] = 18'h05368; AB[38] = 18'h059a5; AB[39] = 18'h002e0;
+        AB[40] = 18'h0b78a; AB[41] = 18'h016f3; AB[42] = 18'h0b58d; AB[43] = 18'h03ddb;
+        AB[44] = 18'h078b0; AB[45] = 18'h0073b; AB[46] = 18'h07337; AB[47] = 18'h0c7b0;
+        AB[48] = 18'h00668; AB[49] = 18'h0106d; AB[50] = 18'h01a44; AB[51] = 18'h05ee3;
+        AB[52] = 18'h0462d; AB[53] = 18'h0fdeb; AB[54] = 18'h05f85; AB[55] = 18'h02af9;
+        AB[56] = 18'h0e1c0; AB[57] = 18'h00989; AB[58] = 18'h01201; AB[59] = 18'h0e194;
+        AB[60] = 18'h07f93; AB[61] = 18'h0e739; AB[62] = 18'h07cf6; AB[63] = 18'h019df;
+        //
+    end
+    
+
+    //
+    // BRAMs
+    //
+    reg        tb_fat_bram_xy_ena = 1'b0;
+    reg [ 2:0] tb_fat_bram_xy_bank;
+    reg [ 7:0] tb_fat_bram_xy_addr;
+    reg [17:0] tb_fat_bram_x_din;
+    reg [17:0] tb_fat_bram_y_din;
+
+    reg        mgr_fat_bram_xy_ena = 1'b0;
+    reg [ 2:0] mgr_fat_bram_xy_bank;
+    reg [ 7:0] mgr_fat_bram_xy_addr;
+    reg [17:0] mgr_fat_bram_x_din;
+    reg [17:0] mgr_fat_bram_y_din;
+    
+    reg         mac_fat_bram_xy_ena = 1'b0;
+    reg         mac_fat_bram_xy_reg_ena = 1'b0;
+    reg  [ 2:0] mac_fat_bram_xy_bank;
+    reg  [ 7:0] mac_fat_bram_xy_addr[0:3];
+    wire [17:0] mac_fat_bram_x_dout[0:3];
+    wire [17:0] mac_fat_bram_y_dout[0:3];
+    
+    reg        tb_slim_bram_xy_ena = 1'b0;
+    reg [ 1:0] tb_slim_bram_xy_bank;
+    reg [ 7:0] tb_slim_bram_xy_addr;
+    reg [17:0] tb_slim_bram_x_din;
+    reg [17:0] tb_slim_bram_y_din;
+
+    reg         mac_slim_bram_xy_ena = 1'b0;
+    reg         mac_slim_bram_xy_reg_ena = 1'b0;
+    reg  [ 1:0] mac_slim_bram_xy_bank;
+    reg  [ 7:0] mac_slim_bram_xy_addr;
+    reg  [ 7:0] mac_slim_bram_xy_addr_dly;
+    wire [17:0] mac_slim_bram_x_dout;
+    wire [17:0] mac_slim_bram_y_dout;
+    
+    always @(posedge clk)
+        //
+        mac_slim_bram_xy_addr_dly <= mac_slim_bram_xy_addr;
+    
+    reg         mac_slim_bram_xy_reg_ena_dly = 1'b0;
+    always @(posedge clk)
+        mac_slim_bram_xy_reg_ena_dly <= mac_slim_bram_xy_reg_ena;
+    
+    
+    
+    genvar z;
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        begin : gen_fat_bram
+            //
+            ip_bram_36k fat_bram_x
+            (
+                .clka   (clk),
+                .ena    (mgr_fat_bram_xy_ena),
+                .wea    (mgr_fat_bram_xy_ena),
+                .addra  ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
+                .dina   (mgr_fat_bram_x_din),
+            
+                .clkb   (clk),
+                .enb    (mac_fat_bram_xy_ena),
+                .regceb (mac_fat_bram_xy_reg_ena),
+                .addrb  ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}),
+                .doutb  (mac_fat_bram_x_dout[z])
+            );
+            //
+            ip_bram_36k fat_bram_y
+            (
+                .clka   (clk),
+                .ena    (mgr_fat_bram_xy_ena),
+                .wea    (mgr_fat_bram_xy_ena),
+                .addra  ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
+                .dina   (mgr_fat_bram_y_din),
+            
+                .clkb   (clk),
+                .enb    (mac_fat_bram_xy_ena),
+                .regceb (mac_fat_bram_xy_reg_ena),
+                .addrb  ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}),
+                .doutb  (mac_fat_bram_y_dout[z])
+            );
+            //
+        end
+    endgenerate
+
+    ip_bram_18k slim_bram_x
+    (
+        .clka   (clk),
+        .ena    (tb_slim_bram_xy_ena),
+        .wea    (tb_slim_bram_xy_ena),
+        .addra  ({tb_slim_bram_xy_bank, tb_slim_bram_xy_addr}),
+        .dina   (tb_slim_bram_x_din),
+    
+        .clkb   (clk),
+        .enb    (mac_slim_bram_xy_ena),
+        .regceb (mac_slim_bram_xy_reg_ena),
+        .addrb  ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
+        .doutb  (mac_slim_bram_x_dout)
+    );
+
+    ip_bram_18k slim_bram_y
+    (
+        .clka   (clk),
+        .ena    (tb_slim_bram_xy_ena),
+        .wea    (tb_slim_bram_xy_ena),
+        .addra  ({tb_slim_bram_xy_bank, tb_slim_bram_xy_addr}),
+        .dina   (tb_slim_bram_y_din),
+    
+        .clkb   (clk),
+        .enb    (mac_slim_bram_xy_ena),
+        .regceb (mac_slim_bram_xy_reg_ena),
+        .addrb  ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
+        .doutb  (mac_slim_bram_y_dout)
+    );
+    
+    
+    
+    //
+    // Enable, Ready
+    //
+    reg ena = 1'b0;
+
+    integer i;
+    initial begin
+        // Keep rst asserted for 10 clock periods, then release it.
+        for (i=0; i<10; i=i+1)
+            wait_clock_tick;
+        
+        rst = 1'b0;
+        // Idle for 10 more periods before loading the operands.
+        for (i=0; i<10; i=i+1)
+            wait_clock_tick;
+        
+        tb_fat_bram_xy_ena = 1'b1;
+        tb_slim_bram_xy_ena = 1'b1;
+        // Present the 32 words of T1 (x side) and T2 (y side) on the tb_* write signals, one word per period.
+        for (i=0; i<32; i=i+1) begin
+            tb_fat_bram_xy_bank = BANK_FAT_T1T2;
+            tb_fat_bram_xy_addr = i[7:0];
+            tb_fat_bram_x_din = T1[i];
+            tb_fat_bram_y_din = T2[i];
+            // NOTE(review): the fat BRAMs shown in this file are written via mgr_fat_bram_xy_*; the link from tb_fat_bram_xy_* is not in view - confirm.
+            tb_slim_bram_xy_bank = BANK_SLIM_T1T2;
+            tb_slim_bram_xy_addr = i[7:0];
+            tb_slim_bram_x_din = T1[i];
+            tb_slim_bram_y_din = T2[i];
+            
+            wait_clock_tick;
+        end
+        // Stop writing and park every write-side bus at X.
+        tb_fat_bram_xy_ena = 1'b0;        
+        tb_slim_bram_xy_ena = 1'b0;
+        
+        tb_fat_bram_xy_bank = {3{1'bX}};
+        tb_fat_bram_xy_addr = {8{1'bX}};
+        tb_fat_bram_x_din = {18{1'bX}};
+        tb_fat_bram_y_din = {18{1'bX}};
+
+        tb_slim_bram_xy_bank = {2{1'bX}};
+        tb_slim_bram_xy_addr = {8{1'bX}};
+        tb_slim_bram_x_din = {18{1'bX}};
+        tb_slim_bram_y_din = {18{1'bX}};
+        // Wait 10 periods, then pulse ena for exactly one period.
+        for (i=0; i<10; i=i+1)
+            wait_clock_tick;
+            
+        ena = 1'b1;
+        wait_clock_tick;
+        ena = 1'b0;
+        // Fixed 10000-period wait (no ready handshake is used here), then call verify_ab, which is defined outside this view.
+        for (i=0; i<10000; i=i+1)
+            wait_clock_tick;
+            
+        verify_ab;
+
+    end
+
+    
+    //
+    // DSPs
+    //
+    reg             dsp_x_ce_a;
+    reg             dsp_x_ce_b;
+    reg             dsp_x_ce_b_dly;
+    reg             dsp_x_ce_m;
+    reg             dsp_x_ce_p;
+    reg             dsp_x_ce_mode;
+    
+    reg  [8   -1:0] dsp_x_mode_z = {8{1'b1}};
+    
+    wire [4*18-1:0] dsp_x_a;
+    reg  [1*17-1:0] dsp_x_b;
+    wire [8*47-1:0] dsp_x_p;
+
+    reg             dsp_y_ce_a;
+    reg             dsp_y_ce_b;
+    reg             dsp_y_ce_b_dly;
+    reg             dsp_y_ce_m;
+    reg             dsp_y_ce_p;
+    reg             dsp_y_ce_mode;
+    
+    reg  [8   -1:0] dsp_y_mode_z = {8{1'b1}};
+        
+    wire [4*18-1:0] dsp_y_a;
+    reg  [1*17-1:0] dsp_y_b;
+    wire [8*47-1:0] dsp_y_p;
+        
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        begin : gen_dsp_xy_a_split
+            assign dsp_x_a[18*z+:18] = mac_fat_bram_x_dout[z];
+            assign dsp_y_a[18*z+:18] = mac_fat_bram_y_dout[z];
+        end
+    endgenerate
+    
+    always @(posedge clk)
+        //
+        {dsp_y_ce_b_dly, dsp_x_ce_b_dly} <= {dsp_y_ce_b, dsp_x_ce_b};
+    
+
+    reg  [8   -1:0] dsp_xy_mode_z_adv1 = {8{1'b1}};
+    reg  [8   -1:0] dsp_xy_mode_z_adv2 = {8{1'b1}};
+    reg  [8   -1:0] dsp_xy_mode_z_adv3 = {8{1'b1}};
+    reg  [8   -1:0] dsp_xy_mode_z_adv4 = {8{1'b1}};
+    
+    dsp_array dsp_x
+    (
+        .clk            (clk),
+        
+        .ce_a           (dsp_x_ce_a),
+        .ce_b           (dsp_x_ce_b),
+        .ce_m           (dsp_x_ce_m),
+        .ce_p           (dsp_x_ce_p),
+        .ce_mode        (dsp_x_ce_mode),
+
+        .mode_z         (dsp_x_mode_z),
+        
+        .a              (dsp_x_a),
+        .b              (dsp_x_b),
+        .p              (dsp_x_p)
+    );
+
+    dsp_array dsp_y
+    (
+        .clk            (clk),
+        
+        .ce_a           (dsp_y_ce_a),
+        .ce_b           (dsp_y_ce_b),
+        .ce_m           (dsp_y_ce_m),
+        .ce_p           (dsp_y_ce_p),
+        .ce_mode        (dsp_y_ce_mode),
+
+        .mode_z         (dsp_y_mode_z),
+        
+        .a              (dsp_y_a),
+        .b              (dsp_y_b),
+        .p              (dsp_y_p)
+    );
+
+
+    //
+    // FSM State and Next States
+    //
+    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+
+    
+    always @(posedge clk)
+        //
+        if (rst) fsm_state <= FSM_STATE_IDLE;
+        else     fsm_state <= fsm_state_next;
+
+
+    localparam [7:0] index_last = 8'd31;
+    
+
+    wire mult_square_addr_almost_done_comb;
+    reg  mult_square_addr_almost_done_flop;
+    
+    wire mult_square_addr_surely_done_comb;
+    reg  mult_square_addr_surely_done_flop;
+    
+    assign mult_square_addr_almost_done_comb = mac_slim_bram_xy_addr == (index_last - 8'd1);
+    assign mult_square_addr_surely_done_comb = mac_slim_bram_xy_addr == index_last;
+    
+    always @(posedge clk)
+        // Register the two slim-address compare flags while the current state is a BUSY state; clear both in any other state.
+        case (fsm_state)
+        
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 
+                {mult_square_addr_surely_done_comb, mult_square_addr_almost_done_comb};
+                
+            default:
+                {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 2'b00;
+            
+        endcase
+
+
+    //
+    // Column
+    //
+    reg  [4:0] col_index;
+    reg  [4:0] col_index_prev;
+    reg  [4:0] col_index_last;
+
+    always @(posedge clk)
+        //
+        col_index_prev <= col_index;
+
+    //
+    // FSM Transition Logic
+    //
+    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+    
+    
+
+    always @(posedge clk)
+        // Slim BRAM read address, keyed on the NEXT state: 0 at column INIT, then +1 per clock, returning to 0 once the registered almost-done flag is set.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_INIT,
+            FSM_STATE_MULT_SQUARE_COL_N_INIT:   mac_slim_bram_xy_addr <= 8'd0;
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_slim_bram_xy_addr <= !mult_square_addr_almost_done_flop ? mac_slim_bram_xy_addr + 1'b1 : 8'd0;
+            default:                            mac_slim_bram_xy_addr <= 8'dX;  // address is X outside the multiply states
+        endcase
+
+    integer j;
+    always @(posedge clk)
+        // Fat BRAM read addresses, one per lane j (NUM_MULTS/2 lanes): each lane starts at an odd offset and then steps via mac_fat_bram_xy_addr_next.
+        for (j=0; j<(NUM_MULTS/2); j=j+1)
+            case (fsm_state_next)
+                FSM_STATE_MULT_SQUARE_COL_0_INIT:   mac_fat_bram_xy_addr[j] <= 1 + 2 * j;  // column 0: lanes start at 1, 3, 5, ...
+                FSM_STATE_MULT_SQUARE_COL_N_INIT:   mac_fat_bram_xy_addr[j] <= 8 * (col_index + 1) + 1 + 2 * j;  // later columns: same pattern, shifted by 8 per column
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last);
+                default:                            mac_fat_bram_xy_addr[j] <= 8'dX;  // address is X outside the multiply states
+            endcase
+
+
+
+    always @(posedge clk)
+        // Slim BRAM read bank: BANK_SLIM_T1T2 in every INIT/TRIG/BUSY multiply state, X otherwise.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_INIT,
+            FSM_STATE_MULT_SQUARE_COL_N_INIT,
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_slim_bram_xy_bank <= BANK_SLIM_T1T2;
+            default:                            mac_slim_bram_xy_bank <= 2'bXX;
+        endcase
+
+    always @(posedge clk)
+        // Fat BRAM read bank: BANK_FAT_T1T2 in the same set of states, X otherwise.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_INIT,
+            FSM_STATE_MULT_SQUARE_COL_N_INIT,
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_fat_bram_xy_bank <= BANK_FAT_T1T2;
+            default:                            mac_fat_bram_xy_bank <= 3'bXXX;
+        endcase
+
+
+
+    always @(posedge clk)
+        // Slim BRAM read enable: on during INIT/TRIG; during BUSY it is the inverse of the registered almost-done flag; off in all other states.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_INIT,
+            FSM_STATE_MULT_SQUARE_COL_N_INIT,
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   mac_slim_bram_xy_ena <= 1'b1;
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_slim_bram_xy_ena <= ~mult_square_addr_almost_done_flop;
+            default:                            mac_slim_bram_xy_ena <= 1'b0;
+        endcase
+
+    always @(posedge clk)
+        // Fat BRAM read enable: on for the whole INIT/TRIG/BUSY span (no almost-done gating, unlike the slim side), off otherwise.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_INIT,
+            FSM_STATE_MULT_SQUARE_COL_N_INIT,
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_fat_bram_xy_ena <= 1'b1;
+            default:                            mac_fat_bram_xy_ena <= 1'b0;
+        endcase
+
+
+    always @(posedge clk)
+        // Slim BRAM regceb input: the slim read enable delayed by one clock.
+        mac_slim_bram_xy_reg_ena <= mac_slim_bram_xy_ena;
+        
+    always @(posedge clk)
+        // Fat BRAM regceb input: the fat read enable delayed by one clock.
+        mac_fat_bram_xy_reg_ena <= mac_fat_bram_xy_ena;
+            
+        
+    always @(posedge clk)
+        // DSP B operands (low 17 bits of the slim words) are cross-fed: dsp_x_b <- slim Y, dsp_y_b <- slim X; X when the delayed reg-enable is low.
+        if (mac_slim_bram_xy_reg_ena_dly)
+            {dsp_y_b, dsp_x_b} <= {mac_slim_bram_x_dout[16:0], mac_slim_bram_y_dout[16:0]};  // NOTE(review): x/y crossing looks deliberate (fat X times slim Y) - confirm
+        else
+            {dsp_y_b, dsp_x_b} <= {2{{17{1'bX}}}};
+
+
+    function  [7:0] mac_fat_bram_xy_addr_next;     // returns the next fat BRAM address: a down-counter that wraps
+        input [7:0] mac_fat_bram_xy_addr_current;  // address used in the current cycle
+        input [7:0] mac_fat_bram_xy_addr_last;     // value to reload once the counter has reached 0
+        begin
+            if (mac_fat_bram_xy_addr_current > 0)  // an X/Z comparison result also takes the else branch
+                mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1;
+            else
+                mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last;
+        end
+    endfunction
+        
+
+    
+    always @(posedge clk)
+        // ce_a (both arrays): registered OR of the slim reg-enable and its one-clock-delayed copy.
+        {dsp_y_ce_a, dsp_x_ce_a} <= {2{mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly}};
+        
+    always @(posedge clk)
+        // ce_b (both arrays): registered copy of the delayed slim reg-enable.
+        {dsp_y_ce_b, dsp_x_ce_b} <= {2{mac_slim_bram_xy_reg_ena_dly}};
+    
+    always @(posedge clk)
+        // ce_m: registered copy of ce_b_dly, i.e. ce_b delayed by two clocks.
+        {dsp_y_ce_m, dsp_x_ce_m} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
+
+    always @(posedge clk)
+        // ce_p: ce_m delayed by one clock.
+        {dsp_y_ce_p, dsp_x_ce_p} <= {dsp_y_ce_m, dsp_x_ce_m};
+        
+    always @(posedge clk)
+        // ce_mode: driven from the same source as ce_m, so the two are asserted in the same cycles.
+        {dsp_y_ce_mode, dsp_x_ce_mode} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
+
+    task wait_clock_tick;  // suspends the caller for one clock period of simulated time
+        begin
+            #`CLK_PERIOD_NS;  // plain delay: not synchronised to an edge of clk
+        end
+    endtask
+    
+    //
+    // Increment Logic
+    //
+    always @(posedge clk)
+        // Column counter, keyed on the NEXT state: reset on entry to column 0, incremented on entry to each later column.
+        case (fsm_state_next)
+            // First column: also latch the last column number, index_last[7:3] (index_last divided by 8).
+            FSM_STATE_MULT_SQUARE_COL_0_INIT: begin
+                col_index      <= 5'd0;
+                col_index_last <= index_last[7:3];
+            end
+            // Later columns: advance by one.
+            FSM_STATE_MULT_SQUARE_COL_N_INIT:
+                col_index <= col_index + 1'b1;
+            // No default branch: both registers keep their value in every other state.
+        endcase
+    
+    assign  fsm_state_after_mult_square = (col_index == col_index_last) ? FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT;
+    
+    always @(posedge clk)
+        // Head of the mode_z delay line: all zeros at TRIG, the per-address mask from calc_mac_mode_z_square while BUSY, all ones otherwise.
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   dsp_xy_mode_z_adv4 <= {8{1'b0}};
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly);  // uses the one-clock-delayed column index and slim address
+            default:                            dsp_xy_mode_z_adv4 <= {8{1'b1}};
+        endcase
+
+    always @(posedge clk) begin
+        {dsp_y_mode_z, dsp_x_mode_z} <= {2{dsp_xy_mode_z_adv1}};
+        // Delay line adv4 -> adv3 -> adv2 -> adv1 -> mode_z: both DSP arrays get the same value, four clocks after adv4.
+        dsp_xy_mode_z_adv1 <= {dsp_xy_mode_z_adv2};
+        dsp_xy_mode_z_adv2 <= {dsp_xy_mode_z_adv3};
+        dsp_xy_mode_z_adv3 <= {dsp_xy_mode_z_adv4};
+    end
+    
+    function  [NUM_MULTS-1:0] calc_mac_mode_z_square;  // returns a mask with at most one bit cleared
+        input [          4:0] col_index_value;              // column number to match against
+        input [          7:0] mac_slim_bram_xy_addr_value;  // slim address: bits [7:3] = column, bits [2:0] = position inside it
+        begin
+            if (mac_slim_bram_xy_addr_value[7:3] == col_index_value)  // address lies inside the given column
+                case (mac_slim_bram_xy_addr_value[2:0])  // clear exactly the bit whose number equals addr[2:0]
+                    3'b000: calc_mac_mode_z_square = 8'b11111110;
+                    3'b001: calc_mac_mode_z_square = 8'b11111101;
+                    3'b010: calc_mac_mode_z_square = 8'b11111011;
+                    3'b011: calc_mac_mode_z_square = 8'b11110111;
+                    3'b100: calc_mac_mode_z_square = 8'b11101111;
+                    3'b101: calc_mac_mode_z_square = 8'b11011111;
+                    3'b110: calc_mac_mode_z_square = 8'b10111111;
+                    3'b111: calc_mac_mode_z_square = 8'b01111111;
+                endcase  // literals are 8 bits wide: assumes NUM_MULTS == 8 - TODO confirm
+            else
+                calc_mac_mode_z_square = {NUM_MULTS{1'b1}};  // any other column: no bit cleared
+        end
+    endfunction
+
+    reg recomb_x_ena = 1'b0;    // drive the ena_x / ena_y ports of the recombinator below
+    reg recomb_y_ena = 1'b0;
+    
+    always @(posedge clk) begin
+        // registered decode: set when *_ce_a is high while *_ce_b, *_ce_m and *_ce_p are all low
+        recomb_x_ena <= dsp_x_ce_a && !dsp_x_ce_b && !dsp_x_ce_m && !dsp_x_ce_p;
+        recomb_y_ena <= dsp_y_ce_a && !dsp_y_ce_b && !dsp_y_ce_m && !dsp_y_ce_p;
+        //
+    end
+    
+    wire [ 2:0] recomb_fat_bram_xy_bank;          // nets attached to the recombinator: shared bank and address,
+    wire [ 7:0] recomb_fat_bram_xy_addr;          // separate X and Y data words, one common valid flag
+    wire [17:0] recomb_fat_bram_x_dout;
+    wire [17:0] recomb_fat_bram_y_dout;
+    wire        recomb_fat_bram_xy_dout_valid;
+    wire        recomb_rdy;                       // polled by the FSM in its HOLDOFF state
+    
+    modexpng_part_recombinator recomb   // NOTE(review): port directions are presumed from how the nets are used in this file -- confirm against the module
+    (
+        .clk                        (clk),
+        .rdy                        (recomb_rdy),
+        .fsm_state_next             (fsm_state_next),
+        .index_last                 (index_last),
+        .dsp_x_ce_p                 (dsp_x_ce_p),
+        .dsp_y_ce_p                 (dsp_y_ce_p),
+        .ena_x                      (recomb_x_ena),
+        .ena_y                      (recomb_y_ena),
+        .dsp_x_p                    (dsp_x_p),
+        .dsp_y_p                    (dsp_y_p),
+        .col_index                  (col_index),
+        .col_index_last             (col_index_last),
+        .slim_bram_xy_addr          (mac_slim_bram_xy_addr),
+        .fat_bram_xy_bank           (recomb_fat_bram_xy_bank),
+        .fat_bram_xy_addr           (recomb_fat_bram_xy_addr),
+        .fat_bram_x_dout            (recomb_fat_bram_x_dout),
+        .fat_bram_y_dout            (recomb_fat_bram_y_dout),
+        .fat_bram_xy_dout_valid     (recomb_fat_bram_xy_dout_valid)
+    );
+    
+    reg [17:0] AB_READ[0:63];   // testbench copy of the words coming out of the recombinator, compared against AB by verify_ab
+    
+    always @(posedge clk)
+        // only the X data word is recorded; banks other than 1 and 2 are ignored
+        if (recomb_fat_bram_xy_dout_valid)
+            // NOTE(review): the address is 8 bits wide but AB_READ has 64 entries -- presumably it stays below 32 here; confirm
+            case (recomb_fat_bram_xy_bank)
+                3'd1: AB_READ[recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout;          // bank 1 -> entries starting at 0
+                3'd2: AB_READ[32 + recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout;     // bank 2 -> entries starting at 32
+            endcase
+            
+
+    always @(posedge clk)
+        // registered write-side mux with fixed priority: testbench access first, recombinator output second
+        if (tb_fat_bram_xy_ena) begin
+            mgr_fat_bram_xy_ena  <= 1'b1;
+            mgr_fat_bram_xy_bank <= tb_fat_bram_xy_bank;
+            mgr_fat_bram_xy_addr <= tb_fat_bram_xy_addr;
+            mgr_fat_bram_x_din   <= tb_fat_bram_x_din;
+            mgr_fat_bram_y_din   <= tb_fat_bram_y_din;
+        end else if (recomb_fat_bram_xy_dout_valid) begin
+            mgr_fat_bram_xy_ena  <= 1'b1;
+            mgr_fat_bram_xy_bank <= recomb_fat_bram_xy_bank;
+            mgr_fat_bram_xy_addr <= recomb_fat_bram_xy_addr;
+            mgr_fat_bram_x_din   <= recomb_fat_bram_x_dout;
+            mgr_fat_bram_y_din   <= recomb_fat_bram_y_dout;
+        end else begin  // no source active: enable low, everything else driven to X (don't care)
+            mgr_fat_bram_xy_ena  <= 1'b0;
+            mgr_fat_bram_xy_bank <= 3'bXXX;
+            mgr_fat_bram_xy_addr <= 8'hXX;
+            mgr_fat_bram_x_din   <= {18{1'bX}};
+            mgr_fat_bram_y_din   <= {18{1'bX}};
+        end
+           
+  
+    
+    
+    
+    task verify_ab;     // prints all 64 AB / AB_READ word pairs, flags every mismatch, then prints a one-line verdict
+        reg all_words_match;
+        begin
+            all_words_match = 1;
+            for (i=0; i<64; i=i+1)
+                if (AB_READ[i] !== AB[i]) begin
+                    $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x <???>", i, AB[i], AB_READ[i]);
+                    all_words_match = 0;
+                end else
+                    $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x", i, AB[i], AB_READ[i]);
+            // verdict
+            if (!all_words_match)
+                $display("AB is WRONG!");
+            else
+                $display("AB is OK.");
+        end
+    endtask
+
+
+
+    always @* begin
+        // combinational next-state logic; the assignment below gives fsm_state_next a value on every path
+        fsm_state_next = FSM_STATE_IDLE;
+        // per column the sequence is INIT -> TRIG -> BUSY, BUSY lasts until mult_square_addr_surely_done_flop is set
+        case (fsm_state)
+            FSM_STATE_IDLE:                   fsm_state_next = ena                   ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE;
+                        
+            FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_0_TRIG ;
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+            
+            FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_N_TRIG ;
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next =                         FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? fsm_state_after_mult_square    : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+            
+            FSM_STATE_MULT_SQUARE_HOLDOFF:    fsm_state_next =                         recomb_rdy ? FSM_STATE_IDLE : FSM_STATE_MULT_SQUARE_HOLDOFF;
+            
+            default:                          fsm_state_next =                         FSM_STATE_IDLE                   ;
+
+        endcase
+        // NOTE(review): column 0 always continues with COL_N_INIT, never with HOLDOFF -- presumably at least two columns are expected; confirm
+    end
+    
+    
+endmodule
+
diff --git a/rtl/dev/temp.txt b/rtl/dev/temp.txt
new file mode 100644
index 0000000..987bd86
--- /dev/null
+++ b/rtl/dev/temp.txt
@@ -0,0 +1,384 @@
+    //
+    // Helper Functions
+    //
+    /*
+    function  [INDEX_WIDTH-1:0] calc_preset_a_index;
+        input [INDEX_WIDTH-4:0] col_in;
+        input integer           x_in;
+              integer           index_out;
+        begin
+            index_out = col_in * NUM_MULTS + x_in;
+            calc_preset_a_index = index_out[INDEX_WIDTH-1:0];
+        end
+    endfunction
+
+    function  [INDEX_WIDTH-1:0] calc_rotate_a_index;
+        input [INDEX_WIDTH-1:0] current_index_in;
+        input [INDEX_WIDTH-1:0] last_index_in;
+        begin
+            if (current_index_in > {INDEX_WIDTH{1'b0}})
+                calc_rotate_a_index = current_index_in - 1'b1;
+            else
+                calc_rotate_a_index = last_index_in;
+        end
+    endfunction
+    */
+
+ /*
+    //
+    // Narrow Counters
+    //
+    reg        [INDEX_WIDTH-1:0] din_addr_narrow_reg;
+    reg        [INDEX_WIDTH-1:0] din_addr_narrow_dly;
+    localparam [INDEX_WIDTH-1:0] din_addr_narrow_zero = {INDEX_WIDTH{1'b0}};
+    wire       [INDEX_WIDTH-1:0] din_addr_narrow_next = (din_addr_narrow_reg < index_last) ?
+        din_addr_narrow_reg + 1'b1 : din_addr_narrow_zero;
+    wire                         din_addr_narrow_done = din_addr_narrow_reg == index_last;
+    
+    assign din_addr_narrow = din_addr_narrow_reg;
+    
+    always @(posedge clk)
+        //
+        din_addr_narrow_dly <= din_addr_narrow_reg;
+    
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG:   din_addr_narrow_reg <= din_addr_narrow_zero;
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY:   din_addr_narrow_reg <= din_addr_narrow_next;
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   din_addr_narrow_reg <= din_addr_narrow_zero;
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   din_addr_narrow_reg <= din_addr_narrow_next;
+        endcase
+    
+
+    //
+    // Helper Functions
+    //
+    function  [NUM_MULTS-1:0] calc_mac_clear_bitmask;
+        input [2:0] t;
+        begin
+            case (t)
+                3'd0: calc_mac_clear_bitmask = 8'b00000001;
+                3'd1: calc_mac_clear_bitmask = 8'b00000010;
+                3'd2: calc_mac_clear_bitmask = 8'b00000100;
+                3'd3: calc_mac_clear_bitmask = 8'b00001000;
+                3'd4: calc_mac_clear_bitmask = 8'b00010000;
+                3'd5: calc_mac_clear_bitmask = 8'b00100000;
+                3'd6: calc_mac_clear_bitmask = 8'b01000000;
+                3'd7: calc_mac_clear_bitmask = 8'b10000000;
+            endcase
+        end
+    endfunction
+    
+    function  [NUM_MULTS:0] calc_mac_clear_square;
+        input [INDEX_WIDTH-4:0] current_col_index;
+        input [INDEX_WIDTH-1:0] b_addr_prev;
+        begin
+            if (b_addr_prev[INDEX_WIDTH-1:3] == current_col_index)
+                calc_mac_clear_square = {1'b0, calc_mac_clear_bitmask(b_addr_prev[2:0])};
+            else
+                calc_mac_clear_square = {1'b0, {NUM_MULTS{1'b0}}};
+        end
+    endfunction
+        
+        
+    //
+    // Wide Counters
+    //
+    reg [INDEX_WIDTH-1:0] din_addr_wide_reg[0:NUM_MULTS-1];
+ 
+    integer xi;
+    always @(posedge clk)
+        //
+        for (xi=0; xi<NUM_MULTS; xi=xi+1)
+            //
+            case (fsm_state_next)
+                //
+                FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(0, xi);
+                FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(col_index + 1'b1, xi);
+                //
+                FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+                FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_wide_reg[xi] <= calc_rotate_a_index(din_addr_wide_reg[xi], index_last);
+                //
+            endcase
+        
+ 
+    //
+    // Enables
+    //
+    reg                 din_ena_narrow_reg = 1'b0;
+    reg [NUM_MULTS-1:0] din_ena_wide_reg = {NUM_MULTS{1'b0}};
+    
+    assign din_ena_narrow = din_ena_narrow_reg;
+    assign din_ena_wide   = din_ena_wide_reg;
+    
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) din_ena_narrow_reg <= 1'b0;
+        else case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_narrow_reg <= 1'b1;
+            default:                          din_ena_narrow_reg <= 1'b0;
+        endcase
+
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+        else case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_wide_reg <= {NUM_MULTS{1'b1}};
+            default:                          din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+        endcase
+        
+        
+    //
+    // Modes
+    //
+    reg [2-1:0] din_mode_wide_reg;
+    reg [2-1:0] din_mode_narrow_reg;
+    reg [2-1:0] dout_mode_wide_reg;
+    reg [2-1:0] dout_mode_narrow_reg;
+
+    assign din_mode_wide = din_mode_wide_reg;
+    assign din_mode_narrow = din_mode_narrow_reg;
+
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_wide_reg <= MODEXPNG_MODE_A;
+            default:                          din_mode_wide_reg <= 2'bXX;
+        endcase
+
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_narrow_reg <= MODEXPNG_MODE_B;
+            default:                          din_mode_narrow_reg <= 2'bXX;
+        endcase
+
+
+    //
+    // MAC Array
+    //
+    wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_a[0:NUM_MULTS];
+    wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_b;
+    reg  [          NUM_MULTS  :0] mac_ce;
+    reg  [          NUM_MULTS  :0] mac_clr;
+    wire [ MODEXPNG_MAC_WIDTH-1:0] mac_p[0:NUM_MULTS];
+    reg  [          NUM_MULTS  :0] mac_rdy_lsb;
+    reg  [          NUM_MULTS  :0] mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1:0];
+    
+    //reg  [          NUM_MULTS  :0] mac_ce_dly[MODEXPNG_MAC_LATENCY-1:0];
+    //wire [          NUM_MULTS  :0] mac_rdy;
+
+
+
+
+
+    assign mac_din_b = din_narrow;
+    
+    
+    genvar x;
+    generate for (x=0; x<=NUM_MULTS; x=x+1)
+        begin : gen_macs
+            //
+            //assign mac_rdy[x] = mac_ce_dly[MODEXPNG_MAC_LATENCY-1][x];
+            //
+            modexpng_mac mac_inst
+            (
+                .clk    (clk),
+                .ce     (mac_ce[x]),
+                .clr    (mac_clr[x]),
+                .a      (mac_din_a[x]),
+                .b      (mac_din_b),
+                .p      (mac_p[x])
+            );
+            //
+        end
+        //
+    endgenerate
+    
+    generate for (x=0; x<NUM_MULTS; x=x+1)
+        begin : gen_mac_din_a
+            //
+            assign mac_din_a[x] = din_wide[x*MODEXPNG_WORD_WIDTH+:MODEXPNG_WORD_WIDTH];
+            //
+        end
+    endgenerate
+
+    generate for (x=0; x<NUM_MULTS; x=x+1)
+        begin : gen_din_addr_wide
+            //
+            assign din_addr_wide[x*INDEX_WIDTH+:INDEX_WIDTH] = din_addr_wide_reg[x];
+            //
+        end
+    endgenerate
+ 
+ 
+    //
+    // MAC Clock Enable Logic
+    //
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+        else case (fsm_state)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_ce <= {1'b0, {NUM_MULTS{1'b1}}};
+            default:                          mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+        endcase
+    
+    
+    //
+    // MAC Valid Logic
+    //
+    integer y;
+
+    always @(posedge clk)
+        //
+        for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+            mac_rdy_lsb_dly[0][xi] <= mac_rdy_lsb[xi];
+            for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+                mac_rdy_lsb_dly[y][xi] <= mac_rdy_lsb_dly[y-1][xi];
+        end
+
+    always @(posedge clk) begin
+        //
+        fsm_state_dly[0] <= fsm_state;
+        for (y=1; y<=MODEXPNG_MAC_LATENCY; y=y+1)
+            fsm_state_dly[y] <= fsm_state_dly[y-1];
+    end
+
+    */
+
+    /*
+    always @(posedge clk)
+        //
+        for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+            mac_ce_dly[0][xi] <= mac_ce[xi];
+            for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+                mac_ce_dly[y][xi] <= mac_ce_dly[y-1][xi];
+        end
+    */
+    /*
+    always @(posedge clk)
+        //
+        for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+            mac_clr_dly[0][xi] <= mac_clr[xi];
+            for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+                mac_clr_dly[y][xi] <= mac_clr_dly[y-1][xi];
+        end
+    */
+
+  /*  
+    //
+    // MAC Clear Logic
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_clr <= {1'b0, {NUM_MULTS{1'b1}}};
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_clr <= calc_mac_clear_square(col_index, din_addr_narrow_dly);
+            default:                          mac_clr <= {1'bX, {NUM_MULTS{1'bX}}};
+        endcase
+
+
+    //
+    // MAC Ready Logic
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_rdy_lsb <= calc_mac_clear_square(col_index, din_addr_narrow);
+            default:                          mac_rdy_lsb <= {1'bX, {NUM_MULTS{1'bX}}};
+        endcase
+ 
+ 
+ 
+    //
+    // Recombinators
+    //
+    reg          rcmb_lsb_ce;
+    reg          rcmb_lsb_clr;
+    reg  [MODEXPNG_MAC_WIDTH-1: 0] rcmb_lsb_din;
+    wire [15: 0] rcmb_lsb_dout;
+    
+    modexpng_part_recombinator recomb_lsb
+    (
+        .clk    (clk),
+        .ce     (rcmb_lsb_ce),
+        .clr    (rcmb_lsb_clr),
+        .din    (rcmb_lsb_din),
+        .dout   (rcmb_lsb_dout)
+    );
+ 
+ 
+    reg calc_rcmb_lsb_ce;
+    always @*
+        //
+        calc_rcmb_lsb_ce = | mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0];
+
+    reg [MODEXPNG_MAC_WIDTH-1:0] calc_rcmb_lsb_din;
+    
+    always @*
+        //
+        casez (mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0])
+            8'b00000001: calc_rcmb_lsb_din = mac_p[0];
+            8'b00000010: calc_rcmb_lsb_din = mac_p[1];
+            8'b00000100: calc_rcmb_lsb_din = mac_p[2];
+            8'b00001000: calc_rcmb_lsb_din = mac_p[3];
+            8'b00010000: calc_rcmb_lsb_din = mac_p[4];
+            8'b00100000: calc_rcmb_lsb_din = mac_p[5];
+            8'b01000000: calc_rcmb_lsb_din = mac_p[6];
+            8'b10000000: calc_rcmb_lsb_din = mac_p[7];
+            default:     calc_rcmb_lsb_din = {MODEXPNG_MAC_WIDTH{1'bX}};
+        endcase
+ 
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0)
+            rcmb_lsb_ce <= 1'b0;
+        else case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_ce <= calc_rcmb_lsb_ce;
+            default:                          rcmb_lsb_ce <= 1'b0;
+        endcase
+
+    always @(posedge clk)
+        //
+        case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG: rcmb_lsb_clr <= 1'b1;
+            default:                          rcmb_lsb_clr <= 1'b0;
+        endcase
+
+    always @(posedge clk)
+        //
+        case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_din <= calc_rcmb_lsb_din;
+            default:                          rcmb_lsb_din <= {MODEXPNG_MAC_WIDTH{1'bX}};
+        endcase
+ 
+ 
+
+*/
diff --git a/rtl/dsp/dsp_array.v b/rtl/dsp/dsp_array.v
new file mode 100644
index 0000000..178f87f
--- /dev/null
+++ b/rtl/dsp/dsp_array.v
@@ -0,0 +1,111 @@
+module dsp_array   // bank of DSP48E1-based multiply-accumulate slices, built as NUM_MULTS/2 pairs of one direct and one cascade slice
+(
+    input             clk,
+    // clock enables forwarded to the slices (the A and B enables are additionally re-timed below)
+    input             ce_a,
+    input             ce_b,
+    input             ce_m,
+    input             ce_p,
+    input             ce_mode,
+    // one bit per slice, becomes bit 5 of that slice's OPMODE
+    input  [8   -1:0] mode_z,
+    // a: one 18-bit word per slice pair, b: one 17-bit word shared by all slices, p: one 47-bit result per slice
+    input  [4*18-1:0] a,
+    input  [1*17-1:0] b,
+    output [8*47-1:0] p
+);
+    // supplies NUM_MULTS; the fixed port and array sizes in this module presumably assume NUM_MULTS = 8 -- confirm
+    `include "../modexpng_parameters_x8.vh"
+    
+    wire [17:0] casc_a[0:3];    // A and B cascade links, from the direct slice of each pair to its cascade slice
+    wire [16:0] casc_b[0:3];
+    
+    wire ce_a0 = ce_a;          // A enable as received, plus copies delayed by one and by two clocks
+    reg  ce_a1 = 1'b0;
+    reg  ce_a2 = 1'b0;
+    
+    wire ce_b0 = ce_b;          // B enable as received, plus a copy delayed by one clock
+    reg  ce_b1 = 1'b0;
+    
+    always @(posedge clk) begin
+        ce_a1 <= ce_a0;
+        ce_a2 <= ce_a1;
+        ce_b1 <= ce_b0;
+    end
+        
+    
+    genvar z;
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        // pair z consists of slice 2*z (direct inputs) and slice 2*z+1 (inputs taken from the cascade)
+        begin : DSP48E1
+            // even slice: operands come straight from the a and b ports
+            dsp_slice #
+            (
+                .AB_INPUT("DIRECT"),
+                .B_REG(2)
+            )
+            dsp_direct
+            (
+                .clk            (clk),
+                
+                .ce_a1          (ce_a0),
+                .ce_b1          (ce_b0),
+                .ce_a2          (ce_a1),
+                .ce_b2          (ce_b1),
+                .ce_m           (ce_m),
+                .ce_p           (ce_p),
+                .ce_mode        (ce_mode),
+                
+                .a              (a[z*18+:18]),
+                .b              (b),
+                .p              (p[47*2*z+:47]),
+                // OPMODE = {Z, Y, X}: X = Y = 01 select the multiplier; Z = {0, mode_z, 0} selects zero or P, i.e. load vs. accumulate (UG479)
+                .inmode         (5'b00000),
+                .opmode         ({1'b0, mode_z[2*z], 1'b0, 2'b01, 2'b01}),
+                .alumode        (4'b0000),
+                // the cascade inputs are not used by a direct slice; tie them off at the port widths (18 and 17 bits)
+                .casc_a_in      ({18{1'b0}}),
+                .casc_b_in      ({17{1'b0}}),
+                
+                .casc_a_out     (casc_a[z]),
+                .casc_b_out     (casc_b[z])
+            );
+            // odd slice: operands arrive over the cascade, so its A enables are the ones of the even slice delayed by one clock
+            dsp_slice #
+            (
+                .AB_INPUT("CASCADE"),
+                .B_REG(1)
+            )
+            dsp_cascade
+            (
+                .clk            (clk),
+                
+                .ce_a1          (ce_a1),
+                .ce_b1          (1'b0),
+                .ce_a2          (ce_a2),
+                .ce_b2          (ce_b1),
+                .ce_m           (ce_m),
+                .ce_p           (ce_p),
+                .ce_mode        (ce_mode),
+                
+                .a              (a[z*18+:18]),
+                .b              (b),
+                .p              (p[47*(2*z+1)+:47]),
+                
+                .inmode         (5'b00000),
+                .opmode         ({1'b0, mode_z[2*z+1], 1'b0, 2'b01, 2'b01}),
+                .alumode        (4'b0000),
+                
+                .casc_a_in      (casc_a[z]),
+                .casc_b_in      (casc_b[z]),
+                
+                .casc_a_out     (),
+                .casc_b_out     ()
+            );
+            //            
+        end
+        //
+    endgenerate
+
+
+endmodule
diff --git a/rtl/dsp/dsp_slice.v b/rtl/dsp/dsp_slice.v
new file mode 100644
index 0000000..9f1298b
--- /dev/null
+++ b/rtl/dsp/dsp_slice.v
@@ -0,0 +1,125 @@
+module dsp_slice #     // thin wrapper around the Xilinx DSP48E1 primitive with narrowed data ports
+(
+    parameter AB_INPUT   = "DIRECT",    // passed to both A_INPUT and B_INPUT of the primitive
+    parameter B_REG      = 2            // passed to BREG of the primitive
+)
+(
+    input         clk,
+    input         ce_a1,
+    input         ce_b1,
+    input         ce_a2,
+    input         ce_b2,
+    input         ce_m,
+    input         ce_p,
+    input         ce_mode,
+    input  [17:0] a,
+    input  [16:0] b,
+    output [46:0] p,
+    input  [ 4:0] inmode,
+    input  [ 6:0] opmode,
+    input  [ 3:0] alumode,
+    input  [17:0] casc_a_in,
+    input  [16:0] casc_b_in,
+    output [17:0] casc_a_out,
+    output [16:0] casc_b_out
+);
+    // sinks for the primitive's output bits above the wrapper's port widths (ACOUT 30 -> 18, BCOUT 18 -> 17, P 48 -> 47)
+    wire [30-18-1:0] casc_a_dummy;
+    wire [18-17-1:0] casc_b_dummy;
+    wire [48-47-1:0] p_dummy;
+    
+    DSP48E1 #
+    (
+        .AREG                   (2),
+        .BREG                   (B_REG),
+        .CREG                   (0),
+        .DREG                   (0),
+        .ADREG                  (0),
+        .MREG                   (1),
+        .PREG                   (1),
+        .ACASCREG               (1),
+        .BCASCREG               (1),
+        .INMODEREG              (0),
+        .OPMODEREG              (1),
+        .ALUMODEREG             (0),
+        .CARRYINREG             (0),
+        .CARRYINSELREG          (0),
+
+        .A_INPUT                (AB_INPUT),
+        .B_INPUT                (AB_INPUT),
+
+        .USE_DPORT              ("FALSE"),
+        .USE_MULT               ("DYNAMIC"),
+        .USE_SIMD               ("ONE48"),
+
+        .MASK                   (48'h3fffffffffff),
+        .PATTERN                (48'h000000000000),
+        .SEL_MASK               ("MASK"),
+        .SEL_PATTERN            ("PATTERN"),
+        
+        .USE_PATTERN_DETECT     ("NO_PATDET"),
+        .AUTORESET_PATDET       ("NO_RESET")
+    )
+    DSP48E1_inst
+    (
+        .CLK                (clk),
+        // enables of the stages this wrapper does not use (AD, C, D, INMODE, ALUMODE, carry-in) are tied low
+        .CEA1               (ce_a1),
+        .CEB1               (ce_b1),
+        .CEA2               (ce_a2),
+        .CEB2               (ce_b2),
+        .CEAD               (1'b0),
+        .CEC                (1'b0),
+        .CED                (1'b0),
+        .CEM                (ce_m),
+        .CEP                (ce_p),
+        .CEINMODE           (1'b0),
+        .CECTRL             (ce_mode),
+        .CEALUMODE          (1'b0),
+        .CECARRYIN          (1'b0),
+        // narrow operands are zero-extended to the primitive's widths; C and D are held at zero
+        .A                  ({{(30-18){1'b0}}, a}),
+        .B                  ({{(18-17){1'b0}}, b}),
+        .C                  ({48{1'b0}}),
+        .D                  ({25{1'b0}}),
+        .P                  ({p_dummy, p}),
+        
+        .INMODE             (inmode),
+        .OPMODE             (opmode),
+        .ALUMODE            (alumode),
+        // only the A and B cascades are used; the P and carry cascades are tied off or left open
+        .ACIN               ({{(30-18){1'b0}}, casc_a_in}),
+        .BCIN               ({{(18-17){1'b0}}, casc_b_in}),
+        .ACOUT              ({casc_a_dummy, casc_a_out}),
+        .BCOUT              ({casc_b_dummy, casc_b_out}),
+        .PCIN               ({48{1'b0}}),
+        .PCOUT              (),
+        .CARRYCASCIN        (1'b0),
+        .CARRYCASCOUT       (),
+        // no resets are used
+        .RSTA               (1'b0),
+        .RSTB               (1'b0),
+        .RSTC               (1'b0),
+        .RSTD               (1'b0),
+        .RSTM               (1'b0),
+        .RSTP               (1'b0),
+        .RSTINMODE          (1'b0),
+        .RSTCTRL            (1'b0),
+        .RSTALUMODE         (1'b0),
+        .RSTALLCARRYIN      (1'b0),
+
+        .UNDERFLOW          (),
+        .OVERFLOW           (),
+        .PATTERNDETECT      (),
+        .PATTERNBDETECT     (),
+
+        .CARRYIN            (1'b0),
+        .CARRYOUT           (),
+        .CARRYINSEL         (3'b000),
+
+        .MULTSIGNIN         (1'b0),
+        .MULTSIGNOUT        ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mac.v b/rtl/modexpng_mac.v
new file mode 100644
index 0000000..9105dab
--- /dev/null
+++ b/rtl/modexpng_mac.v
@@ -0,0 +1,54 @@
+module modexpng_mac
+(
+    input  wire        clk,
+    input  wire        ce,
+    input  wire        clr,
+    input  wire        casc_a,
+    input  wire [16:0] a_in,
+    input  wire [16:0] b_in,
+    output wire [46:0] p_out,
+    input  wire [16:0] a_casc_in,
+    output wire [16:0] a_casc_out
+);
+
+    //
+    // Behavioral multiply-accumulate cell with three register stages:
+    // operand registers, a product register and an accumulator. The
+    // enable and clear controls are delayed together with the data, so
+    // each of them acts on the stage the matching operands have reached.
+    //
+    // stage 1: operands (A is taken from a_casc_in when casc_a is set, from a_in otherwise)
+    reg  [16:0] a_reg;
+    reg  [16:0] b_reg;
+    wire [16:0] a_mux = casc_a ? a_casc_in : a_in;
+    assign a_casc_out = a_reg;
+
+    // enable and clear delay lines
+    reg ce_dly1;
+    reg ce_dly2;
+    reg clr_dly1;
+    reg clr_dly2;
+
+    // stage 2: 34-bit product of the zero-extended operands
+    reg  [33:0] m_reg;
+    wire [33:0] a_wide = {{17{1'b0}}, a_reg};
+    wire [33:0] b_wide = {{17{1'b0}}, b_reg};
+    wire [46:0] m_reg_ext = {{13{1'b0}}, m_reg};
+
+    // stage 3: 47-bit accumulator
+    reg  [46:0] p_reg;
+    assign p_out = p_reg;
+
+    always @(posedge clk) begin
+        // the enable delay line shifts on every clock
+        ce_dly1 <= ce;
+        ce_dly2 <= ce_dly1;
+        // the clear flag advances one stage whenever that stage is enabled
+        if (ce)      clr_dly1 <= clr;
+        if (ce_dly1) clr_dly2 <= clr_dly1;
+        // data path: load operands, multiply, then either restart from the product or add it to the sum
+        if (ce)      {b_reg, a_reg} <= {b_in, a_mux};
+        if (ce_dly1) m_reg <= a_wide * b_wide;
+        if (ce_dly2) p_reg <= clr_dly2 ? m_reg_ext : p_reg + m_reg_ext;
+    end
+endmodule
diff --git a/rtl/modexpng_mac_array.v b/rtl/modexpng_mac_array.v
new file mode 100644
index 0000000..067929e
--- /dev/null
+++ b/rtl/modexpng_mac_array.v
@@ -0,0 +1,116 @@
+module modexpng_mac_array  // chain of NUM_MULTS modexpng_mac cells plus one auxiliary cell, linked through their A-cascade ports
+(
+    clk,
+    ce, clr,
+    ce_aux, clr_aux,
+    casc_a, casc_a_aux,
+    a_in, b_in, p_out,
+    a_in_aux, p_out_aux
+);
+    
+    
+    //
+    // Includes
+    // (presumably the source of NUM_MULTS, WORD_WIDTH and MAC_WIDTH -- the files are not visible here)
+    `include "modexpng_parameters.vh"
+    `include "modexpng_parameters_x8.vh"
+
+    
+    //
+    // Ports
+    // (ce and b_in are shared by the main cells; clr and a_in are per cell; casc_a has no bit for the first cell)
+    input                                clk;
+    input                                ce;
+    input  [NUM_MULTS              -1:0] clr;
+    input                                ce_aux;
+    input                                clr_aux;
+    input  [NUM_MULTS              -2:0] casc_a;
+    input                                casc_a_aux;
+    input  [NUM_MULTS * WORD_WIDTH -1:0] a_in;
+    input  [        1 * WORD_WIDTH -1:0] b_in;
+    output [NUM_MULTS * MAC_WIDTH  -1:0] p_out;
+    input  [        1 * WORD_WIDTH -1:0] a_in_aux;
+    output [        1 * MAC_WIDTH  -1:0] p_out_aux;
+  
+
+    //
+    // A-Cascade Paths
+    // (a_casc_int[k] runs from cell k to cell k+1; a_casc_int_aux runs from the last main cell to the auxiliary cell)
+    wire [WORD_WIDTH-1:0] a_casc_int[0:NUM_MULTS-2];
+    wire [WORD_WIDTH-1:0] a_casc_int_aux;
+    
+
+    //
+    // LSB
+    // (first cell: cascade input disabled and tied to zero)
+    modexpng_mac mac_lsb
+    (
+        .clk        (clk),
+        .ce         (ce),
+        .clr        (clr[0]),
+        .casc_a     (1'b0),
+        .a_in       (a_in[0+:WORD_WIDTH]),
+        .b_in       (b_in),
+        .p_out      (p_out[0+:MAC_WIDTH]),
+        .a_casc_in  ({WORD_WIDTH{1'b0}}),
+        .a_casc_out (a_casc_int[0])
+    );
+    
+    
+    //
+    // INT
+    // (cells 1 .. NUM_MULTS-2: cell z is fed by cell z-1 and selects its cascade input with casc_a[z-1])
+    genvar z;
+    generate for (z=1; z<(NUM_MULTS-1); z=z+1)
+        begin : gen_modexpng_mac_int
+            modexpng_mac mac_int
+            (
+                .clk        (clk),
+                .ce         (ce),
+                .clr        (clr[z]),
+                .casc_a     (casc_a[z-1]),
+                .a_in       (a_in[z*WORD_WIDTH+:WORD_WIDTH]),
+                .b_in       (b_in),
+                .p_out      (p_out[z*MAC_WIDTH+:MAC_WIDTH]),
+                .a_casc_in  (a_casc_int[z-1]),
+                .a_casc_out (a_casc_int[z])
+            );        
+        end
+    endgenerate
+    
+    
+    //
+    // MSB
+    // (last main cell: its cascade output goes on to the auxiliary cell)
+    modexpng_mac mac_msb
+    (
+        .clk        (clk),
+        .ce         (ce),
+        .clr        (clr[NUM_MULTS-1]),
+        .casc_a     (casc_a[NUM_MULTS-2]),
+        .a_in       (a_in[(NUM_MULTS-1)*WORD_WIDTH+:WORD_WIDTH]),
+        .b_in       (b_in),
+        .p_out      (p_out[(NUM_MULTS-1)*MAC_WIDTH+:MAC_WIDTH]),
+        .a_casc_in  (a_casc_int[NUM_MULTS-2]),
+        .a_casc_out (a_casc_int_aux)
+    );
+
+    
+    //
+    // AUX
+    // (extra cell with its own ce, clr, cascade select, A input and result; shares b_in; end of the cascade chain)
+    modexpng_mac mac_aux
+    (
+        .clk        (clk),
+        .ce         (ce_aux),
+        .clr        (clr_aux),
+        .casc_a     (casc_a_aux),
+        .a_in       (a_in_aux),
+        .b_in       (b_in),
+        .p_out      (p_out_aux),
+        .a_casc_in  (a_casc_int_aux),
+        .a_casc_out ()
+    );
+
+    
+endmodule
diff --git a/rtl/modexpng_mem.v b/rtl/modexpng_mem.v
new file mode 100644
index 0000000..ca89214
--- /dev/null
+++ b/rtl/modexpng_mem.v
@@ -0,0 +1,93 @@
+//
+// TODO: Add license text!
+//
+
+module modexpng_mem #
+(
+    parameter MEM_WIDTH     = 17,
+    parameter MEM_ADDR_BITS =  6
+)
+(
+    input   clk,
+
+    input  [MEM_ADDR_BITS-1:0] a_addr,
+    input                      a_en,
+    input                      a_wr,
+    input  [MEM_WIDTH    -1:0] a_in,
+    output [MEM_WIDTH    -1:0] a_out,
+
+    input  [MEM_ADDR_BITS-1:0] b_addr,
+    input                      b_en,
+    input                      b_reg_en,
+    output [MEM_WIDTH    -1:0] b_out
+);
+
+    //
+    // Two-port memory: port A only writes, port B only reads. A read
+    // passes through two registers (bram_b, then bram_b_reg), each one
+    // with its own enable.
+    //
+
+    //
+    // Storage Array
+    //
+    localparam MEM_DEPTH = 2**MEM_ADDR_BITS;
+
+    (* RAM_STYLE="BLOCK" *)
+    reg [MEM_WIDTH-1:0] bram[0:MEM_DEPTH-1];
+
+
+    //
+    // Optional zero-fill for simulation (left disabled)
+    //
+    /*
+    integer c;
+    initial begin
+        for (c=0; c<(2**MEM_ADDR_BITS); c=c+1)
+        bram[c] = {MEM_WIDTH{1'b0}};
+    end
+    */
+
+
+    //
+    // Read Pipeline Registers
+    //
+    reg [MEM_WIDTH-1:0] bram_b;
+    reg [MEM_WIDTH-1:0] bram_b_reg;
+
+    assign b_out = bram_b_reg;
+
+    // port A never reads, so its output is tied to a fixed marker value
+    // (the 32-bit literal is resized to MEM_WIDTH by the assignment)
+    assign a_out = 32'hDEADCE11;
+
+
+    //
+    // When both ports address the same location a conflict can potentially
+    // arise. See Xilinx UG473 (pages 19-20, "Conflict Avoidance") for more
+    // information. To avoid it in this configuration, the write port must
+    // be coded to operate in READ_FIRST mode: when it overwrites the
+    // address the read port is accessing, it must read the previously
+    // stored data (not the data being written, which would be WRITE_FIRST).
+    //
+
+
+    //
+    // Port A: write when both a_en and a_wr are set
+    //
+    always @(posedge clk)
+        //
+        if (a_en && a_wr) bram[a_addr] <= a_in;
+
+
+    //
+    // Port B: array read into bram_b, then bram_b into the output register
+    //
+    always @(posedge clk) begin
+        if (b_en)     bram_b     <= bram[b_addr];
+        if (b_reg_en) bram_b_reg <= bram_b;
+        //
+    end
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_col_index.v b/rtl/modexpng_mmm_col_index.v
new file mode 100644
index 0000000..b904795
--- /dev/null
+++ b/rtl/modexpng_mmm_col_index.v
@@ -0,0 +1,90 @@
+module modexpng_mmm_col_index
+(
+    clk,
+    index_last,
+    fsm_state_next,
+    col_index,
+    col_index_done,
+    col_index_zero,
+    col_index_next,
+    col_index_prev
+);
+
+
+    //
+    // Column index counter.
+    //
+    // The counter is reloaded with zero when the FSM is about to enter
+    // MULT_SQUARE_COL_0_TRIG and is advanced by one when the FSM is about to
+    // enter MULT_SQUARE_COL_N_TRIG; it holds its value in all other states.
+    // The terminal value is captured from the upper bits of index_last
+    // together with the reload.
+    //
+
+
+    //
+    // Includes
+    //
+    //`include "modexpng_parameters.vh"
+    //`include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+
+
+    //
+    // Ports
+    //
+    input                        clk;               // clock
+    input  [    INDEX_WIDTH-1:0] index_last;        // last word index (three LSBs are dropped here)
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;    // state the FSM enters on this clock edge
+    output [    INDEX_WIDTH-4:0] col_index;         // current column
+    output                       col_index_done;    // current column equals the latched last column
+    output [    INDEX_WIDTH-4:0] col_index_zero;    // constant zero of column width
+    output [    INDEX_WIDTH-4:0] col_index_next;    // current column plus one (wraps on overflow)
+    output [    INDEX_WIDTH-4:0] col_index_prev;    // current column delayed by one clock
+
+
+    //
+    // State
+    //
+    reg  [INDEX_WIDTH-4:0] col_index_reg;
+    reg  [INDEX_WIDTH-4:0] col_index_last;
+    reg  [INDEX_WIDTH-4:0] col_index_dly;
+
+
+    //
+    // Outputs
+    //
+    assign col_index      = col_index_reg;
+    assign col_index_prev = col_index_dly;
+    assign col_index_zero = {(INDEX_WIDTH-3){1'b0}};
+    assign col_index_next = col_index_reg + 1'b1;
+    assign col_index_done = (col_index_reg == col_index_last);
+
+
+    //
+    // Counter and One-Cycle Delay
+    //
+    always @(posedge clk) begin
+        //
+        if (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG) begin
+            // start of a new pass: rewind and remember where to stop
+            col_index_reg  <= {(INDEX_WIDTH-3){1'b0}};
+            col_index_last <= index_last[INDEX_WIDTH-1:3];
+        end else if (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG)
+            // move on to the following column
+            col_index_reg  <= col_index_reg + 1'b1;
+        //
+        // non-blocking read, so this samples the value before any update above
+        //
+        col_index_dly <= col_index_reg;
+        //
+    end
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_din_addr.v b/rtl/modexpng_mmm_din_addr.v
new file mode 100644
index 0000000..565c7e0
--- /dev/null
+++ b/rtl/modexpng_mmm_din_addr.v
@@ -0,0 +1,167 @@
+module modexpng_mmm_din_addr
+(
+    clk, rst_n,
+    index_last,
+    fsm_state_next,
+    col_index_zero, col_index_next,
+    din_addr, din_bank, din_ena, din_reg_ena,
+    din_addr_cnt, din_addr_cnt_last,
+    din_addr_cnt_lower_prev, din_addr_cnt_upper_prev
+);
+
+
+    //
+    // Read-side ("din") address, bank and enable generator.
+    //
+    // While the FSM is in (or about to enter) one of the MULT_SQUARE_COL_*
+    // TRIG/BUSY states this module
+    //   * asserts din_ena and, one clock later, din_reg_ena,
+    //   * selects bank BANK_XY_T1T2 (BANK_XY_ANY otherwise),
+    //   * loads the word address with {column, 3'b000} in a TRIG state and
+    //     steps it backwards in the BUSY states, wrapping from zero to the
+    //     latched index_last,
+    //   * runs an up-counter (din_addr_cnt) that is cleared in a TRIG state
+    //     and incremented in the BUSY states.
+    //
+
+
+    //
+    // Includes
+    //
+    `include "modexpng_parameters.vh"
+    //`include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+    
+    
+    //
+    // Ports
+    //
+    input                        clk;                       // clock
+    input                        rst_n;                     // asynchronous reset, active low (enables only)
+    input  [    INDEX_WIDTH-1:0] index_last;                // last word index, latched in COL_0_TRIG
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;            // state the FSM enters on this clock edge
+    input  [    INDEX_WIDTH-4:0] col_index_zero;            // column loaded in COL_0_TRIG
+    input  [    INDEX_WIDTH-4:0] col_index_next;            // column loaded in COL_N_TRIG
+    output [    INDEX_WIDTH-4:0] din_addr;                  // upper bits of the word address
+    output [              3-1:0] din_bank;                  // bank select
+    output [              1-1:0] din_ena;                   // read enable
+    output [              1-1:0] din_reg_ena;               // din_ena delayed by one clock
+    output [    INDEX_WIDTH-1:0] din_addr_cnt;              // up-counter value
+    output [    INDEX_WIDTH-1:0] din_addr_cnt_last;         // latched terminal value for the counter
+    output [              3-1:0] din_addr_cnt_lower_prev;   // counter bits [2:0], delayed by one clock
+    output [    INDEX_WIDTH-4:0] din_addr_cnt_upper_prev;   // counter bits [MSB:3], delayed by one clock
+    
+ 
+    //
+    // Address
+    //
+    reg  [INDEX_WIDTH-1:0] din_addr_reg;
+    wire [INDEX_WIDTH-1:0] din_addr_zero = {INDEX_WIDTH{1'b0}};
+    reg  [INDEX_WIDTH-1:0] din_addr_last;
+    // step backwards, wrapping from zero to the last index
+    wire [INDEX_WIDTH-1:0] din_addr_prev = (din_addr_reg == din_addr_zero) ? din_addr_last : din_addr_reg - 1'b1;
+    
+    reg  [INDEX_WIDTH-1:0] din_addr_cnt_reg;
+    wire [INDEX_WIDTH-1:0] din_addr_cnt_zero = {INDEX_WIDTH{1'b0}};
+    wire [INDEX_WIDTH-1:0] din_addr_cnt_next = din_addr_cnt_reg + 1'b1;
+    reg  [INDEX_WIDTH-1:0] din_addr_cnt_last_reg;
+    wire [          3-1:0] din_addr_cnt_lower = din_addr_cnt_reg[          3-1:0];
+    wire [INDEX_WIDTH-4:0] din_addr_cnt_upper = din_addr_cnt_reg[INDEX_WIDTH-1:3];
+    reg  [          3-1:0] din_addr_cnt_lower_dly;
+    reg  [INDEX_WIDTH-4:0] din_addr_cnt_upper_dly;
+
+    reg  [          3-1:0] din_bank_reg;
+
+
+    //
+    // Enables
+    //
+    reg din_ena_reg = 1'b0;
+    reg din_reg_ena_reg = 1'b0;
+    
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n)
+            din_ena_reg <= 1'b0;
+        else case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                din_ena_reg <= 1'b1;
+            //
+            default:
+                din_ena_reg <= 1'b0;
+            //
+        endcase
+
+    // register enable trails the read enable by exactly one clock
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n)
+            din_reg_ena_reg <= 1'b0;
+        else
+            din_reg_ena_reg <= din_ena_reg;
+
+
+    //
+    // Address Mapping
+    //
+    assign din_addr                = din_addr_reg[INDEX_WIDTH-1:3];
+
+    assign din_addr_cnt            = din_addr_cnt_reg;
+    assign din_addr_cnt_last       = din_addr_cnt_last_reg;
+    assign din_addr_cnt_lower_prev = din_addr_cnt_lower_dly;
+    assign din_addr_cnt_upper_prev = din_addr_cnt_upper_dly;
+
+    assign din_bank                = din_bank_reg;
+    
+    
+    //
+    // Enable Mapping
+    //
+    assign din_ena = din_ena_reg;
+    assign din_reg_ena = din_reg_ena_reg;
+
+
+    //
+    // Delay
+    //
+    always @(posedge clk) begin
+        din_addr_cnt_lower_dly <= din_addr_cnt_lower;
+        din_addr_cnt_upper_dly <= din_addr_cnt_upper;
+    end
+
+
+    //
+    // Address and Counter Update
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin
+                din_addr_reg          <= {col_index_zero, {3{1'b0}}};
+                din_addr_last         <= index_last;
+                din_addr_cnt_reg      <= din_addr_cnt_zero;
+                din_addr_cnt_last_reg <= index_last;
+            end
+            //
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG: begin
+                din_addr_reg     <= {col_index_next, {3{1'b0}}};
+                din_addr_cnt_reg <= din_addr_cnt_zero;
+            end
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+                din_addr_reg     <= din_addr_prev;
+                din_addr_cnt_reg <= din_addr_cnt_next;
+            end
+            //
+            // all other states: hold
+            //
+        endcase
+
+
+    //
+    // Bank Select
+    //
+    // Non-blocking assignments are used here like for every other register
+    // in this module: a blocking assignment in a clocked process would let
+    // other processes triggered by the same clock edge observe either the
+    // old or the new value depending on simulator scheduling.
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                din_bank_reg <= BANK_XY_T1T2;
+            //
+            default:
+                din_bank_reg <= BANK_XY_ANY;
+            //
+        endcase
+        
+endmodule
diff --git a/rtl/modexpng_mmm_dout_addr.v b/rtl/modexpng_mmm_dout_addr.v
new file mode 100644
index 0000000..3749d82
--- /dev/null
+++ b/rtl/modexpng_mmm_dout_addr.v
@@ -0,0 +1,167 @@
+module modexpng_mmm_dout_addr
+(
+    clk, rst_n,
+    fsm_state,
+    load_xy_addr,
+    load_addr_zero,
+    load_nn_coeff_addr_done,
+    x_dout_addr, y_dout_addr,
+    x_dout_ena,  y_dout_ena,
+    x_dout_bank, y_dout_bank
+);
+
+
+    //
+    // Write-side ("dout") address, bank and per-lane enable generator used
+    // during the load states.
+    //
+    // In the *_2 load states a one-hot lane mask is either initialised (when
+    // the load address is zero) or rotated left by one position. In the *_3
+    // load states the mask, the address and the bank are presented on the
+    // outputs; in every other state the enables are all-zero, the bank is
+    // BANK_XY_ANY and the address is don't-care.
+    //
+
+
+    //
+    // Includes
+    //
+    `include "modexpng_parameters.vh"
+    `include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+
+
+    //
+    // Ports
+    //
+    input                        clk;                       // clock
+    input                        rst_n;                     // asynchronous reset, active low (enables only)
+    input  [FSM_STATE_WIDTH-1:0] fsm_state;                 // current FSM state
+    input  [      INDEX_WIDTH:0] load_xy_addr;              // load address (only bits [INDEX_WIDTH-1:3] are used)
+    input                        load_addr_zero;            // load address is zero
+    input                        load_nn_coeff_addr_done;   // load address reached the N_COEFF terminal value
+    output [    INDEX_WIDTH-4:0] x_dout_addr;
+    output [    INDEX_WIDTH-4:0] y_dout_addr;
+    output [      NUM_MULTS-1:0] x_dout_ena;
+    output [      NUM_MULTS-1:0] y_dout_ena;
+    output [              3-1:0] x_dout_bank;
+    output [              3-1:0] y_dout_bank;
+
+
+    //
+    // Registers
+    //
+    reg [INDEX_WIDTH-4:0] x_dout_addr_reg;
+    reg [INDEX_WIDTH-4:0] y_dout_addr_reg;
+
+    reg [  NUM_MULTS-1:0] x_dout_ena_reg = {NUM_MULTS{1'b0}};
+    reg [  NUM_MULTS-1:0] y_dout_ena_reg = {NUM_MULTS{1'b0}};
+
+    reg [  NUM_MULTS-1:0] x_dout_ena_int;
+    reg [  NUM_MULTS-1:0] y_dout_ena_int;
+
+    reg [          3-1:0] x_dout_bank_reg;
+    reg [          3-1:0] y_dout_bank_reg;
+
+
+    //
+    // Output Mapping
+    //
+    assign x_dout_addr = x_dout_addr_reg;
+    assign y_dout_addr = y_dout_addr_reg;
+
+    assign x_dout_ena  = x_dout_ena_reg;
+    assign y_dout_ena  = y_dout_ena_reg;
+
+    assign x_dout_bank = x_dout_bank_reg;
+    assign y_dout_bank = y_dout_bank_reg;
+
+
+    //
+    // Handy Wires
+    //
+    wire [INDEX_WIDTH-4:0] load_xy_addr_upper = load_xy_addr[INDEX_WIDTH-1:3];
+
+    // initial lane mask: only lane 0 enabled
+    wire [NUM_MULTS-1:0] load_xy_ena_init = {{(NUM_MULTS-1){1'b0}}, 1'b1};
+
+    // lane masks rotated left by one position
+    wire [NUM_MULTS-1:0] x_dout_ena_rot = {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1]};
+    wire [NUM_MULTS-1:0] y_dout_ena_rot = {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+
+    // same rotation for X, but the wrapped-around bit is suppressed once the
+    // N_COEFF terminal address has been reached
+    wire [NUM_MULTS-1:0] x_dout_ena_rot_gated = {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1] & ~load_nn_coeff_addr_done};
+
+
+    //
+    // Address and Bank
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state)
+            //
+            FSM_STATE_LOAD_T1T2_3: begin
+                x_dout_addr_reg <= load_xy_addr_upper;
+                y_dout_addr_reg <= load_xy_addr_upper;
+                x_dout_bank_reg <= BANK_X_T1;
+                y_dout_bank_reg <= BANK_Y_T2;
+            end
+            //
+            FSM_STATE_LOAD_NN_COEFF_3: begin
+                // the word at the terminal address is redirected to the AUX bank
+                x_dout_addr_reg <= load_nn_coeff_addr_done ? BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0] : load_xy_addr_upper;
+                y_dout_addr_reg <= load_nn_coeff_addr_done ? BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0] : load_xy_addr_upper;
+                x_dout_bank_reg <= load_nn_coeff_addr_done ? BANK_XY_AUX : BANK_X_N;
+                y_dout_bank_reg <= load_nn_coeff_addr_done ? BANK_XY_AUX : BANK_Y_N_COEFF;
+            end
+            //
+            default: begin
+                x_dout_addr_reg <= {(INDEX_WIDTH-3){1'bX}};
+                y_dout_addr_reg <= {(INDEX_WIDTH-3){1'bX}};
+                x_dout_bank_reg <= BANK_XY_ANY;
+                y_dout_bank_reg <= BANK_XY_ANY;
+            end
+            //
+        endcase
+
+
+    //
+    // Internal Lane Masks (held in all states not listed)
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state)
+            //
+            FSM_STATE_LOAD_T1T2_2: begin
+                x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : x_dout_ena_rot;
+                y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : y_dout_ena_rot;
+            end
+            //
+            FSM_STATE_LOAD_NN_COEFF_2: begin
+                x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : x_dout_ena_rot_gated;
+                y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : y_dout_ena_rot;
+            end
+            //
+        endcase
+
+
+    //
+    // Output Enables
+    //
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n) begin
+            x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+            y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+        end else case (fsm_state)
+            //
+            FSM_STATE_LOAD_T1T2_3,
+            FSM_STATE_LOAD_NN_COEFF_3: begin
+                x_dout_ena_reg <= x_dout_ena_int;
+                y_dout_ena_reg <= y_dout_ena_int;
+            end
+            //
+            default: begin
+                x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+                y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+            end
+            //
+        endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm.vh
new file mode 100644
index 0000000..c237a0b
--- /dev/null
+++ b/rtl/modexpng_mmm_fsm.vh
@@ -0,0 +1,24 @@
+//
+// FSM state encodings shared by the modexpng_mmm_* modules (this header is
+// `include'd inside each module body).
+//
+
+// width of the state vectors; all state constants below are sized to it
+localparam FSM_STATE_WIDTH = 32;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_IDLE = 0;
+    
+// load phase 0 (T1, T2), three steps per word
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_1 = 1;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_2 = 2;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_3 = 3;
+
+// load phase 1 (N, N_COEFF), three steps per word
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_1 = 4;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_2 = 5;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_3 = 6;
+
+// "square" multiplication, first column (values 7..10 are not assigned here)
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_INIT = 11;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_TRIG = 12;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_BUSY = 13;
+
+// "square" multiplication, every following column
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_INIT = 14;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_TRIG = 15;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_BUSY = 16;
+    
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_HOLDOFF = 17;
+    
+// NOTE(review): looks like a temporary terminal state for bring-up - confirm
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_STOP = 999;
+    
\ No newline at end of file
diff --git a/rtl/modexpng_mmm_pad.v b/rtl/modexpng_mmm_pad.v
new file mode 100644
index 0000000..a2a21ff
--- /dev/null
+++ b/rtl/modexpng_mmm_pad.v
@@ -0,0 +1,153 @@
+module modexpng_mmm_pad
+(
+    clk, rst_n,
+    fsm_state,
+    load_xy_addr_lsb,
+    pad_x_rd_addr, pad_y_rd_addr,
+    pad_x_rd_ena,  pad_y_rd_ena,
+    pad_x_rd_dout, pad_y_rd_dout,
+    load_x_din,    load_y_din
+);
+
+
+    //
+    // Pair of scratchpad memories (X and Y).
+    //
+    // The write side is driven internally: while the FSM is in LOAD_T1T2_3
+    // the incoming load words are written at load_xy_addr_lsb. Write address,
+    // data and enable are all registered, so the actual memory write happens
+    // one clock after that state was observed. The read side is driven
+    // entirely by the pad_*_rd_* ports.
+    //
+
+
+    //
+    // Includes
+    //
+    `include "modexpng_parameters.vh"
+    //`include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+
+
+    //
+    // Ports
+    //
+    input                        clk;               // clock
+    input                        rst_n;             // asynchronous reset, active low (write enables only)
+    input  [FSM_STATE_WIDTH-1:0] fsm_state;         // current FSM state
+
+    input  [    INDEX_WIDTH-1:0] load_xy_addr_lsb;  // write address for both memories
+
+    input  [     WORD_WIDTH-1:0] load_x_din;        // write data, X memory
+    input  [     WORD_WIDTH-1:0] load_y_din;        // write data, Y memory
+
+    input  [    INDEX_WIDTH-1:0] pad_x_rd_addr;     // read address, X memory
+    input  [    INDEX_WIDTH-1:0] pad_y_rd_addr;     // read address, Y memory
+
+    input                        pad_x_rd_ena;      // read enable, X memory
+    input                        pad_y_rd_ena;      // read enable, Y memory
+
+    output [     WORD_WIDTH-1:0] pad_x_rd_dout;     // read data, X memory
+    output [     WORD_WIDTH-1:0] pad_y_rd_dout;     // read data, Y memory
+
+
+    //
+    // Write-Side Registers
+    //
+    reg [INDEX_WIDTH-1:0] pad_x_wr_addr;
+    reg [INDEX_WIDTH-1:0] pad_y_wr_addr;
+    reg                   pad_x_wr_ena;
+    reg                   pad_y_wr_ena;
+    reg [ WORD_WIDTH-1:0] pad_x_wr_din;
+    reg [ WORD_WIDTH-1:0] pad_y_wr_din;
+
+
+    //
+    // Write Address and Data
+    //
+    always @(posedge clk) begin
+        //
+        // the address is only meaningful while loading T1/T2
+        //
+        if (fsm_state == FSM_STATE_LOAD_T1T2_3) begin
+            pad_x_wr_addr <= load_xy_addr_lsb;
+            pad_y_wr_addr <= load_xy_addr_lsb;
+        end else begin
+            pad_x_wr_addr <= {INDEX_WIDTH{1'bX}};
+            pad_y_wr_addr <= {INDEX_WIDTH{1'bX}};
+        end
+        //
+        // the data is sampled on every clock regardless of the state
+        //
+        pad_x_wr_din <= load_x_din;
+        pad_y_wr_din <= load_y_din;
+        //
+    end
+
+
+    //
+    // Write Enable
+    //
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n) begin
+            pad_x_wr_ena <= 1'b0;
+            pad_y_wr_ena <= 1'b0;
+        end else if (fsm_state == FSM_STATE_LOAD_T1T2_3) begin
+            pad_x_wr_ena <= 1'b1;
+            pad_y_wr_ena <= 1'b1;
+        end else begin
+            pad_x_wr_ena <= 1'b0;
+            pad_y_wr_ena <= 1'b0;
+        end
+
+
+    //
+    // Memories
+    //
+    // NOTE(review): only the ports listed below are connected - verify the
+    // list against the port list of bram_1wo_1ro_readfirst_ce.
+    //
+    bram_1wo_1ro_readfirst_ce #
+    (
+        .MEM_WIDTH      (WORD_WIDTH),
+        .MEM_ADDR_BITS  (INDEX_WIDTH)
+    )
+    pad_x
+    (
+        .clk        (clk),
+
+        .a_addr     (pad_x_wr_addr),
+        .a_en       (pad_x_wr_ena),
+        .a_wr       (pad_x_wr_ena),
+        .a_in       (pad_x_wr_din),
+        .a_out      (), // unused
+
+        .b_addr     (pad_x_rd_addr),
+        .b_en       (pad_x_rd_ena),
+        .b_out      (pad_x_rd_dout)
+    );
+
+    bram_1wo_1ro_readfirst_ce #
+    (
+        .MEM_WIDTH      (WORD_WIDTH),
+        .MEM_ADDR_BITS  (INDEX_WIDTH)
+    )
+    pad_y
+    (
+        .clk        (clk),
+
+        .a_addr     (pad_y_wr_addr),
+        .a_en       (pad_y_wr_ena),
+        .a_wr       (pad_y_wr_ena),
+        .a_in       (pad_y_wr_din),
+        .a_out      (), // unused
+
+        .b_addr     (pad_y_rd_addr),
+        .b_en       (pad_y_rd_ena),
+        .b_out      (pad_y_rd_dout)
+    );
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_transporter.v b/rtl/modexpng_mmm_transporter.v
new file mode 100644
index 0000000..a8f309a
--- /dev/null
+++ b/rtl/modexpng_mmm_transporter.v
@@ -0,0 +1,157 @@
+module modexpng_mmm_transporter
+(
+    clk,
+    ena,
+    index_last,
+    fsm_state,
+    fsm_state_next,
+    load_phase,
+    load_xy_addr,
+    load_xy_addr_vld,
+    load_xy_req,
+    load_addr_zero,
+    load_t1t2_addr_done,
+    load_nn_coeff_addr_done
+);
+
+
+    //
+    // Load address generator.
+    //
+    // Produces the load address together with its "address valid" and "data
+    // request" strobes and the load phase flag, all registered and derived
+    // from the state the FSM is about to enter. The address is advanced each
+    // time a *_1 load state is re-entered from the matching *_3 state and is
+    // cleared when a *_1 state is entered from anywhere else.
+    //
+
+
+    //
+    // Includes
+    //
+    //`include "modexpng_parameters.vh"
+    //`include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+
+
+    //
+    // Ports
+    //
+    input                        clk;                       // clock
+    input                        ena;                       // qualifies the index_last capture in IDLE
+    input  [    INDEX_WIDTH-1:0] index_last;                // last word index
+    input  [FSM_STATE_WIDTH-1:0] fsm_state;                 // current FSM state
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;            // state the FSM enters on this clock edge
+    output                       load_phase;                // 0 in LOAD_T1T2_*, 1 in LOAD_NN_COEFF_*, X otherwise
+    output [      INDEX_WIDTH:0] load_xy_addr;              // load address
+    output                       load_xy_addr_vld;          // high after entering a *_1 load state
+    output                       load_xy_req;               // high after entering a *_2 load state
+    output                       load_addr_zero;            // load address is zero
+    output                       load_t1t2_addr_done;       // load address equals index_last
+    output                       load_nn_coeff_addr_done;   // load address equals index_last + 1
+
+
+    //
+    // Registers
+    //
+    reg                 load_phase_reg;
+    reg [INDEX_WIDTH:0] load_xy_addr_reg;
+    reg                 load_xy_addr_vld_reg;
+    reg                 load_xy_req_reg;
+
+    reg [INDEX_WIDTH:0] load_t1t2_addr_last;
+    reg [INDEX_WIDTH:0] load_nn_coeff_addr_last;
+
+
+    //
+    // Handy Quantities
+    //
+    wire [INDEX_WIDTH:0] load_xy_addr_zero = {(INDEX_WIDTH+1){1'b0}};
+    wire [INDEX_WIDTH:0] load_xy_addr_xxx  = {(INDEX_WIDTH+1){1'bX}};
+    wire [INDEX_WIDTH:0] load_xy_addr_next = load_xy_addr_reg + 1'b1;
+
+
+    //
+    // Output Mapping
+    //
+    assign load_phase       = load_phase_reg;
+    assign load_xy_addr     = load_xy_addr_reg;
+    assign load_xy_addr_vld = load_xy_addr_vld_reg;
+    assign load_xy_req      = load_xy_req_reg;
+
+
+    //
+    // Flags
+    //
+    assign load_addr_zero          = (load_xy_addr_reg == load_xy_addr_zero);
+    assign load_t1t2_addr_done     = (load_xy_addr_reg == load_t1t2_addr_last);
+    assign load_nn_coeff_addr_done = (load_xy_addr_reg == load_nn_coeff_addr_last);
+
+
+    //
+    // Terminal Addresses (captured while idle and enabled)
+    //
+    always @(posedge clk)
+        //
+        if (ena && (fsm_state == FSM_STATE_IDLE)) begin
+            load_t1t2_addr_last     <= {1'b0, index_last};
+            load_nn_coeff_addr_last <= {1'b0, index_last} + 1'b1;   // one word more than T1/T2
+        end
+
+
+    //
+    // Load Phase
+    //
+    always @(posedge clk)
+        //
+        if ((fsm_state_next == FSM_STATE_LOAD_T1T2_1) ||
+            (fsm_state_next == FSM_STATE_LOAD_T1T2_2) ||
+            (fsm_state_next == FSM_STATE_LOAD_T1T2_3))
+            load_phase_reg <= 1'b0;
+        else if ((fsm_state_next == FSM_STATE_LOAD_NN_COEFF_1) ||
+                 (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_2) ||
+                 (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_3))
+            load_phase_reg <= 1'b1;
+        else
+            load_phase_reg <= 1'bX;
+
+
+    //
+    // Load Address
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            // entering step 1: advance if coming from step 3, restart otherwise
+            //
+            FSM_STATE_LOAD_T1T2_1:
+                load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_T1T2_3) ? load_xy_addr_next : load_xy_addr_zero;
+            FSM_STATE_LOAD_NN_COEFF_1:
+                load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_NN_COEFF_3) ? load_xy_addr_next : load_xy_addr_zero;
+            //
+            // steps 2 and 3: hold
+            //
+            FSM_STATE_LOAD_T1T2_2,
+            FSM_STATE_LOAD_T1T2_3,
+            FSM_STATE_LOAD_NN_COEFF_2,
+            FSM_STATE_LOAD_NN_COEFF_3:
+                ;
+            //
+            // not loading: don't care
+            //
+            default:
+                load_xy_addr_reg <= load_xy_addr_xxx;
+            //
+        endcase
+
+
+    //
+    // Address Valid and Data Request Strobes
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            FSM_STATE_LOAD_T1T2_1,
+            FSM_STATE_LOAD_NN_COEFF_1: begin
+                load_xy_addr_vld_reg <= 1'b1;
+                load_xy_req_reg      <= 1'b0;
+            end
+            //
+            FSM_STATE_LOAD_T1T2_2,
+            FSM_STATE_LOAD_NN_COEFF_2: begin
+                load_xy_addr_vld_reg <= 1'b0;
+                load_xy_req_reg      <= 1'b1;
+            end
+            //
+            default: begin
+                load_xy_addr_vld_reg <= 1'b0;
+                load_xy_req_reg      <= 1'b0;
+            end
+            //
+        endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_x8_dual.v b/rtl/modexpng_mmm_x8_dual.v
new file mode 100644
index 0000000..99a37fa
--- /dev/null
+++ b/rtl/modexpng_mmm_x8_dual.v
@@ -0,0 +1,550 @@
+module modexpng_mmm_x8_dual
+(
+    clk, rst_n,
+    ena, rdy,
+    mode, transfer,
+    index_last,
+    x_din,      y_din,      x_dout,      y_dout,
+    x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
+    x_din_ena,  y_din_ena,  x_dout_ena,  y_dout_ena, x_din_reg_ena, y_din_reg_ena,
+    x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
+    load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
+    load_x_din, load_y_din
+);
+
+
+    //
+    // Includes
+    //
+    `include "modexpng_parameters.vh"
+    `include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+    
+    
+    //
+    // Ports
+    //
+    input  clk;
+    input  rst_n;
+    
+    input  ena;
+    output rdy;
+        
+    input  mode;        // multiply:    0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
+                        // load/unload: 0 = load, 1 = unload
+    input  transfer;    // 0 = multiply, 1 = load/unload
+        
+    input  [INDEX_WIDTH-1:0] index_last;
+
+    input  [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+    input  [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+    output [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+    output [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+    
+    output [INDEX_WIDTH-4:0] x_din_addr;
+    output [INDEX_WIDTH-4:0] y_din_addr;
+    output [INDEX_WIDTH-4:0] x_dout_addr;
+    output [INDEX_WIDTH-4:0] y_dout_addr;
+    
+    output [        1-1:0] x_din_ena;
+    output [        1-1:0] y_din_ena;
+    output [NUM_MULTS-1:0] x_dout_ena;
+    output [NUM_MULTS-1:0] y_dout_ena;
+    output [        1-1:0] x_din_reg_ena;
+    output [        1-1:0] y_din_reg_ena;
+    
+    output [3-1:0] x_din_bank;
+    output [3-1:0] y_din_bank;
+    output [3-1:0] x_dout_bank;
+    output [3-1:0] y_dout_bank;
+    
+    output                  load_phase;         // 0 = T1, T2; 1 = N, N_COEFF
+    output [ INDEX_WIDTH:0] load_xy_addr;       // address
+    output                  load_xy_addr_vld;   // address valid
+    output                  load_xy_req;        // data request
+    
+    input  [WORD_WIDTH-1:0] load_x_din;         // data input
+    input  [WORD_WIDTH-1:0] load_y_din;         // data input
+
+
+    //
+    // FSM State and Next States
+    //
+    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+    reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
+    reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+    
+
+    //
+    // FSM Idle Next State
+    //
+    always @*
+        //
+        case ({transfer, mode})
+            2'b00,
+            2'b01: fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
+            2'b10: fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1;
+            2'b11: fsm_state_after_idle = FSM_STATE_IDLE; //unload?
+        endcase
+
+
+    //
+    // Column Counter
+    //
+    wire [    INDEX_WIDTH-4:0] col_index;
+    wire                       col_index_done;
+    wire [    INDEX_WIDTH-4:0] col_index_zero;
+    wire [    INDEX_WIDTH-4:0] col_index_next;
+    wire [    INDEX_WIDTH-4:0] col_index_prev;
+
+    modexpng_mmm_col_index #
+    (
+        .INDEX_WIDTH(INDEX_WIDTH)
+    )
+    mmm_col_index
+    (
+        .clk            (clk),
+        .index_last     (index_last),
+        .fsm_state_next (fsm_state_next),
+        .col_index      (col_index),
+        .col_index_done (col_index_done),
+        .col_index_zero (col_index_zero),
+        .col_index_next (col_index_next),
+        .col_index_prev (col_index_prev)
+    );
+
+
+    //
+    // Load Address Generator
+    //
+    wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0];
+    wire load_addr_zero;
+    wire load_t1t2_addr_done;
+    wire load_nn_coeff_addr_done;
+
+    modexpng_mmm_transporter #
+    (
+        .INDEX_WIDTH(INDEX_WIDTH)
+    )
+    transporter
+    (
+        .clk                        (clk),
+        .ena                        (ena),
+        .index_last                 (index_last),
+        .fsm_state                  (fsm_state),
+        .fsm_state_next             (fsm_state_next),
+        .load_phase                 (load_phase),
+        .load_xy_addr               (load_xy_addr),
+        .load_xy_addr_vld           (load_xy_addr_vld),
+        .load_xy_req                (load_xy_req),
+        .load_addr_zero             (load_addr_zero),    
+        .load_t1t2_addr_done        (load_t1t2_addr_done),
+        .load_nn_coeff_addr_done    (load_nn_coeff_addr_done)
+    );
+ 
+ 
+    //
+    // X, Y Address
+    //
+    wire [INDEX_WIDTH-1:0] x_din_addr_cnt;
+    wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last;
+    wire [          3-1:0] x_din_addr_cnt_lower_prev;
+    wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev;
+
+    modexpng_mmm_din_addr #
+    (
+        .INDEX_WIDTH(INDEX_WIDTH)
+    )
+    din_addr_x
+    (
+        .clk                        (clk),
+        .rst_n                      (rst_n),
+        .index_last                 (index_last),
+        .fsm_state_next             (fsm_state_next),
+        .col_index_zero             (col_index_zero),
+        .col_index_next             (col_index_next),
+        .din_addr                   (x_din_addr),
+        .din_bank                   (x_din_bank),
+        .din_ena                    (x_din_ena),
+        .din_reg_ena                (x_din_reg_ena),
+        .din_addr_cnt               (x_din_addr_cnt),
+        .din_addr_cnt_last          (x_din_addr_cnt_last),
+        .din_addr_cnt_lower_prev    (x_din_addr_cnt_lower_prev),
+        .din_addr_cnt_upper_prev    (x_din_addr_cnt_upper_prev)
+    );
+    
+    modexpng_mmm_dout_addr #
+    (
+        .INDEX_WIDTH(INDEX_WIDTH)
+    )
+    dout_addr_xy
+    (
+        .clk                        (clk),
+        .rst_n                      (rst_n),
+        .fsm_state                  (fsm_state),
+        .load_xy_addr               (load_xy_addr),
+        .load_addr_zero             (load_addr_zero),
+        .load_nn_coeff_addr_done    (load_nn_coeff_addr_done),
+        .x_dout_addr                (x_dout_addr),
+        .y_dout_addr                (y_dout_addr),
+        .x_dout_ena                 (x_dout_ena),
+        .y_dout_ena                 (y_dout_ena),
+        .x_dout_bank                (x_dout_bank),
+        .y_dout_bank                (y_dout_bank)
+    );          
+  
+
+    //
+    // Helper Memories ("Scratchpad")
+    //    
+    reg  [INDEX_WIDTH-1:0] pad_xy_rd_addr;
+    reg                    pad_xy_rd_ena = 1'b0;
+    wire [ WORD_WIDTH-1:0] pad_x_rd_dout;
+    wire [ WORD_WIDTH-1:0] pad_y_rd_dout;
+    
+    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}};
+    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1;
+    
+    modexpng_mmm_pad pad
+    (
+        .clk                (clk),
+        .rst_n              (rst_n),
+        .fsm_state          (fsm_state),
+        .load_xy_addr_lsb   (load_xy_addr_lsb),
+        .load_x_din         (load_x_din),
+        .load_y_din         (load_y_din),
+        .pad_x_rd_addr      (pad_xy_rd_addr),
+        .pad_y_rd_addr      (pad_xy_rd_addr),
+        .pad_x_rd_ena       (pad_xy_rd_ena),
+        .pad_y_rd_ena       (pad_xy_rd_ena),
+        .pad_x_rd_dout      (pad_x_rd_dout),
+        .pad_y_rd_dout      (pad_y_rd_dout)
+    );
+    
+    
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n) begin
+            pad_xy_rd_ena <= 1'b0;
+        end else case (fsm_state_next)
+        
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                pad_xy_rd_ena <= 1'b1;
+                
+            default:
+                pad_xy_rd_ena <= 1'b0;
+        
+        endcase
+        
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG:
+                pad_xy_rd_addr <= pad_xy_rd_addr_zero;
+                
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                pad_xy_rd_addr <= pad_xy_rd_addr_next;
+                
+            default:
+                pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};
+        
+        endcase
+  
+  
+    
+    
+    //
+    // Flags
+    //
+
+    wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;
+            
+    always @*
+        //
+        fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;;
+    
+    
+    //
+    // MAC Arrays
+    //
+    reg                                mac_x_ce = 1'b0;
+    reg                                mac_x_ce_aux = 1'b0;
+    reg  [NUM_MULTS              -1:0] mac_x_clr;
+    reg                                mac_x_clr_aux;
+    reg  [NUM_MULTS              -2:0] mac_x_casc_a;
+    reg                                mac_x_casc_a_aux;
+    wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
+    reg  [        1 * WORD_WIDTH -1:0] mac_x_a_aux;
+    //wire [        1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1];
+    reg  [        1 * WORD_WIDTH -1:0] mac_x_b;
+    wire [NUM_MULTS * MAC_WIDTH  -1:0] mac_x_p;
+    wire [        1 * MAC_WIDTH  -1:0] mac_x_p_aux;
+
+    reg                                mac_y_ce = 1'b0;
+    reg                                mac_y_ce_aux = 1'b0;
+    reg  [NUM_MULTS              -1:0] mac_y_clr;
+    reg                                mac_y_clr_aux;
+    reg  [NUM_MULTS              -2:0] mac_y_casc_a;
+    reg                                mac_y_casc_a_aux;
+    wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
+    reg  [        1 * WORD_WIDTH -1:0] mac_y_a_aux;
+    //wire [        1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1];
+    reg  [        1 * WORD_WIDTH -1:0] mac_y_b;
+    wire [NUM_MULTS * MAC_WIDTH  -1:0] mac_y_p;
+    wire [        1 * MAC_WIDTH  -1:0] mac_y_p_aux;
+    
+    modexpng_mac_array mac_array_x
+    (
+        .clk        (clk),
+        .ce         (mac_x_ce),
+        .ce_aux     (mac_x_ce_aux),
+        .clr        (mac_x_clr),
+        .clr_aux    (mac_x_clr_aux),
+        .casc_a     (mac_x_casc_a),
+        .casc_a_aux (mac_x_casc_a_aux),
+        .a_in       (mac_x_a),
+        .a_in_aux   (mac_x_a_aux),
+        .b_in       (mac_x_b),
+        .p_out      (mac_x_p),
+        .p_out_aux  (mac_x_p_aux)
+    );
+
+    modexpng_mac_array mac_array_y
+    (
+        .clk        (clk),
+        .ce         (mac_y_ce),
+        .ce_aux     (mac_y_ce_aux),
+        .clr        (mac_y_clr),
+        .clr_aux    (mac_y_clr_aux),
+        .casc_a     (mac_y_casc_a),
+        .casc_a_aux (mac_y_casc_a_aux),
+        .a_in       (mac_y_a),
+        .a_in_aux   (mac_y_a_aux),
+        .b_in       (mac_y_b),
+        .p_out      (mac_y_p),
+        .p_out_aux  (mac_y_p_aux)
+    );
+
+    genvar gen_z;
+    // Slice x_din into NUM_MULTS words of WORD_WIDTH bits and pass each one to the matching slice of mac_x_a.
+    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+        begin : gen_xy_din
+            //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+            //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+            //gen_xy_dout
+            assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
+            // NOTE(review): no corresponding assignment to mac_y_a is visible in this part of the module - confirm it is driven.
+            //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+            //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+        end
+    endgenerate
+    
+
+    //
+    // MAC Clock Enable Logic
+    //
+    reg mac_xy_ce_adv = 1'b0;
+    // Stage 1: registered "FSM is in one of the four MULT_SQUARE_* states" flag.
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n)
+            mac_xy_ce_adv <= 1'b0;
+        else if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG) || (fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_TRIG) || (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            mac_xy_ce_adv <= 1'b1;
+        else
+            mac_xy_ce_adv <= 1'b0;
+    // Stage 2: both MAC clock enables follow the flag one clock later.
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n) begin
+            mac_x_ce <= 1'b0; mac_y_ce <= 1'b0;
+        end else begin
+            mac_x_ce <= mac_xy_ce_adv; mac_y_ce <= mac_xy_ce_adv;
+        end
+
+
+    //
+    // MAC Clear Logic
+    //
+    wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
+        calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev);
+    reg [NUM_MULTS-1:0] mac_xy_clr_adv;
+    // *_TRIG raises every clear bit, *_BUSY uses the mask returned by calc_mac_clear_square().
+    always @(posedge clk)
+        if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+            (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_TRIG))
+            mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
+        else if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            mac_xy_clr_adv <= calc_mac_x_clear_square_value;
+        else
+            mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
+    // The same clear pattern reaches both clear registers one clock later.
+    always @(posedge clk) begin
+        mac_x_clr <= mac_xy_clr_adv;
+        mac_y_clr <= mac_xy_clr_adv;
+    end
+
+
+    //
+    // MAC Cascade Logic
+    //
+    reg  [NUM_MULTS-2:0] mac_xy_casc_a_adv;
+    // Cascade select: all zeroes on *_TRIG, all ones on *_BUSY, don't-care elsewhere.
+    always @(posedge clk)
+        if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+            (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_TRIG))
+            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
+        else if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
+        else
+            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
+    // Both cascade registers take the same value one clock later.
+    always @(posedge clk) begin
+        mac_x_casc_a <= mac_xy_casc_a_adv; mac_y_casc_a <= mac_xy_casc_a_adv;
+    end
+
+
+
+    //
+    // DOUT
+    //
+    // Declared ahead of the mapping below: a Verilog variable has to be declared before it is referenced.
+    reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
+    reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];
+    
+    //
+    // DOUT Mapping (flattens the per-word registers into the x_dout / y_dout buses)
+    //
+    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+        begin : gen_xy_dout
+            assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+            assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+        end
+    endgenerate
+    
+    
+    
+
+    integer int_z;
+    always @(posedge clk)
+        // Output word registers, updated every clock from the current FSM state.
+        case (fsm_state)
+            // third step of either load sequence: every word gets the same load_x_din / load_y_din value
+            FSM_STATE_LOAD_T1T2_3,
+            FSM_STATE_LOAD_NN_COEFF_3:
+                for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
+                    x_dout_reg[int_z] <= load_x_din;
+                    y_dout_reg[int_z] <= load_y_din;
+                end
+            // all other states: the outputs are don't-care
+            default:
+                for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
+                    x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+                    y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+                end
+            //
+        endcase
+    
+
+
+    //
+    // FSM Process
+    //
+    always @(posedge clk or negedge rst_n)
+        // asynchronous active-low reset puts the FSM into IDLE, otherwise it takes fsm_state_next
+        if (!rst_n) fsm_state <= FSM_STATE_IDLE;
+        else        fsm_state <= fsm_state_next;
+
+
+    //
+    // FSM Transition Logic
+    //
+    always @* begin
+        // Default target; it also applies to any state that has no entry in the case below.
+        fsm_state_next = FSM_STATE_IDLE;
+        //
+        case (fsm_state)
+            FSM_STATE_IDLE:                   fsm_state_next = ena                       ? fsm_state_after_idle             : FSM_STATE_IDLE;
+            // T1T2 load: steps 1-2-3 repeat until load_t1t2_addr_done, then the NN_COEFF load starts
+            FSM_STATE_LOAD_T1T2_1:            fsm_state_next = FSM_STATE_LOAD_T1T2_2     ;
+            FSM_STATE_LOAD_T1T2_2:            fsm_state_next = FSM_STATE_LOAD_T1T2_3     ;
+            FSM_STATE_LOAD_T1T2_3:            fsm_state_next = load_t1t2_addr_done       ? FSM_STATE_LOAD_NN_COEFF_1        : FSM_STATE_LOAD_T1T2_1;
+            // NN_COEFF load: steps 1-2-3 repeat until load_nn_coeff_addr_done, then STOP
+            FSM_STATE_LOAD_NN_COEFF_1:        fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2 ;
+            FSM_STATE_LOAD_NN_COEFF_2:        fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3 ;
+            FSM_STATE_LOAD_NN_COEFF_3:        fsm_state_next = load_nn_coeff_addr_done   ? FSM_STATE_STOP                   : FSM_STATE_LOAD_NN_COEFF_1;
+            // square part: each TRIG state is followed by a BUSY state that holds until mult_square_addr_done
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next =                             FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done     ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next =                             FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done     ? fsm_state_after_mult_square      : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+            // NOTE(review): the triangle / rectangle transitions below are commented out in this revision
+            /*
+            FSM_STATE_TRIANGLE_COL_0_TRIG:  fsm_state_next =                        FSM_STATE_TRIANGLE_COL_0_BUSY  ;
+            FSM_STATE_TRIANGLE_COL_0_BUSY:  fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG  : FSM_STATE_TRIANGLE_COL_0_BUSY;
+            FSM_STATE_TRIANGLE_COL_N_TRIG:  fsm_state_next =                        FSM_STATE_TRIANGLE_COL_N_BUSY  ;
+            FSM_STATE_TRIANGLE_COL_N_BUSY:  fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle       : FSM_STATE_TRIANGLE_COL_N_BUSY;
+            
+            FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next =                        FSM_STATE_RECTANGLE_COL_0_BUSY ;
+            FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY;
+            FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next =                        FSM_STATE_RECTANGLE_COL_N_BUSY ; 
+            FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_rectangle      : FSM_STATE_RECTANGLE_COL_N_BUSY;
+            */
+            // STOP lasts a single clock
+            FSM_STATE_STOP:                 fsm_state_next =                        FSM_STATE_IDLE                 ;
+
+        endcase
+        //
+    end
+
+
+    //
+    // Ready Output
+    //
+    reg rdy_reg = 1'b1;
+    assign rdy = rdy_reg;
+    // rdy drops when ena is seen in IDLE and is raised again in STOP; reset forces it high.
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n)
+            rdy_reg <= 1'b1;
+        else if (fsm_state == FSM_STATE_IDLE) begin
+            if (ena) rdy_reg <= 1'b0;
+        end else if (fsm_state == FSM_STATE_STOP)
+            rdy_reg <= 1'b1;
+
+    // Clear mask for the MAC array, one bit per multiplier.
+    //   col_index_delayed            - column index to compare against
+    //   x_din_addr_cnt_lower_delayed - three low address bits, select the bit to set
+    //   x_din_addr_cnt_upper_delayed - upper address bits
+    // Returns a mask with only bit [lower] set when upper == col_index,
+    // and all zeroes otherwise.
+    // A shift is used instead of an eight-entry case table, so the mask width
+    // follows NUM_MULTS (the declared return width) and no selector value is
+    // left without a result.
+    function  [  NUM_MULTS-1:0] calc_mac_clear_square;
+        input [INDEX_WIDTH-4:0] col_index_delayed;
+        input [          3-1:0] x_din_addr_cnt_lower_delayed;
+        input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed;
+        begin
+            if (x_din_addr_cnt_upper_delayed == col_index_delayed)
+                calc_mac_clear_square = {{(NUM_MULTS-1){1'b0}}, 1'b1} << x_din_addr_cnt_lower_delayed;
+            else
+                calc_mac_clear_square = {NUM_MULTS{1'b0}};
+        end
+    endfunction
+
+ 
+endmodule
diff --git a/rtl/modexpng_parameters.vh b/rtl/modexpng_parameters.vh
new file mode 100644
index 0000000..f846119
--- /dev/null
+++ b/rtl/modexpng_parameters.vh
@@ -0,0 +1,39 @@
+//localparam WORD_WIDTH  = 17;
+//localparam MAC_WIDTH   = 47;
+
+//localparam BANK_ADDR_WIDTH = 3; // TODO: Replace everywhere!
+
+localparam [2:0] BANK_FAT_T1T2   = 3'd0;
+localparam [2:0] BANK_FAT_ABL    = 3'd1; // selected by modexpng_part_recombinator (presumably lower half of the product AB - confirm)
+localparam [2:0] BANK_FAT_ABH    = 3'd2; // selected by modexpng_part_recombinator (presumably upper half of the product AB - confirm)
+localparam [2:0] BANK_FAT_Q      = 3'd3;
+localparam [2:0] BANK_FAT_Q_EXT  = 3'd4;
+localparam [2:0] BANK_FAT_ML     = 3'd5;
+localparam [2:0] BANK_FAT_MH     = 3'd6;
+localparam [2:0] BANK_FAT_MH_EXT = 3'd7;
+// "slim" bank selectors are two bits wide, "fat" ones above are three bits wide
+localparam [1:0] BANK_SLIM_T1T2        = 2'd0;
+localparam [1:0] BANK_SLIM_N           = 2'd1;
+localparam [1:0] BANK_SLIM_N_COEFF     = 2'd2;
+localparam [1:0] BANK_SLIM_N_COEFF_EXT = 2'd3;
+
+
+//localparam BANK_Y_T2      = 3'd0;
+//localparam BANK_XY_T1T2   = 3'd0;
+
+//localparam BANK_XY_AB_LSB = 3'd1;
+//localparam BANK_XY_AB_MSB = 3'd2;
+
+//localparam BANK_X_N       = 3'd3;
+//localparam BANK_Y_N_COEFF = 3'd3;
+
+//localparam BANK_XY_M      = 3'd4;
+
+//localparam BANK_XY_Q_LSB  = 3'd5;
+//localparam BANK_XY_Q_MSB  = 3'd6;
+
+//localparam BANK_XY_AUX    = 3'd7;
+
+//localparam BANK_XY_ANY    = 3'bXXX;
+
+//localparam BANK_XY_AUX_ADDR_N_COEFF = 0;
diff --git a/rtl/modexpng_parameters_x8.vh b/rtl/modexpng_parameters_x8.vh
new file mode 100644
index 0000000..8734354
--- /dev/null
+++ b/rtl/modexpng_parameters_x8.vh
@@ -0,0 +1 @@
+localparam NUM_MULTS = 8;
diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v
new file mode 100644
index 0000000..db4774b
--- /dev/null
+++ b/rtl/modexpng_part_recombinator.v
@@ -0,0 +1,623 @@
+module modexpng_part_recombinator
+(
+    clk,
+    rdy,
+    fsm_state_next,
+    index_last,
+    dsp_x_ce_p, dsp_y_ce_p,
+    ena_x,   ena_y,
+    dsp_x_p, dsp_y_p,
+    col_index, col_index_last, slim_bram_xy_addr,
+    fat_bram_xy_bank, fat_bram_xy_addr, fat_bram_x_dout, fat_bram_y_dout, fat_bram_xy_dout_valid
+);
+
+
+    //
+    // Headers
+    //
+    `include "../rtl/modexpng_mmm_fsm.vh"
+    `include "../rtl/modexpng_parameters.vh"
+    `include "../rtl/modexpng_parameters_x8.vh"
+
+
+    input                        clk;
+    output                       rdy;             // driven by rdy_reg; forced low while ena_x & ena_y
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;  // upcoming FSM state; selects when the square-stage flags are computed
+    input [7:0]                  index_last;      // compared with slim_bram_xy_addr and with the LSB word counter
+    input                        dsp_x_ce_p;      // delayed one clock, then gates capture of dsp_x_p words
+    input                        dsp_y_ce_p;      // NOTE(review): only delayed into dsp_y_ce_p_dly1, which is not used further here
+    input                        ena_x;           // ena_x & ena_y together restart the counters, clear flags and rdy
+    input                        ena_y;
+    input  [8*47-1:0] dsp_x_p;                    // eight 47-bit words
+    input  [8*47-1:0] dsp_y_p;                    // NOTE(review): split into words but not consumed further in this module
+    input  [     4:0] col_index;
+    input  [     4:0] col_index_last;
+    input  [     7:0] slim_bram_xy_addr;          // bits [7:3] are compared with col_index, bits [2:0] select a word
+
+    output [     2:0] fat_bram_xy_bank;           // BANK_FAT_ABL / BANK_FAT_ABH, or X when no word is written
+    output [     7:0] fat_bram_xy_addr;
+    output [    17:0] fat_bram_x_dout;            // 16-bit recombinator word (or a 17-bit sum) zero-extended to 18 bits
+    output [    17:0] fat_bram_y_dout;            // NOTE(review): only ever assigned X in this module
+    output            fat_bram_xy_dout_valid;
+
+
+    //
+    // Latches
+    //
+    reg  [1*47-1:0] dsp_x_p_latch[0:7];
+    reg  [1*47-1:0] dsp_y_p_latch[0:7];   // NOTE(review): not referenced anywhere else in this module
+
+
+    //
+    // Mapping
+    //
+    wire [46:0] dsp_x_p_split[0:7];
+    wire [46:0] dsp_y_p_split[0:7];
+    // Break the flat product buses into one 47-bit word per multiplier.
+    genvar z;
+    generate for (z=0; z<NUM_MULTS; z=z+1)
+        begin : gen_dsp_xy_p_split
+            assign dsp_x_p_split[z] = dsp_x_p[47*z+:47];
+            assign dsp_y_p_split[z] = dsp_y_p[47*z+:47];
+        end
+    endgenerate
+
+
+    //
+    // Delays
+    //
+    reg dsp_x_ce_p_dly1 = 1'b0;
+    reg dsp_y_ce_p_dly1 = 1'b0;
+
+    // One-clock delayed copies of the two clock-enable inputs.
+    always @(posedge clk) begin
+        dsp_x_ce_p_dly1 <= dsp_x_ce_p;
+        dsp_y_ce_p_dly1 <= dsp_y_ce_p;
+    end
+
+
+    //
+    // Registers
+    //
+    
+    // valid
+    reg       x_valid_lsb = 1'b0;
+    reg       y_valid_lsb = 1'b0;
+    reg       x_valid_msb = 1'b0;
+    reg       y_valid_msb = 1'b0;
+    
+    // bitmap
+    reg [7:0] x_bitmap_lsb = {8{1'b0}};
+    reg [7:0] y_bitmap_lsb = {8{1'b0}};
+    reg [7:0] x_bitmap_msb = {8{1'b0}};
+    reg [7:0] y_bitmap_msb = {8{1'b0}};
+    
+    // index
+    reg [2:0] x_index_lsb = 3'dX;
+    reg [2:0] y_index_lsb = 3'dX;
+    
+    // purge
+    reg       x_purge_lsb = 1'b0;
+    reg       y_purge_lsb = 1'b0;
+    reg       x_purge_msb = 1'b0;
+    reg       y_purge_msb = 1'b0;
+    
+    // valid - latch
+    reg       x_valid_latch_lsb = 1'b0;
+    reg       y_valid_latch_lsb = 1'b0;
+    
+    // bitmap - latch
+    reg [7:0] x_bitmap_latch_lsb = {8{1'b0}};
+    reg [7:0] y_bitmap_latch_lsb = {8{1'b0}};
+    reg [7:0] x_bitmap_latch_msb = {8{1'b0}};
+    reg [7:0] y_bitmap_latch_msb = {8{1'b0}};
+
+    // index - latch
+    reg [2:0] x_index_latch_lsb = 3'dX;
+    reg [2:0] y_index_latch_lsb = 3'dX;
+    
+    // purge - latch
+    reg       x_purge_latch_lsb = 1'b0;
+    reg       y_purge_latch_lsb = 1'b0;
+    reg       x_purge_latch_msb = 1'b0;
+    reg       y_purge_latch_msb = 1'b0;
+
+    // 
+    reg       xy_valid_lsb_adv[1:6];
+    reg       xy_valid_msb_adv[1:6];
+    reg [7:0] xy_bitmap_lsb_adv[1:6];
+    reg [7:0] xy_bitmap_msb_adv[1:6];
+    reg [2:0] xy_index_lsb_adv[1:6];
+    reg [2:0] xy_index_msb_adv[1:6];
+    reg       xy_purge_lsb_adv[1:6];
+    reg       xy_purge_msb_adv[1:6];
+    
+    
+    integer i;
+    initial for (i=1; i<=6; i=i+1) begin   // cover the whole declared [1:6] range, so stage 6 does not start as X
+        xy_valid_lsb_adv[i] = 1'b0;
+        xy_valid_msb_adv[i] = 1'b0;
+        xy_bitmap_lsb_adv[i] = {8{1'b0}};
+        xy_bitmap_msb_adv[i] = {8{1'b0}};
+        xy_index_lsb_adv[i] = 3'dX;
+        xy_index_msb_adv[i] = 3'dX;
+        xy_purge_lsb_adv[i] = 1'b0;
+        xy_purge_msb_adv[i] = 1'b0;
+    end
+    
+    // Returns 1 when address bits [7:3] equal the column index, else 0.
+    // col_index_last_value is accepted but not used.
+    function  [0:0] calc_square_valid_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            // start from "not valid" and override on a match
+            calc_square_valid_lsb = 1'b0;
+            if (slim_bram_xy_addr_value[7:3] == col_index_value)
+                calc_square_valid_lsb = 1'b1;
+        end
+    endfunction
+    
+    // One-hot decode of the three low address bits, gated by a column match.
+    //   col_index_value         - column index compared with address bits [7:3]
+    //   col_index_last_value    - not used by this function
+    //   slim_bram_xy_addr_value - address; bits [2:0] select the bit to set
+    // Returns 8'b1 shifted left by address bits [2:0] when bits [7:3] equal
+    // the column index, and all zeroes otherwise.
+    //
+    // A shift is used rather than an eight-entry case table: the result is
+    // the same for every selector value 0..7, and there is no case statement
+    // without a default branch that could leave the (static) function result
+    // at the value of an earlier call when the selector contains X/Z bits.
+    //
+    function  [7:0] calc_square_bitmap_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            if (slim_bram_xy_addr_value[7:3] == col_index_value)
+                calc_square_bitmap_lsb = 8'b00000001 << slim_bram_xy_addr_value[2:0];
+            else
+                calc_square_bitmap_lsb = {8{1'b0}};
+            //
+        end
+    endfunction
+
+    // Word index inside the column, gated by a column match.
+    //   col_index_value         - column index compared with address bits [7:3]
+    //   col_index_last_value    - not used by this function
+    //   slim_bram_xy_addr_value - address; bits [2:0] are the word index
+    // Returns address bits [2:0] when bits [7:3] equal the column index,
+    // and X (don't-care) otherwise.
+    //
+    // The index is taken directly from the address: an eight-entry case
+    // table mapping every 3-bit value onto itself yields the same result for
+    // every selector value 0..7, but without a default branch it could leave
+    // the (static) function result at the value of an earlier call when the
+    // selector contains X/Z bits.
+    //
+    function  [2:0] calc_square_index_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            if (slim_bram_xy_addr_value[7:3] == col_index_value)
+                calc_square_index_lsb = slim_bram_xy_addr_value[2:0];
+            else
+                calc_square_index_lsb = 3'dX;
+        end
+    endfunction
+    
+    // Returns 1 only when address bits [7:3] equal both the column index and
+    // the last column index; returns 0 when they differ from the column index.
+    function        calc_square_purge_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            calc_square_purge_lsb = 1'b0;
+            if (slim_bram_xy_addr_value[7:3] == col_index_value)
+                // address is inside the current column: result is the "last column" comparison
+                calc_square_purge_lsb = (slim_bram_xy_addr_value[7:3] == col_index_last_value);
+        end
+    endfunction
+
+    // Returns 1 when the address equals index_last_value, else 0.
+    // The two column index inputs are accepted but not used.
+    function        calc_square_valid_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            // default to "not valid", set on an exact address match
+            calc_square_valid_msb = 1'b0;
+            if (slim_bram_xy_addr_value == index_last_value)
+                calc_square_valid_msb = 1'b1;
+        end
+    endfunction
+    
+    // At the last address (index_last_value) returns a bitmap with bits [6:0]
+    // set and bit [7] set only when the column is not the last one; returns
+    // all zeroes at any other address.
+    function  [7:0] calc_square_bitmap_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            calc_square_bitmap_msb = {8{1'b0}};
+            if (slim_bram_xy_addr_value == index_last_value)
+                // bit 7 comes from the column comparison, bits 6..0 are always set
+                calc_square_bitmap_msb = {col_index_value != col_index_last_value, 7'b1111111};
+        end
+    endfunction
+
+    // Returns 1 only at the last address (index_last_value) of the last
+    // column (col_index_value == col_index_last_value); 0 at other addresses.
+    function        calc_square_purge_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            calc_square_purge_msb = 1'b0;
+            if (slim_bram_xy_addr_value == index_last_value)
+                // last address reached: result is the "last column" comparison
+                calc_square_purge_msb = (col_index_value == col_index_last_value);
+        end
+    endfunction
+
+    
+    reg         recomb_lsb_ce = 1'b0;
+    reg  [ 2:0] recomb_lsb_ce_purge = 3'b000;
+    wire        recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_purge[0];
+    reg         recomb_lsb_clr;
+    reg         recomb_lsb_vld = 1'b0;
+
+    reg  [46:0] recomb_lsb_din;
+    wire [15:0] recomb_lsb_dout;
+
+    reg         recomb_msb_ce = 1'b0;
+    reg  [ 1:0] recomb_msb_ce_purge = 2'b00;
+    wire        recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0];
+    reg         recomb_msb_clr;
+    reg         recomb_msb_vld = 1'b0;
+    
+    always @(posedge clk) begin // one-clock delayed copies of the two combined clock enables
+        recomb_lsb_vld <= recomb_lsb_ce_combined; recomb_msb_vld <= recomb_msb_ce_combined;
+    end
+
+    reg  [46:0] recomb_msb_din;
+    wire [15:0] recomb_msb_dout;
+    
+    modexpng_recombinator_block recomb_x_lsb
+    (
+        .clk    (clk),
+        .ce     (recomb_lsb_ce_combined),
+        .clr    (recomb_lsb_clr),
+        .din    (recomb_lsb_din),
+        .dout   (recomb_lsb_dout)
+    );
+
+    modexpng_recombinator_block recomb_x_msb
+    (
+        .clk    (clk),
+        .ce     (recomb_msb_ce_combined),
+        .clr    (recomb_msb_clr),
+        .din    (recomb_msb_din),
+        .dout   (recomb_msb_dout)
+    );
+
+    always @(posedge clk) begin
+        // Clock enables of the two recombinators follow the latched X flags by one clock.
+        recomb_lsb_ce <= x_valid_latch_lsb;
+        recomb_msb_ce <= x_bitmap_latch_msb[0];
+        // LSB purge: load three ones on x_purge_latch_lsb, then shift them out (bit 0 extends the enable).
+        if (x_purge_latch_lsb)
+            recomb_lsb_ce_purge <= 3'b111;
+        else
+            recomb_lsb_ce_purge <= {1'b0, recomb_lsb_ce_purge[2:1]};
+        // MSB purge: load two ones when only bit 0 of the MSB bitmap latch is left, then shift them out.
+        if (x_purge_latch_msb && x_bitmap_latch_msb[0] && !x_bitmap_latch_msb[1])
+            recomb_msb_ce_purge <= 2'b11;
+        else
+            recomb_msb_ce_purge <= {1'b0, recomb_msb_ce_purge[1]};
+        // Every assignment here is non-blocking, so other clocked blocks that read these registers see the pre-edge values.
+    end
+
+
+    // Clear flags: both set while ena_x & ena_y, each dropped once its own ce is seen.
+    always @(posedge clk)
+        if (ena_x & ena_y)
+            {recomb_msb_clr, recomb_lsb_clr} <= 2'b11;
+        else begin
+            // the two flags are released independently of each other
+            if (recomb_msb_ce) recomb_msb_clr <= 1'b0;
+            if (recomb_lsb_ce) recomb_lsb_clr <= 1'b0;
+        end
+
+    // Recombinator data inputs: zero unless the respective select flag is set.
+    always @(posedge clk) begin
+        // defaults (a later non-blocking assignment below wins when its flag is set)
+        recomb_lsb_din <= {47{1'b0}};
+        recomb_msb_din <= {47{1'b0}};
+        // LSB side: the latched word picked by x_index_latch_lsb
+        if (x_valid_latch_lsb)
+            recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb];
+        // MSB side: always latched word 0
+        if (x_bitmap_latch_msb[0])
+            recomb_msb_din <= dsp_x_p_latch[0];
+        //
+    end
+
+
+    always @(posedge clk)
+        // Entry stage (index 6) of the flag pipeline, computed from the upcoming FSM state.
+        case (fsm_state_next)
+            // any of the four "square" states: evaluate the calc_square_* functions
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+                // LSB flags depend on the column index and the current address
+                xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr);
+                xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr);
+                xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr);
+                xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
+                // MSB flags additionally depend on index_last
+                xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+                xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, index_last);
+                xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+                //
+            end
+            // every other state: all flags inactive, index is a don't-care
+            default: begin
+                //
+                xy_valid_lsb_adv [6] <= 1'b0;
+                xy_bitmap_lsb_adv[6] <= {8{1'b0}};
+                xy_index_lsb_adv [6] <= 3'dX;
+                xy_purge_lsb_adv [6] <= 1'b0;
+                //
+                xy_valid_msb_adv [6] <= 1'b0;
+                xy_bitmap_msb_adv[6] <= {8{1'b0}};
+                xy_purge_msb_adv [6] <= 1'b0;
+                //
+            end
+            //
+        endcase
+
+
+    always @(posedge clk) begin
+        // LSB flags: X and Y copies are both taken from pipeline stage 1 ...
+        {y_valid_lsb,  x_valid_lsb}  <= {2{xy_valid_lsb_adv [1]}};
+        {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}};
+        {y_index_lsb,  x_index_lsb}  <= {2{xy_index_lsb_adv [1]}};
+        {y_purge_lsb,  x_purge_lsb}  <= {2{xy_purge_lsb_adv [1]}};
+        // ... and copied into the "latch" registers one clock later
+        {y_valid_latch_lsb,  x_valid_latch_lsb}  <= {y_valid_lsb,  x_valid_lsb};
+        {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb};
+        {y_index_latch_lsb,  x_index_latch_lsb}  <= {y_index_lsb,  x_index_lsb};
+        {y_purge_latch_lsb,  x_purge_latch_lsb}  <= {y_purge_lsb,  x_purge_lsb};
+        // MSB flags: X and Y copies are both taken from pipeline stage 1
+        {y_valid_msb,  x_valid_msb}  <= {2{xy_valid_msb_adv[1]}};
+        {y_bitmap_msb, x_bitmap_msb} <= {2{xy_bitmap_msb_adv[1]}};
+        {y_purge_msb,  x_purge_msb}  <= {2{xy_purge_msb_adv[1]}};
+        // MSB latch (X only): loaded while x_valid_msb is set, otherwise the bitmap shifts right one bit per clock
+        if (x_valid_msb) begin
+            x_bitmap_latch_msb <= x_bitmap_msb;
+            x_purge_latch_msb  <= x_purge_msb;
+        end else begin
+            x_bitmap_latch_msb <= {1'b0, x_bitmap_latch_msb[7:1]};
+        end
+        // Advance the flag pipeline: stage i takes the value of stage i+1.
+        // Stage 6 itself is written by the always block that decodes fsm_state_next.
+        for (i=1; i<6; i=i+1) begin
+            xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
+            xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
+            xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
+            xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
+            //
+            xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
+            xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
+            xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
+        end
+        //
+    end
+
+    // Product latch bank for X.
+    //  - while bit 1 of the MSB bitmap latch is set, the bank shifts one word towards index 0;
+    //  - otherwise, on a delayed dsp_x_ce_p, selected DSP output words are captured.
+    always @(posedge clk) begin
+        if (x_bitmap_latch_msb[1]) begin
+            // only shift 7 times: words 1..7 move down, the top word becomes don't-care
+            for (i=0; i<7; i=i+1)
+                dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
+            dsp_x_p_latch[7] <= {47{1'bX}};
+        end else if (dsp_x_ce_p_dly1) begin
+            // capture every word flagged by the LSB bitmap or by a valid MSB bitmap
+            for (i=0; i<8; i=i+1)
+                if (x_bitmap_lsb[i] || (x_valid_msb && x_bitmap_msb[i]))
+                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
+            // words that are not flagged keep their value
+        end
+        //
+    end
+
+    reg recomb_x_msb_dout_valid = 1'b0;
+    reg recomb_x_lsb_dout_valid = 1'b0;
+
+    // Output-valid flags: the combined clock enables delayed by one clock.
+    always @(posedge clk)
+        {recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid} <=
+            {recomb_msb_ce_combined, recomb_lsb_ce_combined};
+        
+
+
+    reg [ 2:0] fat_bram_xy_bank_reg;
+    reg [ 7:0] fat_bram_xy_addr_reg;
+    reg [ 7:0] fat_bram_xy_cnt_lsb;
+    reg [ 7:0] fat_bram_xy_cnt_msb;
+    reg [17:0] fat_bram_x_dout_reg;
+    reg [17:0] fat_bram_y_dout_reg;
+    reg        fat_bram_xy_dout_valid_reg = 1'b0;
+
+    reg [15:0] recomb_msb_dout_carry_0;
+    reg [15:0] recomb_msb_dout_carry_1;
+    
+    reg [15:0] recomb_msb_dout_delay_0;
+    reg [15:0] recomb_msb_dout_delay_1;
+    reg [15:0] recomb_msb_dout_delay_2;
+    
+    reg [ 7:0] recomb_msb_cnt_delay_0 = 8'd0;
+    reg [ 7:0] recomb_msb_cnt_delay_1 = 8'd0;
+    reg [ 7:0] recomb_msb_cnt_delay_2 = 8'd0;
+
+    assign fat_bram_xy_bank       = fat_bram_xy_bank_reg;
+    assign fat_bram_xy_addr       = fat_bram_xy_addr_reg;
+    assign fat_bram_x_dout        = fat_bram_x_dout_reg;
+    assign fat_bram_y_dout        = fat_bram_y_dout_reg;
+    assign fat_bram_xy_dout_valid = fat_bram_xy_dout_valid_reg;
+    
+    reg rdy_reg = 1'b1;
+    reg rdy_adv = 1'b1;
+    
+    assign rdy = rdy_reg;
+    
+    
+    always @(posedge clk) begin
+        // follow rdy_adv, except that (ena_x & ena_y) forces rdy low
+        rdy_reg <= rdy_adv;
+        if (ena_x & ena_y)
+            rdy_reg <= 1'b0;
+    end
+
+    always @(posedge clk)
+        // Output stage: turns recombinator words into bank / address / data writes.
+        if (ena_x & ena_y) begin
+            rdy_adv <= 1'b0;
+            fat_bram_xy_cnt_lsb <= 8'd0;
+            fat_bram_xy_cnt_msb <= 8'd0;
+        end else begin
+            // dispatch on which recombinator delivered a word this clock: {msb, lsb}
+            case ({recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid})
+                // no new word: drain the MSB delay line while it still holds a non-zero counter
+                2'b00: begin
+                    //
+                    if (recomb_msb_cnt_delay_2 > 8'd0) begin
+                        //
+                        rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0;
+                        // the don't-care fill is 16 bits wide, matching the delay registers
+                        recomb_msb_dout_delay_0 <= {16{1'bX}};
+                        recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+                        recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+                        //
+                        recomb_msb_cnt_delay_0 <= 8'd0;
+                        recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+                        recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+                        // explicit zero-extension to 18 bits, as in the other branches
+                        fat_bram_xy_bank_reg        <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg        <= recomb_msb_cnt_delay_2;
+                        fat_bram_x_dout_reg         <= {2'b00, recomb_msb_dout_delay_2};
+//                      fat_bram_y_dout_reg         <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg  <= 1'b1;
+                        //
+                    end else begin
+                        //
+                        fat_bram_xy_bank_reg        <= 3'bXXX;
+                        fat_bram_xy_addr_reg        <= 8'hXX;
+                        fat_bram_x_dout_reg         <= {18{1'bX}};
+                        fat_bram_y_dout_reg         <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg  <= 1'b0;
+                        //
+                    end
+                    //
+                end
+                // LSB word only: written to bank ABL at the LSB counter
+                2'b01: begin
+                    //
+                    fat_bram_xy_bank_reg        <= BANK_FAT_ABL;
+                    fat_bram_xy_addr_reg        <= fat_bram_xy_cnt_lsb;
+                    fat_bram_x_dout_reg         <= {2'b00, recomb_lsb_dout};
+//                  fat_bram_y_dout_reg
+                    fat_bram_xy_dout_valid_reg  <= 1'b1;
+                    //
+                    fat_bram_xy_cnt_lsb         <= fat_bram_xy_cnt_lsb + 1'b1; 
+                    //
+                end
+                // MSB word only: the first two words go into the carry registers, later ones to bank ABH
+                2'b10: begin
+                    //
+                    if (fat_bram_xy_cnt_msb < 8'd2) begin
+                        //
+                        recomb_msb_dout_carry_0 <= recomb_msb_dout;
+                        recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+                        //
+                        fat_bram_xy_bank_reg        <= 3'bXXX;
+                        fat_bram_xy_addr_reg        <= 8'hXX;
+                        fat_bram_x_dout_reg         <= {18{1'bX}};
+    //                  fat_bram_y_dout_reg
+                        fat_bram_xy_dout_valid_reg  <= 1'b0;
+                        //                        
+                    end else begin
+                        //
+                        fat_bram_xy_bank_reg        <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg        <= fat_bram_xy_cnt_msb;
+                        fat_bram_x_dout_reg         <= {2'b00, recomb_msb_dout};
+    //                  fat_bram_y_dout_reg
+                        fat_bram_xy_dout_valid_reg  <= 1'b1;                        
+                        //
+                    end
+                    //
+                    fat_bram_xy_cnt_msb         <= fat_bram_xy_cnt_msb + 1'b1;
+                    //                
+                end
+                // both words: the LSB word is written now, the MSB word enters the delay line
+                2'b11: begin
+                    //
+                    if (fat_bram_xy_cnt_lsb == index_last) begin
+                        // LSB counter reached index_last: write to ABL and restart the counter
+                        fat_bram_xy_bank_reg        <= BANK_FAT_ABL;
+                        fat_bram_xy_addr_reg        <= fat_bram_xy_cnt_lsb;
+                        fat_bram_x_dout_reg         <= {2'b00, recomb_lsb_dout};
+//                      fat_bram_y_dout_reg         <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg  <= 1'b1;
+                        //
+                        fat_bram_xy_cnt_lsb         <= 8'd0;
+                        //
+                    end else begin
+                        // otherwise: LSB word plus the stored carry word (17-bit sum) goes to ABH
+                        fat_bram_xy_bank_reg        <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg        <= fat_bram_xy_cnt_lsb;
+                        fat_bram_x_dout_reg         <= {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}};
+//                      fat_bram_y_dout_reg         <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg  <= 1'b1;
+                        //
+                        fat_bram_xy_cnt_lsb         <= fat_bram_xy_cnt_lsb + 1'b1;
+                        //
+                        recomb_msb_dout_carry_0 <= {16{1'bX}};
+                        recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;                        
+                        //
+                    end
+                    // MSB word and its counter value are queued in the three-deep delay line
+                    recomb_msb_dout_delay_0 <= recomb_msb_dout;
+                    recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+                    recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+                    //
+                    recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb;
+                    recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+                    recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+                    //
+                    fat_bram_xy_cnt_msb         <= fat_bram_xy_cnt_msb + 1'b1;
+                    //
+                end
+                //
+            endcase
+            //        
+        end
+    
+    
+    
+    
+endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
new file mode 100644
index 0000000..efe0ac5
--- /dev/null
+++ b/rtl/modexpng_recombinator_block.v
@@ -0,0 +1,35 @@
+module modexpng_recombinator_block
+(
+    clk,
+    ce, clr,
+    din, dout
+);
+    // Each enabled clock takes a 47-bit word, splits it into a low and a middle
+    // 16-bit slice and a 15-bit top slice, and accumulates them in three
+    // registers; dout is the low 16 bits of the last one.
+    input         clk;
+    input         ce;    // the registers update only while ce is high
+    input         clr;   // when high, y and x are loaded without accumulation
+    input  [46:0] din;
+    output [15:0] dout;
+
+    reg [17:0] x;   // low slice + previous y + bits [17:16] of previous x
+    reg [16:0] y;   // middle slice + previous z
+    reg [14:0] z;   // top slice
+
+    wire [15:0] din_x = din[15: 0];
+    wire [15:0] din_y = din[31:16];
+    wire [14:0] din_z = din[46:32];
+
+    // sums used when clr is low, kept at the width of their target register
+    wire [16:0] y_acc = {1'b0, din_y} + {2'b00, z};
+    wire [17:0] x_acc = {2'b00, din_x} + {1'b0, y} + {{16{1'b0}}, x[17:16]};
+
+    assign dout = x[15:0];
+    always @(posedge clk)
+        if (ce) begin
+            z <= din_z;
+            y <= clr ? {1'b0, din_y}  : y_acc;
+            x <= clr ? {2'b00, din_x} : x_acc;
+        end
+endmodule



More information about the Commits mailing list