[Cryptech-Commits] [core/math/modexpng] 24/92: Started working on the pipelined Montgomery modular multiplier. Currently can do the "square" part of the multiplication, i.e. compute the twice larger intermediate product AB = A * B.
git at cryptech.is
git at cryptech.is
Sat Mar 14 18:19:03 UTC 2020
This is an automated email from the git hooks/post-receive script.
paul at psgd.org pushed a commit to branch master
in repository core/math/modexpng.
commit 29fb6afd018c601a2e0c7376656d5e37beb565d6
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Oct 1 15:01:43 2019 +0300
Started working on the pipelined Montgomery modular multiplier. Currently can
do the "square" part of the multiplication, i.e. compute the twice larger
intermediate product AB = A * B.
---
bench/tb_mmm_x8_dual.v | 327 +++++++++++++++++
bench/tb_square.v | 716 ++++++++++++++++++++++++++++++++++++++
rtl/dev/temp.txt | 384 ++++++++++++++++++++
rtl/dsp/dsp_array.v | 111 ++++++
rtl/dsp/dsp_slice.v | 125 +++++++
rtl/modexpng_mac.v | 54 +++
rtl/modexpng_mac_array.v | 116 ++++++
rtl/modexpng_mem.v | 93 +++++
rtl/modexpng_mmm_col_index.v | 90 +++++
rtl/modexpng_mmm_din_addr.v | 167 +++++++++
rtl/modexpng_mmm_dout_addr.v | 167 +++++++++
rtl/modexpng_mmm_fsm.vh | 24 ++
rtl/modexpng_mmm_pad.v | 153 ++++++++
rtl/modexpng_mmm_transporter.v | 157 +++++++++
rtl/modexpng_mmm_x8_dual.v | 550 +++++++++++++++++++++++++++++
rtl/modexpng_parameters.vh | 39 +++
rtl/modexpng_parameters_x8.vh | 1 +
rtl/modexpng_part_recombinator.v | 623 +++++++++++++++++++++++++++++++++
rtl/modexpng_recombinator_block.v | 35 ++
19 files changed, 3932 insertions(+)
diff --git a/bench/tb_mmm_x8_dual.v b/bench/tb_mmm_x8_dual.v
new file mode 100644
index 0000000..aa25900
--- /dev/null
+++ b/bench/tb_mmm_x8_dual.v
@@ -0,0 +1,327 @@
+`timescale 1ns / 1ps
+
+module tb_mmm_x8_dual;
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+
+
+ //
+ // Settings
+ //
+ localparam INDEX_WIDTH = 6;
+
+ wire [INDEX_WIDTH-1:0] index_last = 31; // 512 bits
+
+
+ //
+ // Clock
+ //
+ `define CLK_FREQUENCY_MHZ 100.0
+ `define CLK_PERIOD_NS (1000.0 / `CLK_FREQUENCY_MHZ)
+ `define CLK_PERIOD_HALF_NS (0.5 * `CLK_PERIOD_NS)
+
+ reg clk = 1'b0;
+
+ // Free-running testbench clock: clk starts low and is inverted every half
+ // period, so the first rising edge comes half a period after time zero.
+ always
+ #`CLK_PERIOD_HALF_NS clk = ~clk;
+
+
+ //
+ // Reset
+ //
+ reg rst = 1'b1;
+ wire rst_n = ~rst;
+
+
+ //
+ // Control
+ //
+ reg ena = 1'b0;
+ wire rdy;
+
+ reg mode;
+ reg transfer;
+
+
+ //
+ // Interface
+ //
+
+
+ //
+ // Interface - Data Buses
+ //
+ wire [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+ wire [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+ wire [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+ wire [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+
+
+ //
+ // Interface - Address Buses
+ //
+ wire [INDEX_WIDTH-4:0] x_din_addr;
+ wire [INDEX_WIDTH-4:0] y_din_addr;
+ wire [INDEX_WIDTH-4:0] x_dout_addr;
+ wire [INDEX_WIDTH-4:0] y_dout_addr;
+
+
+ //
+ // Interface - Enable Buses
+ //
+ wire [ 1-1:0] x_din_ena;
+ wire [ 1-1:0] y_din_ena;
+ wire [ 1-1:0] x_din_reg_ena;
+ wire [ 1-1:0] y_din_reg_ena;
+ wire [NUM_MULTS-1:0] x_dout_ena;
+ wire [NUM_MULTS-1:0] y_dout_ena;
+
+
+ //
+ // Interface - Bank Buses
+ //
+ wire [3-1:0] x_din_bank;
+ wire [3-1:0] y_din_bank;
+ wire [3-1:0] x_dout_bank;
+ wire [3-1:0] y_dout_bank;
+
+
+ //
+ // Operands
+ //
+ reg [WORD_WIDTH-1:0] T1[0:2**INDEX_WIDTH-1];
+ reg [WORD_WIDTH-1:0] T2[0:2**INDEX_WIDTH-1];
+ reg [WORD_WIDTH-1:0] N[0:2**INDEX_WIDTH-1];
+ reg [WORD_WIDTH-1:0] N_COEFF[0:2**INDEX_WIDTH];
+
+
+ //
+ // Memories
+ // One X memory and one Y memory are instantiated per lane (NUM_MULTS lanes in total).
+ genvar z;
+ generate for (z=0; z<NUM_MULTS; z=z+1)
+ // all lanes share the {bank, addr} buses and the read enables; only the write enable and the data slice are per-lane
+ begin : gen_z_mem
+ // X memory of lane z: port A is written from x_dout, port B is read into x_din
+ modexpng_mem /*bram_1wo_1ro_readfirst_ce*/ #
+ (
+ .MEM_WIDTH(WORD_WIDTH),
+ .MEM_ADDR_BITS(INDEX_WIDTH) // - clog2(NUM_MULTS) + clog2(NUM_BANKS)
+ )
+ gen_z_mem_x
+ (
+ .clk (clk),
+
+ .a_addr ({x_dout_bank, x_dout_addr}),
+ .a_en (x_dout_ena[z]),
+ .a_wr (x_dout_ena[z]),
+ .a_in (x_dout[z*WORD_WIDTH+:WORD_WIDTH]),
+ .a_out (), // unused
+
+ .b_addr ({x_din_bank, x_din_addr}),
+ .b_en (x_din_ena),
+ .b_reg_en (x_din_reg_ena),
+ .b_out (x_din[z*WORD_WIDTH+:WORD_WIDTH])
+ );
+ // Y memory of lane z: same wiring as the X memory, using the y_* buses
+ modexpng_mem /*bram_1wo_1ro_readfirst_ce*/ #
+ (
+ .MEM_WIDTH(WORD_WIDTH),
+ .MEM_ADDR_BITS(INDEX_WIDTH) // - clog2(NUM_MULTS) + clog2(NUM_BANKS)
+ )
+ gen_z_mem_y
+ (
+ .clk (clk),
+
+ .a_addr ({y_dout_bank, y_dout_addr}),
+ .a_en (y_dout_ena[z]),
+ .a_wr (y_dout_ena[z]),
+ .a_in (y_dout[z*WORD_WIDTH+:WORD_WIDTH]),
+ .a_out (), // unused
+
+ .b_addr ({y_din_bank, y_din_addr}),
+ .b_en (y_din_ena),
+ .b_reg_en (y_din_reg_ena),
+ .b_out (y_din[z*WORD_WIDTH+:WORD_WIDTH])
+ );
+ //
+ end
+ //
+ endgenerate
+
+
+ // T1 / T2
+ // N / N_COEFF
+ // AB_LSB
+ // AB_MSB
+ // M
+ // Q_LSB
+ // Q_MSB
+ // ?
+
+
+ //
+ // Operands - Values
+ //
+ initial begin
+ //
+ T1[ 0] = 18'h0b27b; T1[ 1] = 18'h0fc7d; T1[ 2] = 18'h0a214; T1[ 3] = 18'h08d2b;
+ T1[ 4] = 18'h1c80c; T1[ 5] = 18'h145f1; T1[ 6] = 18'h00db6; T1[ 7] = 18'h1cf0f;
+ T1[ 8] = 18'h19386; T1[ 9] = 18'h02ad9; T1[10] = 18'h1a8b5; T1[11] = 18'h1479b;
+ T1[12] = 18'h08b5f; T1[13] = 18'h14806; T1[14] = 18'h0e6f7; T1[15] = 18'h0ce9d;
+ T1[16] = 18'h0cbc2; T1[17] = 18'h16ef1; T1[18] = 18'h0e14e; T1[19] = 18'h1796f;
+ T1[20] = 18'h14901; T1[21] = 18'h06666; T1[22] = 18'h0cb9f; T1[23] = 18'h09ab4;
+ T1[24] = 18'h12ffc; T1[25] = 18'h0a86d; T1[26] = 18'h19d35; T1[27] = 18'h0cda9;
+ T1[28] = 18'h16a19; T1[29] = 18'h09a36; T1[30] = 18'h0b176; T1[31] = 18'h0e0dc;
+ //
+ T2[ 0] = 18'h0b21a; T2[ 1] = 18'h13e71; T2[ 2] = 18'h03459; T2[ 3] = 18'h1063f;
+ T2[ 4] = 18'h18cef; T2[ 5] = 18'h1b8a5; T2[ 6] = 18'h082d1; T2[ 7] = 18'h1b1be;
+ T2[ 8] = 18'h18979; T2[ 9] = 18'h1409a; T2[10] = 18'h1713c; T2[11] = 18'h0cda3;
+ T2[12] = 18'h11c7d; T2[13] = 18'h0c943; T2[14] = 18'h12d7c; T2[15] = 18'h1531e;
+ T2[16] = 18'h0a45a; T2[17] = 18'h1c637; T2[18] = 18'h0906a; T2[19] = 18'h1670e;
+ T2[20] = 18'h12f78; T2[21] = 18'h08ce6; T2[22] = 18'h1c5c7; T2[23] = 18'h1292d;
+ T2[24] = 18'h0fc4b; T2[25] = 18'h064fb; T2[26] = 18'h0cc3c; T2[27] = 18'h19b37;
+ T2[28] = 18'h1b721; T2[29] = 18'h0f424; T2[30] = 18'h0f608; T2[31] = 18'h03e9b;
+ //
+ N[ 0] = 18'h00a9d; N[ 1] = 18'h01175; N[ 2] = 18'h0254f; N[ 3] = 18'h0ee38;
+ N[ 4] = 18'h00a6a; N[ 5] = 18'h0c7bd; N[ 6] = 18'h0ddac; N[ 7] = 18'h069fe;
+ N[ 8] = 18'h0e9d6; N[ 9] = 18'h0b6bf; N[10] = 18'h09230; N[11] = 18'h04fc5;
+ N[12] = 18'h05c9f; N[13] = 18'h09502; N[14] = 18'h0cbc5; N[15] = 18'h03109;
+ N[16] = 18'h08029; N[17] = 18'h0b27c; N[18] = 18'h0eeb8; N[19] = 18'h0c191;
+ N[20] = 18'h0ff86; N[21] = 18'h027ab; N[22] = 18'h07d76; N[23] = 18'h0ff1a;
+ N[24] = 18'h02afc; N[25] = 18'h0b25a; N[26] = 18'h0d3c1; N[27] = 18'h05589;
+ N[28] = 18'h09f7c; N[29] = 18'h0ddd6; N[30] = 18'h0b4fc; N[31] = 18'h0e8e7;
+ //
+ N_COEFF[ 0] = 18'h0344b; N_COEFF[ 1] = 18'h0ca66; N_COEFF[ 2] = 18'h0d9e8; N_COEFF[ 3] = 18'h070d5;
+ N_COEFF[ 4] = 18'h0ce4b; N_COEFF[ 5] = 18'h049b2; N_COEFF[ 6] = 18'h0abb3; N_COEFF[ 7] = 18'h0c3b2;
+ N_COEFF[ 8] = 18'h0ad38; N_COEFF[ 9] = 18'h05672; N_COEFF[10] = 18'h0fd47; N_COEFF[11] = 18'h06671;
+ N_COEFF[12] = 18'h00b7f; N_COEFF[13] = 18'h0fa35; N_COEFF[14] = 18'h0d4ac; N_COEFF[15] = 18'h0f1ca;
+ N_COEFF[16] = 18'h08e0a; N_COEFF[17] = 18'h05858; N_COEFF[18] = 18'h02dc6; N_COEFF[19] = 18'h08cfc;
+ N_COEFF[20] = 18'h01941; N_COEFF[21] = 18'h0f855; N_COEFF[22] = 18'h01e43; N_COEFF[23] = 18'h053f0;
+ N_COEFF[24] = 18'h0a479; N_COEFF[25] = 18'h0ae7e; N_COEFF[26] = 18'h05c66; N_COEFF[27] = 18'h02413;
+ N_COEFF[28] = 18'h0b5f8; N_COEFF[29] = 18'h0eb06; N_COEFF[30] = 18'h0de5b; N_COEFF[31] = 18'h0a751;
+ N_COEFF[32] = 18'h0c1ec;
+ //
+ end
+
+
+ //
+ // Load Interface
+ // The load_phase/addr/vld/req wires connect to the UUT; the testbench supplies operand words on load_x_din / load_y_din.
+ wire load_phase;
+ wire [ INDEX_WIDTH:0] load_xy_addr;
+ wire load_xy_addr_vld;
+ wire load_xy_req;
+ reg [ WORD_WIDTH-1:0] load_x_din;
+ reg [ WORD_WIDTH-1:0] load_y_din;
+ reg [ WORD_WIDTH-1:0] load_x_pipe;
+ reg [ WORD_WIDTH-1:0] load_y_pipe;
+
+ always @(posedge clk)
+ // stage 1: operand lookup on a valid address - phase 0 reads T1/T2, phase 1 reads N/N_COEFF
+ if (load_xy_addr_vld) begin
+ // in phase 1 the X word is forced to X when the top address bit is set (N is declared one word shorter than N_COEFF)
+ if (!load_phase) begin
+ load_x_pipe <= T1[load_xy_addr];
+ load_y_pipe <= T2[load_xy_addr];
+ end else begin
+ load_x_pipe <= !load_xy_addr[INDEX_WIDTH] ? N[load_xy_addr] : {WORD_WIDTH{1'bX}};
+ load_y_pipe <= N_COEFF[load_xy_addr];
+ end
+ end
+
+ always @(posedge clk)
+ // stage 2: pass the looked-up words on while load_xy_req is high, drive X otherwise
+ if (load_xy_req)
+ {load_y_din, load_x_din} <= {load_y_pipe, load_x_pipe};
+ else
+ {load_y_din, load_x_din} <= {2*WORD_WIDTH{1'bX}};
+
+
+ //
+ // UUT
+ //
+ modexpng_mmm_x8_dual #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ uut
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+
+ .ena (ena),
+ .rdy (rdy),
+
+ .mode (mode),
+ .transfer (transfer),
+
+ .index_last (index_last),
+
+ .x_din (x_din),
+ .y_din (y_din),
+ .x_dout (x_dout),
+ .y_dout (y_dout),
+
+ .x_din_addr (x_din_addr),
+ .y_din_addr (y_din_addr),
+ .x_dout_addr (x_dout_addr),
+ .y_dout_addr (y_dout_addr),
+
+ .x_din_ena (x_din_ena),
+ .y_din_ena (y_din_ena),
+ .x_dout_ena (x_dout_ena),
+ .y_dout_ena (y_dout_ena),
+
+ .x_din_reg_ena (x_din_reg_ena),
+ .y_din_reg_ena (y_din_reg_ena),
+
+ .x_din_bank (x_din_bank),
+ .y_din_bank (y_din_bank),
+ .x_dout_bank (x_dout_bank),
+ .y_dout_bank (y_dout_bank),
+
+ .load_phase (load_phase),
+ .load_xy_addr (load_xy_addr),
+ .load_xy_addr_vld (load_xy_addr_vld),
+ .load_xy_req (load_xy_req),
+ .load_x_din (load_x_din),
+ .load_y_din (load_y_din)
+ );
+
+
+ //
+ // Script: release reset, then issue two one-period ena pulses (transfer=1 first,
+ // then transfer=0, both with mode=0); rdy is polled after each pulse.
+ initial begin
+ #(100.0*`CLK_PERIOD_NS) rst = 1'b0;
+ #(100.0*`CLK_PERIOD_NS) ena = 1'b1;
+ transfer = 1'b1;
+ mode = 1'b0;
+ #( 1.0*`CLK_PERIOD_NS) ena = 1'b0;
+ transfer = 1'bX;
+ mode = 1'bX;
+ // transfer/mode are X outside the ena pulse; now poll rdy once per clock period
+ while (!rdy) #`CLK_PERIOD_NS;
+
+ #(100.0*`CLK_PERIOD_NS) ena = 1'b1;
+ transfer = 1'b0;
+ mode = 1'b0;
+ #( 1.0*`CLK_PERIOD_NS) ena = 1'b0;
+ transfer = 1'bX;
+ mode = 1'bX;
+ // second operation: same polling; the simulation is not stopped afterwards
+ while (!rdy) #`CLK_PERIOD_NS;
+
+ end
+
+
+endmodule
+
diff --git a/bench/tb_square.v b/bench/tb_square.v
new file mode 100644
index 0000000..61e5d8a
--- /dev/null
+++ b/bench/tb_square.v
@@ -0,0 +1,716 @@
+`timescale 1ns / 1ps
+
+module tb_square;
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+ `include "../rtl/modexpng_mmm_fsm.vh"
+
+
+ //
+ // Clock
+ //
+ `define CLK_FREQUENCY_MHZ 100.0
+ `define CLK_PERIOD_NS (1000.0 / `CLK_FREQUENCY_MHZ)
+ `define CLK_PERIOD_HALF_NS (0.5 * `CLK_PERIOD_NS)
+
+ reg clk = 1'b0;
+
+ // Free-running testbench clock: clk starts low and is inverted every half
+ // period, so the first rising edge comes half a period after time zero.
+ always
+ #`CLK_PERIOD_HALF_NS clk = ~clk;
+
+
+ //
+ // Reset
+ //
+ reg rst = 1'b1;
+
+
+
+ //
+ // T1, T2
+ //
+ reg [17:0] T1[0:31];
+ reg [17:0] T2[0:31];
+ reg [17:0] AB[0:63];
+
+
+ //
+ // Init
+ //
+ initial begin
+ //
+ T1[ 0] = 18'h0f13e; T1[ 1] = 18'h0daf6; T1[ 2] = 18'h0aaa9; T1[ 3] = 18'h0c2c2;
+ T1[ 4] = 18'h0fc5f; T1[ 5] = 18'h12164; T1[ 6] = 18'h14375; T1[ 7] = 18'h15615;
+ T1[ 8] = 18'h0d8e2; T1[ 9] = 18'h0ec15; T1[10] = 18'h17c46; T1[11] = 18'h0c922;
+ T1[12] = 18'h08f00; T1[13] = 18'h152f9; T1[14] = 18'h0b0b6; T1[15] = 18'h0ce87;
+ T1[16] = 18'h178f2; T1[17] = 18'h09efb; T1[18] = 18'h0409d; T1[19] = 18'h11104;
+ T1[20] = 18'h0b4a6; T1[21] = 18'h158a6; T1[22] = 18'h0514e; T1[23] = 18'h0ec55;
+ T1[24] = 18'h11e73; T1[25] = 18'h11ddd; T1[26] = 18'h07bd4; T1[27] = 18'h0638b;
+ T1[28] = 18'h0e805; T1[29] = 18'h11c4f; T1[30] = 18'h0a2eb; T1[31] = 18'h05454;
+ //
+ T2[ 0] = 18'h1a479; T2[ 1] = 18'h102f5; T2[ 2] = 18'h10e72; T2[ 3] = 18'h120b1;
+ T2[ 4] = 18'h169cd; T2[ 5] = 18'h1d0c4; T2[ 6] = 18'h11462; T2[ 7] = 18'h12015;
+ T2[ 8] = 18'h16fca; T2[ 9] = 18'h1044f; T2[10] = 18'h122b4; T2[11] = 18'h10a5a;
+ T2[12] = 18'h12620; T2[13] = 18'h0e01a; T2[14] = 18'h095cd; T2[15] = 18'h1278a;
+ T2[16] = 18'h10763; T2[17] = 18'h09fe7; T2[18] = 18'h0d35c; T2[19] = 18'h10e24;
+ T2[20] = 18'h1527d; T2[21] = 18'h115b3; T2[22] = 18'h05443; T2[23] = 18'h1190a;
+ T2[24] = 18'h0fcc3; T2[25] = 18'h115e2; T2[26] = 18'h0a398; T2[27] = 18'h0608d;
+ T2[28] = 18'h13075; T2[29] = 18'h0d816; T2[30] = 18'h0bb4c; T2[31] = 18'h04e8a;
+ //
+ AB[ 0] = 18'h0be4e; AB[ 1] = 18'h0fed7; AB[ 2] = 18'h09496; AB[ 3] = 18'h07181;
+ AB[ 4] = 18'h0ee73; AB[ 5] = 18'h04692; AB[ 6] = 18'h0141a; AB[ 7] = 18'h0078c;
+ AB[ 8] = 18'h030eb; AB[ 9] = 18'h0217c; AB[10] = 18'h0696f; AB[11] = 18'h0a165;
+ AB[12] = 18'h0b753; AB[13] = 18'h04af9; AB[14] = 18'h0ed7c; AB[15] = 18'h079ce;
+ AB[16] = 18'h0e863; AB[17] = 18'h097df; AB[18] = 18'h07984; AB[19] = 18'h048af;
+ AB[20] = 18'h0197f; AB[21] = 18'h0206a; AB[22] = 18'h027e7; AB[23] = 18'h04b3a;
+ AB[24] = 18'h03312; AB[25] = 18'h03b56; AB[26] = 18'h04487; AB[27] = 18'h0bd6a;
+ AB[28] = 18'h04e4b; AB[29] = 18'h069ca; AB[30] = 18'h0f994; AB[31] = 18'h0dd4e;
+ AB[32] = 18'h1b024; AB[33] = 18'h0127f; AB[34] = 18'h02631; AB[35] = 18'h0186b;
+ AB[36] = 18'h03adb; AB[37] = 18'h05368; AB[38] = 18'h059a5; AB[39] = 18'h002e0;
+ AB[40] = 18'h0b78a; AB[41] = 18'h016f3; AB[42] = 18'h0b58d; AB[43] = 18'h03ddb;
+ AB[44] = 18'h078b0; AB[45] = 18'h0073b; AB[46] = 18'h07337; AB[47] = 18'h0c7b0;
+ AB[48] = 18'h00668; AB[49] = 18'h0106d; AB[50] = 18'h01a44; AB[51] = 18'h05ee3;
+ AB[52] = 18'h0462d; AB[53] = 18'h0fdeb; AB[54] = 18'h05f85; AB[55] = 18'h02af9;
+ AB[56] = 18'h0e1c0; AB[57] = 18'h00989; AB[58] = 18'h01201; AB[59] = 18'h0e194;
+ AB[60] = 18'h07f93; AB[61] = 18'h0e739; AB[62] = 18'h07cf6; AB[63] = 18'h019df;
+ //
+ end
+
+
+ //
+ // BRAMs
+ //
+ reg tb_fat_bram_xy_ena = 1'b0;
+ reg [ 2:0] tb_fat_bram_xy_bank;
+ reg [ 7:0] tb_fat_bram_xy_addr;
+ reg [17:0] tb_fat_bram_x_din;
+ reg [17:0] tb_fat_bram_y_din;
+
+ reg mgr_fat_bram_xy_ena = 1'b0;
+ reg [ 2:0] mgr_fat_bram_xy_bank;
+ reg [ 7:0] mgr_fat_bram_xy_addr;
+ reg [17:0] mgr_fat_bram_x_din;
+ reg [17:0] mgr_fat_bram_y_din;
+
+ reg mac_fat_bram_xy_ena = 1'b0;
+ reg mac_fat_bram_xy_reg_ena = 1'b0;
+ reg [ 2:0] mac_fat_bram_xy_bank;
+ reg [ 7:0] mac_fat_bram_xy_addr[0:3];
+ wire [17:0] mac_fat_bram_x_dout[0:3];
+ wire [17:0] mac_fat_bram_y_dout[0:3];
+
+ reg tb_slim_bram_xy_ena = 1'b0;
+ reg [ 1:0] tb_slim_bram_xy_bank;
+ reg [ 7:0] tb_slim_bram_xy_addr;
+ reg [17:0] tb_slim_bram_x_din;
+ reg [17:0] tb_slim_bram_y_din;
+
+ reg mac_slim_bram_xy_ena = 1'b0;
+ reg mac_slim_bram_xy_reg_ena = 1'b0;
+ reg [ 1:0] mac_slim_bram_xy_bank;
+ reg [ 7:0] mac_slim_bram_xy_addr;
+ reg [ 7:0] mac_slim_bram_xy_addr_dly;
+ wire [17:0] mac_slim_bram_x_dout;
+ wire [17:0] mac_slim_bram_y_dout;
+
+ always @(posedge clk)
+ // one-cycle delayed slim BRAM read address (passed to calc_mac_mode_z_square)
+ mac_slim_bram_xy_addr_dly <= mac_slim_bram_xy_addr;
+ // one-cycle delayed slim BRAM register enable (gates the DSP B operand and the DSP clock enables)
+ reg mac_slim_bram_xy_reg_ena_dly = 1'b0;
+ always @(posedge clk)
+ mac_slim_bram_xy_reg_ena_dly <= mac_slim_bram_xy_reg_ena;
+
+
+
+ genvar z;
+ generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+ begin : gen_fat_bram
+ // X side: every instance is written identically through port A (mgr_* buses) and read on port B at its own mac_fat_bram_xy_addr[z]
+ ip_bram_36k fat_bram_x
+ (
+ .clka (clk),
+ .ena (mgr_fat_bram_xy_ena),
+ .wea (mgr_fat_bram_xy_ena),
+ .addra ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
+ .dina (mgr_fat_bram_x_din),
+
+ .clkb (clk),
+ .enb (mac_fat_bram_xy_ena),
+ .regceb (mac_fat_bram_xy_reg_ena),
+ .addrb ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}),
+ .doutb (mac_fat_bram_x_dout[z])
+ );
+ // Y side: same addresses and enables as the X side, carrying the Y data words
+ ip_bram_36k fat_bram_y
+ (
+ .clka (clk),
+ .ena (mgr_fat_bram_xy_ena),
+ .wea (mgr_fat_bram_xy_ena),
+ .addra ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
+ .dina (mgr_fat_bram_y_din),
+
+ .clkb (clk),
+ .enb (mac_fat_bram_xy_ena),
+ .regceb (mac_fat_bram_xy_reg_ena),
+ .addrb ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}),
+ .doutb (mac_fat_bram_y_dout[z])
+ );
+ //
+ end
+ endgenerate
+
+ ip_bram_18k slim_bram_x // X-side slim memory; port A is driven only by the testbench loader (tb_slim_bram_*)
+ (
+ .clka (clk),
+ .ena (tb_slim_bram_xy_ena),
+ .wea (tb_slim_bram_xy_ena),
+ .addra ({tb_slim_bram_xy_bank, tb_slim_bram_xy_addr}),
+ .dina (tb_slim_bram_x_din),
+ // port B: read side, addressed by {mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}
+ .clkb (clk),
+ .enb (mac_slim_bram_xy_ena),
+ .regceb (mac_slim_bram_xy_reg_ena),
+ .addrb ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
+ .doutb (mac_slim_bram_x_dout)
+ );
+ // Y-side slim memory: shares every address and enable with slim_bram_x, only the data buses differ
+ ip_bram_18k slim_bram_y
+ (
+ .clka (clk),
+ .ena (tb_slim_bram_xy_ena),
+ .wea (tb_slim_bram_xy_ena),
+ .addra ({tb_slim_bram_xy_bank, tb_slim_bram_xy_addr}),
+ .dina (tb_slim_bram_y_din),
+
+ .clkb (clk),
+ .enb (mac_slim_bram_xy_ena),
+ .regceb (mac_slim_bram_xy_reg_ena),
+ .addrb ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
+ .doutb (mac_slim_bram_y_dout)
+ );
+
+
+
+ //
+ // Enable, Ready
+ // Stimulus: release reset, load T1/T2 into the fat and slim BRAMs, pulse ena, wait, then check the result.
+ reg ena = 1'b0;
+
+ integer i;
+ initial begin
+ // keep reset asserted for 10 clock periods
+ for (i=0; i<10; i=i+1)
+ wait_clock_tick;
+
+ rst = 1'b0;
+ // idle for another 10 periods before loading
+ for (i=0; i<10; i=i+1)
+ wait_clock_tick;
+ // write the 32 words of T1 (X side) and T2 (Y side), one word per clock period
+ tb_fat_bram_xy_ena = 1'b1;
+ tb_slim_bram_xy_ena = 1'b1;
+
+ for (i=0; i<32; i=i+1) begin
+ tb_fat_bram_xy_bank = BANK_FAT_T1T2;
+ tb_fat_bram_xy_addr = i[7:0];
+ tb_fat_bram_x_din = T1[i];
+ tb_fat_bram_y_din = T2[i];
+
+ tb_slim_bram_xy_bank = BANK_SLIM_T1T2;
+ tb_slim_bram_xy_addr = i[7:0];
+ tb_slim_bram_x_din = T1[i];
+ tb_slim_bram_y_din = T2[i];
+
+ wait_clock_tick;
+ end
+ // loading finished: drop the write enables and park the loader buses at X
+ tb_fat_bram_xy_ena = 1'b0;
+ tb_slim_bram_xy_ena = 1'b0;
+
+ tb_fat_bram_xy_bank = {3{1'bX}};
+ tb_fat_bram_xy_addr = {8{1'bX}};
+ tb_fat_bram_x_din = {18{1'bX}};
+ tb_fat_bram_y_din = {18{1'bX}};
+
+ tb_slim_bram_xy_bank = {2{1'bX}};
+ tb_slim_bram_xy_addr = {8{1'bX}};
+ tb_slim_bram_x_din = {18{1'bX}};
+ tb_slim_bram_y_din = {18{1'bX}};
+
+ for (i=0; i<10; i=i+1)
+ wait_clock_tick;
+ // start the state machine with a one-period ena pulse
+ ena = 1'b1;
+ wait_clock_tick;
+ ena = 1'b0;
+ // no completion handshake is used here: wait a fixed 10000 periods, then compare
+ for (i=0; i<10000; i=i+1)
+ wait_clock_tick;
+
+ verify_ab;
+
+ end
+
+
+ //
+ // DSPs
+ // Clock enables start at 0 (like the other control regs here) so no X reaches the DSP arrays or recomb_*_ena during the first clocks.
+ reg dsp_x_ce_a = 1'b0;
+ reg dsp_x_ce_b = 1'b0;
+ reg dsp_x_ce_b_dly = 1'b0;
+ reg dsp_x_ce_m = 1'b0;
+ reg dsp_x_ce_p = 1'b0;
+ reg dsp_x_ce_mode = 1'b0;
+
+ reg [8 -1:0] dsp_x_mode_z = {8{1'b1}};
+
+ wire [4*18-1:0] dsp_x_a;
+ reg [1*17-1:0] dsp_x_b;
+ wire [8*47-1:0] dsp_x_p;
+
+ reg dsp_y_ce_a = 1'b0;
+ reg dsp_y_ce_b = 1'b0;
+ reg dsp_y_ce_b_dly = 1'b0;
+ reg dsp_y_ce_m = 1'b0;
+ reg dsp_y_ce_p = 1'b0;
+ reg dsp_y_ce_mode = 1'b0;
+
+ reg [8 -1:0] dsp_y_mode_z = {8{1'b1}};
+
+ wire [4*18-1:0] dsp_y_a;
+ reg [1*17-1:0] dsp_y_b;
+ wire [8*47-1:0] dsp_y_p;
+
+ generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+ begin : gen_dsp_xy_a_split // pack the NUM_MULTS/2 fat BRAM read words into the flat A buses, 18 bits per word
+ assign dsp_x_a[18*z+:18] = mac_fat_bram_x_dout[z];
+ assign dsp_y_a[18*z+:18] = mac_fat_bram_y_dout[z];
+ end
+ endgenerate
+
+ always @(posedge clk)
+ // one-cycle delayed ce_b, from which ce_m and ce_mode are derived
+ {dsp_y_ce_b_dly, dsp_x_ce_b_dly} <= {dsp_y_ce_b, dsp_x_ce_b};
+
+
+ reg [8 -1:0] dsp_xy_mode_z_adv1 = {8{1'b1}};
+ reg [8 -1:0] dsp_xy_mode_z_adv2 = {8{1'b1}};
+ reg [8 -1:0] dsp_xy_mode_z_adv3 = {8{1'b1}};
+ reg [8 -1:0] dsp_xy_mode_z_adv4 = {8{1'b1}};
+
+ dsp_array dsp_x
+ (
+ .clk (clk),
+
+ .ce_a (dsp_x_ce_a),
+ .ce_b (dsp_x_ce_b),
+ .ce_m (dsp_x_ce_m),
+ .ce_p (dsp_x_ce_p),
+ .ce_mode (dsp_x_ce_mode),
+
+ .mode_z (dsp_x_mode_z),
+
+ .a (dsp_x_a),
+ .b (dsp_x_b),
+ .p (dsp_x_p)
+ );
+
+ dsp_array dsp_y
+ (
+ .clk (clk),
+
+ .ce_a (dsp_y_ce_a),
+ .ce_b (dsp_y_ce_b),
+ .ce_m (dsp_y_ce_m),
+ .ce_p (dsp_y_ce_p),
+ .ce_mode (dsp_y_ce_mode),
+
+ .mode_z (dsp_y_mode_z),
+
+ .a (dsp_y_a),
+ .b (dsp_y_b),
+ .p (dsp_y_p)
+ );
+
+
+ //
+ // FSM State and Next States
+ // (fsm_state_next is computed by the combinational always @* block near the end of this module)
+ reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+
+
+ always @(posedge clk)
+ // state register with synchronous reset to IDLE
+ if (rst) fsm_state <= FSM_STATE_IDLE;
+ else fsm_state <= fsm_state_next;
+
+
+ localparam [7:0] index_last = 8'd31;
+ // index_last is the index of the final operand word (words 0..31)
+ // "almost done": slim address is one before index_last; "surely done": slim address equals index_last
+ wire mult_square_addr_almost_done_comb;
+ reg mult_square_addr_almost_done_flop;
+
+ wire mult_square_addr_surely_done_comb;
+ reg mult_square_addr_surely_done_flop;
+
+ assign mult_square_addr_almost_done_comb = mac_slim_bram_xy_addr == (index_last - 8'd1);
+ assign mult_square_addr_surely_done_comb = mac_slim_bram_xy_addr == index_last;
+
+ always @(posedge clk)
+ // registered copies of the two flags: tracked only in the BUSY states, cleared in every other state
+ case (fsm_state)
+
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <=
+ {mult_square_addr_surely_done_comb, mult_square_addr_almost_done_comb};
+
+ default:
+ {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 2'b00;
+
+ endcase
+
+
+ //
+ // Column
+ // (col_index and col_index_last are written by the "Increment Logic" block further down)
+ reg [4:0] col_index;
+ reg [4:0] col_index_prev;
+ reg [4:0] col_index_last;
+
+ always @(posedge clk)
+ // one-cycle delayed column index, used when computing dsp_xy_mode_z_adv4
+ col_index_prev <= col_index;
+
+ //
+ // FSM Transition Logic
+ //
+ wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+
+
+
+ always @(posedge clk)
+ // slim BRAM read address: 0 on INIT, then +1 per cycle, back to 0 once the almost-done flag is set; X outside the multiply states
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_slim_bram_xy_addr <= 8'd0;
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_addr <= !mult_square_addr_almost_done_flop ? mac_slim_bram_xy_addr + 1'b1 : 8'd0;
+ default: mac_slim_bram_xy_addr <= 8'dX;
+ endcase
+
+ integer j;
+ always @(posedge clk)
+ // fat BRAM read addresses: lane j starts at 8*column + 1 + 2*j (column 0 on COL_0_INIT, col_index+1 on COL_N_INIT), then steps via mac_fat_bram_xy_addr_next()
+ for (j=0; j<(NUM_MULTS/2); j=j+1)
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT: mac_fat_bram_xy_addr[j] <= 1 + 2 * j;
+ FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_fat_bram_xy_addr[j] <= 8 * (col_index + 1) + 1 + 2 * j;
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last);
+ default: mac_fat_bram_xy_addr[j] <= 8'dX;
+ endcase
+
+
+
+ always @(posedge clk)
+ // slim BRAM read bank: the T1/T2 bank in every multiply state, X otherwise
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_bank <= BANK_SLIM_T1T2;
+ default: mac_slim_bram_xy_bank <= 2'bXX;
+ endcase
+
+ always @(posedge clk)
+ // fat BRAM read bank: the T1/T2 bank in every multiply state, X otherwise
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_bank <= BANK_FAT_T1T2;
+ default: mac_fat_bram_xy_bank <= 3'bXXX;
+ endcase
+
+
+
+ always @(posedge clk)
+ // slim BRAM read enable: on during INIT/TRIG, during BUSY it is the inverse of the almost-done flag, off otherwise
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_slim_bram_xy_ena <= 1'b1;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_ena <= ~mult_square_addr_almost_done_flop;
+ default: mac_slim_bram_xy_ena <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ // fat BRAM read enable: on in every multiply state (INIT/TRIG/BUSY), off otherwise
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_ena <= 1'b1;
+ default: mac_fat_bram_xy_ena <= 1'b0;
+ endcase
+
+
+ always @(posedge clk)
+ // slim BRAM output-register enable (regceb) follows the read enable by one cycle
+ mac_slim_bram_xy_reg_ena <= mac_slim_bram_xy_ena;
+
+ always @(posedge clk)
+ // fat BRAM output-register enable (regceb) follows the read enable by one cycle
+ mac_fat_bram_xy_reg_ena <= mac_fat_bram_xy_ena;
+
+
+ always @(posedge clk)
+ // DSP B operands (low 17 bits of the slim BRAM words), crossed over: dsp_x_b gets the Y word, dsp_y_b the X word. NOTE(review): cross-over looks deliberate (X array then multiplies X by Y) - confirm
+ if (mac_slim_bram_xy_reg_ena_dly)
+ {dsp_y_b, dsp_x_b} <= {mac_slim_bram_x_dout[16:0], mac_slim_bram_y_dout[16:0]};
+ else
+ {dsp_y_b, dsp_x_b} <= {2{{17{1'bX}}}};
+
+
+ function [7:0] mac_fat_bram_xy_addr_next;
+ input [7:0] mac_fat_bram_xy_addr_current;
+ input [7:0] mac_fat_bram_xy_addr_last;
+ begin
+ // Step the address down by one; when the comparison below does not hold
+ // (address already 0, or the result is unknown) wrap around to the last address.
+ mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last;
+ if (mac_fat_bram_xy_addr_current > 0) mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1;
+ end
+ endfunction
+
+
+
+ always @(posedge clk)
+ // ce_a: high one cycle after the slim BRAM register enable or its delayed copy is high
+ {dsp_y_ce_a, dsp_x_ce_a} <= {2{mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly}};
+
+ always @(posedge clk)
+ // ce_b: the delayed register enable, registered once more
+ {dsp_y_ce_b, dsp_x_ce_b} <= {2{mac_slim_bram_xy_reg_ena_dly}};
+
+ always @(posedge clk)
+ // ce_m: ce_b delayed by two cycles (through ce_b_dly)
+ {dsp_y_ce_m, dsp_x_ce_m} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
+
+ always @(posedge clk)
+ // ce_p: ce_m delayed by one cycle
+ {dsp_y_ce_p, dsp_x_ce_p} <= {dsp_y_ce_m, dsp_x_ce_m};
+
+ always @(posedge clk)
+ // ce_mode: same source and timing as ce_m
+ {dsp_y_ce_mode, dsp_x_ce_mode} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
+
+ task wait_clock_tick; // advances simulation time by one clock period (a plain delay, not an edge wait)
+ begin
+ #`CLK_PERIOD_NS;
+ end
+ endtask
+
+ //
+ // Increment Logic
+ // Column counter; col_index_last is index_last[7:3], i.e. index_last divided by 8.
+ always @(posedge clk)
+ // no default branch: both registers hold their value in all other states
+ case (fsm_state_next)
+ // first column: restart the counter and latch the number of the last column
+ FSM_STATE_MULT_SQUARE_COL_0_INIT: begin
+ col_index <= 5'd0;
+ col_index_last <= index_last[7:3];
+ end
+ // every further column: advance by one
+ FSM_STATE_MULT_SQUARE_COL_N_INIT:
+ col_index <= col_index + 1'b1;
+ //
+ endcase
+ // state entered when a COL_N_BUSY phase ends: HOLDOFF after the last column, otherwise the next column's INIT
+ assign fsm_state_after_mult_square = (col_index == col_index_last) ? FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT;
+
+ always @(posedge clk)
+ // head of the mode_z delay line: all zeros on TRIG, mask from calc_mac_mode_z_square() while BUSY, all ones otherwise
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: dsp_xy_mode_z_adv4 <= {8{1'b0}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly);
+ default: dsp_xy_mode_z_adv4 <= {8{1'b1}};
+ endcase
+ // delay line adv4 -> adv3 -> adv2 -> adv1 -> dsp_x_mode_z / dsp_y_mode_z (both arrays receive the same value)
+ always @(posedge clk) begin
+ {dsp_y_mode_z, dsp_x_mode_z} <= {2{dsp_xy_mode_z_adv1}};
+ //
+ dsp_xy_mode_z_adv1 <= {dsp_xy_mode_z_adv2};
+ dsp_xy_mode_z_adv2 <= {dsp_xy_mode_z_adv3};
+ dsp_xy_mode_z_adv3 <= {dsp_xy_mode_z_adv4};
+ end
+
+ function [NUM_MULTS-1:0] calc_mac_mode_z_square;
+ input [ 4:0] col_index_value;
+ input [ 7:0] mac_slim_bram_xy_addr_value;
+ begin
+ // When the upper five address bits equal the column index, return a mask
+ // that is all ones except for a single 0 at bit position addr[2:0];
+ // for any other address return all ones.
+ if (mac_slim_bram_xy_addr_value[7:3] == col_index_value)
+ case (mac_slim_bram_xy_addr_value[2:0])
+ // one cleared bit per value: 0 -> bit 0, 1 -> bit 1, ... 7 -> bit 7
+ 3'd0: calc_mac_mode_z_square = 8'hFE; 3'd1: calc_mac_mode_z_square = 8'hFD;
+ 3'd2: calc_mac_mode_z_square = 8'hFB; 3'd3: calc_mac_mode_z_square = 8'hF7;
+ 3'd4: calc_mac_mode_z_square = 8'hEF; 3'd5: calc_mac_mode_z_square = 8'hDF;
+ 3'd6: calc_mac_mode_z_square = 8'hBF; 3'd7: calc_mac_mode_z_square = 8'h7F;
+ endcase
+ else
+ calc_mac_mode_z_square = {NUM_MULTS{1'b1}};
+ end
+ endfunction
+
+ reg recomb_x_ena = 1'b0;
+ reg recomb_y_ena = 1'b0;
+
+ always @(posedge clk) begin
+ // recombinator enables: set for the cycle after ce_a is high while ce_b, ce_m and ce_p are all still low
+ recomb_x_ena <= dsp_x_ce_a && !dsp_x_ce_b && !dsp_x_ce_m && !dsp_x_ce_p;
+ recomb_y_ena <= dsp_y_ce_a && !dsp_y_ce_b && !dsp_y_ce_m && !dsp_y_ce_p;
+ // NOTE(review): presumably this marks the start of each column's DSP activity - confirm against modexpng_part_recombinator
+ end
+
+ wire [ 2:0] recomb_fat_bram_xy_bank;
+ wire [ 7:0] recomb_fat_bram_xy_addr;
+ wire [17:0] recomb_fat_bram_x_dout;
+ wire [17:0] recomb_fat_bram_y_dout;
+ wire recomb_fat_bram_xy_dout_valid;
+ wire recomb_rdy;
+
+ modexpng_part_recombinator recomb
+ (
+ .clk (clk),
+ .rdy (recomb_rdy),
+ .fsm_state_next (fsm_state_next),
+ .index_last (index_last),
+ .dsp_x_ce_p (dsp_x_ce_p),
+ .dsp_y_ce_p (dsp_y_ce_p),
+ .ena_x (recomb_x_ena),
+ .ena_y (recomb_y_ena),
+ .dsp_x_p (dsp_x_p),
+ .dsp_y_p (dsp_y_p),
+ .col_index (col_index),
+ .col_index_last (col_index_last),
+ .slim_bram_xy_addr (mac_slim_bram_xy_addr),
+ .fat_bram_xy_bank (recomb_fat_bram_xy_bank),
+ .fat_bram_xy_addr (recomb_fat_bram_xy_addr),
+ .fat_bram_x_dout (recomb_fat_bram_x_dout),
+ .fat_bram_y_dout (recomb_fat_bram_y_dout),
+ .fat_bram_xy_dout_valid (recomb_fat_bram_xy_dout_valid)
+ );
+
+ reg [17:0] AB_READ[0:63];
+ // shadow copy of the X-side words output by the recombinator; compared against AB by verify_ab
+ always @(posedge clk)
+ // bank 1 words go to AB_READ[addr], bank 2 words to AB_READ[32 + addr]; words for other banks are not captured
+ if (recomb_fat_bram_xy_dout_valid)
+ //
+ case (recomb_fat_bram_xy_bank)
+ 3'd1: AB_READ[recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout;
+ 3'd2: AB_READ[32 + recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout;
+ endcase
+
+
+ always @(posedge clk)
+ // fat BRAM write-port source select: the testbench loader has priority over the recombinator; with neither active the port is disabled and its buses are X
+ if (tb_fat_bram_xy_ena) begin
+ mgr_fat_bram_xy_ena <= 1'b1;
+ mgr_fat_bram_xy_bank <= tb_fat_bram_xy_bank;
+ mgr_fat_bram_xy_addr <= tb_fat_bram_xy_addr;
+ mgr_fat_bram_x_din <= tb_fat_bram_x_din;
+ mgr_fat_bram_y_din <= tb_fat_bram_y_din;
+ end else if (recomb_fat_bram_xy_dout_valid) begin
+ mgr_fat_bram_xy_ena <= 1'b1;
+ mgr_fat_bram_xy_bank <= recomb_fat_bram_xy_bank;
+ mgr_fat_bram_xy_addr <= recomb_fat_bram_xy_addr;
+ mgr_fat_bram_x_din <= recomb_fat_bram_x_dout;
+ mgr_fat_bram_y_din <= recomb_fat_bram_y_dout;
+ end else begin
+ mgr_fat_bram_xy_ena <= 1'b0;
+ mgr_fat_bram_xy_bank <= 3'bXXX;
+ mgr_fat_bram_xy_addr <= 8'hXX;
+ mgr_fat_bram_x_din <= {18{1'bX}};
+ mgr_fat_bram_y_din <= {18{1'bX}};
+ end
+
+
+
+
+
+ // verify_ab: prints all 64 expected/captured word pairs and a final verdict.
+ task verify_ab;
+ reg all_words_match;
+ begin
+ // 4-state comparison: a captured word containing X or Z counts as a mismatch
+ all_words_match = 1'b1;
+ for (i=0; i<64; i=i+1)
+ if (AB_READ[i] !== AB[i]) begin
+ $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x <???>", i, AB[i], AB_READ[i]);
+ all_words_match = 1'b0;
+ end else
+ $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x", i, AB[i], AB_READ[i]);
+ // overall verdict
+ if (!all_words_match) $display("AB is WRONG!");
+ else $display("AB is OK.");
+ end
+ endtask
+
+
+
+ always @* begin
+ // next-state logic: IDLE -> COL_0 INIT/TRIG/BUSY -> COL_N INIT/TRIG/BUSY (once per further column) -> HOLDOFF -> IDLE
+ fsm_state_next = FSM_STATE_IDLE;
+ // a BUSY state is left once the surely-done flag is set; HOLDOFF waits for recomb_rdy
+ case (fsm_state)
+ FSM_STATE_IDLE: fsm_state_next = ena ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE;
+
+ FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_TRIG ;
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+
+ FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_TRIG ;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+
+ FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = recomb_rdy ? FSM_STATE_IDLE : FSM_STATE_MULT_SQUARE_HOLDOFF;
+
+ default: fsm_state_next = FSM_STATE_IDLE ;
+
+ endcase
+ //
+ end
+
+
+endmodule
+
diff --git a/rtl/dev/temp.txt b/rtl/dev/temp.txt
new file mode 100644
index 0000000..987bd86
--- /dev/null
+++ b/rtl/dev/temp.txt
@@ -0,0 +1,384 @@
+ //
+ // Helper Functions
+ //
+ /*
+ function [INDEX_WIDTH-1:0] calc_preset_a_index;
+ input [INDEX_WIDTH-4:0] col_in;
+ input integer x_in;
+ integer index_out;
+ begin
+ index_out = col_in * NUM_MULTS + x_in;
+ calc_preset_a_index = index_out[INDEX_WIDTH-1:0];
+ end
+ endfunction
+
+ function [INDEX_WIDTH-1:0] calc_rotate_a_index;
+ input [INDEX_WIDTH-1:0] current_index_in;
+ input [INDEX_WIDTH-1:0] last_index_in;
+ begin
+ if (current_index_in > {INDEX_WIDTH{1'b0}})
+ calc_rotate_a_index = current_index_in - 1'b1;
+ else
+ calc_rotate_a_index = last_index_in;
+ end
+ endfunction
+ */
+
+ /*
+ //
+ // Narrow Counters
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_narrow_reg;
+ reg [INDEX_WIDTH-1:0] din_addr_narrow_dly;
+ localparam [INDEX_WIDTH-1:0] din_addr_narrow_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] din_addr_narrow_next = (din_addr_narrow_reg < index_last) ?
+ din_addr_narrow_reg + 1'b1 : din_addr_narrow_zero;
+ wire din_addr_narrow_done = din_addr_narrow_reg == index_last;
+
+ assign din_addr_narrow = din_addr_narrow_reg;
+
+ always @(posedge clk)
+ //
+ din_addr_narrow_dly <= din_addr_narrow_reg;
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: din_addr_narrow_reg <= din_addr_narrow_next;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_narrow_reg <= din_addr_narrow_next;
+ endcase
+
+
+ //
+ // Helper Functions
+ //
+ function [NUM_MULTS-1:0] calc_mac_clear_bitmask;
+ input [2:0] t;
+ begin
+ case (t)
+ 3'd0: calc_mac_clear_bitmask = 8'b00000001;
+ 3'd1: calc_mac_clear_bitmask = 8'b00000010;
+ 3'd2: calc_mac_clear_bitmask = 8'b00000100;
+ 3'd3: calc_mac_clear_bitmask = 8'b00001000;
+ 3'd4: calc_mac_clear_bitmask = 8'b00010000;
+ 3'd5: calc_mac_clear_bitmask = 8'b00100000;
+ 3'd6: calc_mac_clear_bitmask = 8'b01000000;
+ 3'd7: calc_mac_clear_bitmask = 8'b10000000;
+ endcase
+ end
+ endfunction
+
+ function [NUM_MULTS:0] calc_mac_clear_square;
+ input [INDEX_WIDTH-4:0] current_col_index;
+ input [INDEX_WIDTH-1:0] b_addr_prev;
+ begin
+ if (b_addr_prev[INDEX_WIDTH-1:3] == current_col_index)
+ calc_mac_clear_square = {1'b0, calc_mac_clear_bitmask(b_addr_prev[2:0])};
+ else
+ calc_mac_clear_square = {1'b0, {NUM_MULTS{1'b0}}};
+ end
+ endfunction
+
+
+ //
+ // Wide Counters
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_wide_reg[0:NUM_MULTS-1];
+
+ integer xi;
+ always @(posedge clk)
+ //
+ for (xi=0; xi<NUM_MULTS; xi=xi+1)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(0, xi);
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(col_index + 1'b1, xi);
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_wide_reg[xi] <= calc_rotate_a_index(din_addr_wide_reg[xi], index_last);
+ //
+ endcase
+
+
+ //
+ // Enables
+ //
+ reg din_ena_narrow_reg = 1'b0;
+ reg [NUM_MULTS-1:0] din_ena_wide_reg = {NUM_MULTS{1'b0}};
+
+ assign din_ena_narrow = din_ena_narrow_reg;
+ assign din_ena_wide = din_ena_wide_reg;
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) din_ena_narrow_reg <= 1'b0;
+ else case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_narrow_reg <= 1'b1;
+ default: din_ena_narrow_reg <= 1'b0;
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+ else case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_wide_reg <= {NUM_MULTS{1'b1}};
+ default: din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+ endcase
+
+
+ //
+ // Modes
+ //
+ reg [2-1:0] din_mode_wide_reg;
+ reg [2-1:0] din_mode_narrow_reg;
+ reg [2-1:0] dout_mode_wide_reg;
+ reg [2-1:0] dout_mode_narrow_reg;
+
+ assign din_mode_wide = din_mode_wide_reg;
+ assign din_mode_narrow = din_mode_narrow_reg;
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_wide_reg <= MODEXPNG_MODE_A;
+ default: din_mode_wide_reg <= 2'bXX;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_narrow_reg <= MODEXPNG_MODE_B;
+ default: din_mode_narrow_reg <= 2'bXX;
+ endcase
+
+
+ //
+ // MAC Array
+ //
+ wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_a[0:NUM_MULTS];
+ wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_b;
+ reg [ NUM_MULTS :0] mac_ce;
+ reg [ NUM_MULTS :0] mac_clr;
+ wire [ MODEXPNG_MAC_WIDTH-1:0] mac_p[0:NUM_MULTS];
+ reg [ NUM_MULTS :0] mac_rdy_lsb;
+ reg [ NUM_MULTS :0] mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1:0];
+
+ //reg [ NUM_MULTS :0] mac_ce_dly[MODEXPNG_MAC_LATENCY-1:0];
+ //wire [ NUM_MULTS :0] mac_rdy;
+
+
+
+
+
+ assign mac_din_b = din_narrow;
+
+
+ genvar x;
+ generate for (x=0; x<=NUM_MULTS; x=x+1)
+ begin : gen_macs
+ //
+ //assign mac_rdy[x] = mac_ce_dly[MODEXPNG_MAC_LATENCY-1][x];
+ //
+ modexpng_mac mac_inst
+ (
+ .clk (clk),
+ .ce (mac_ce[x]),
+ .clr (mac_clr[x]),
+ .a (mac_din_a[x]),
+ .b (mac_din_b),
+ .p (mac_p[x])
+ );
+ //
+ end
+ //
+ endgenerate
+
+ generate for (x=0; x<NUM_MULTS; x=x+1)
+ begin : gen_mac_din_a
+ //
+ assign mac_din_a[x] = din_wide[x*MODEXPNG_WORD_WIDTH+:MODEXPNG_WORD_WIDTH];
+ //
+ end
+ endgenerate
+
+ generate for (x=0; x<NUM_MULTS; x=x+1)
+ begin : gen_din_addr_wide
+ //
+ assign din_addr_wide[x*INDEX_WIDTH+:INDEX_WIDTH] = din_addr_wide_reg[x];
+ //
+ end
+ endgenerate
+
+
+ //
+ // MAC Clock Enable Logic
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+ else case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_ce <= {1'b0, {NUM_MULTS{1'b1}}};
+ default: mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+ endcase
+
+
+ //
+ // MAC Valid Logic
+ //
+ integer y;
+
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_rdy_lsb_dly[0][xi] <= mac_rdy_lsb[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_rdy_lsb_dly[y][xi] <= mac_rdy_lsb_dly[y-1][xi];
+ end
+
+ always @(posedge clk) begin
+ //
+ fsm_state_dly[0] <= fsm_state;
+ for (y=1; y<=MODEXPNG_MAC_LATENCY; y=y+1)
+ fsm_state_dly[y] <= fsm_state_dly[y-1];
+ end
+
+ */
+
+ /*
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_ce_dly[0][xi] <= mac_ce[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_ce_dly[y][xi] <= mac_ce_dly[y-1][xi];
+ end
+ */
+ /*
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_clr_dly[0][xi] <= mac_clr[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_clr_dly[y][xi] <= mac_clr_dly[y-1][xi];
+ end
+ */
+
+ /*
+ //
+ // MAC Clear Logic
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_clr <= {1'b0, {NUM_MULTS{1'b1}}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_clr <= calc_mac_clear_square(col_index, din_addr_narrow_dly);
+ default: mac_clr <= {1'bX, {NUM_MULTS{1'bX}}};
+ endcase
+
+
+ //
+ // MAC Ready Logic
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_rdy_lsb <= calc_mac_clear_square(col_index, din_addr_narrow);
+ default: mac_rdy_lsb <= {1'bX, {NUM_MULTS{1'bX}}};
+ endcase
+
+
+
+ //
+ // Recombinators
+ //
+ reg rcmb_lsb_ce;
+ reg rcmb_lsb_clr;
+ reg [MODEXPNG_MAC_WIDTH-1: 0] rcmb_lsb_din;
+ wire [15: 0] rcmb_lsb_dout;
+
+ modexpng_part_recombinator recomb_lsb
+ (
+ .clk (clk),
+ .ce (rcmb_lsb_ce),
+ .clr (rcmb_lsb_clr),
+ .din (rcmb_lsb_din),
+ .dout (rcmb_lsb_dout)
+ );
+
+
+ reg calc_rcmb_lsb_ce;
+ always @*
+ //
+ calc_rcmb_lsb_ce = | mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0];
+
+ reg [MODEXPNG_MAC_WIDTH-1:0] calc_rcmb_lsb_din;
+
+ always @*
+ //
+ casez (mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0])
+ 8'b00000001: calc_rcmb_lsb_din = mac_p[0];
+ 8'b00000010: calc_rcmb_lsb_din = mac_p[1];
+ 8'b00000100: calc_rcmb_lsb_din = mac_p[2];
+ 8'b00001000: calc_rcmb_lsb_din = mac_p[3];
+ 8'b00010000: calc_rcmb_lsb_din = mac_p[4];
+ 8'b00100000: calc_rcmb_lsb_din = mac_p[5];
+ 8'b01000000: calc_rcmb_lsb_din = mac_p[6];
+ 8'b10000000: calc_rcmb_lsb_din = mac_p[7];
+ default: calc_rcmb_lsb_din = {MODEXPNG_MAC_WIDTH{1'bX}};
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0)
+ rcmb_lsb_ce <= 1'b0;
+ else case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_ce <= calc_rcmb_lsb_ce;
+ default: rcmb_lsb_ce <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: rcmb_lsb_clr <= 1'b1;
+ default: rcmb_lsb_clr <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_din <= calc_rcmb_lsb_din;
+ default: rcmb_lsb_din <= {MODEXPNG_MAC_WIDTH{1'bX}};
+ endcase
+
+
+
+*/
diff --git a/rtl/dsp/dsp_array.v b/rtl/dsp/dsp_array.v
new file mode 100644
index 0000000..178f87f
--- /dev/null
+++ b/rtl/dsp/dsp_array.v
@@ -0,0 +1,111 @@
+module dsp_array
+(
+ input clk,
+ // clock enables, forwarded to every dsp_slice instance (ce_a/ce_b through the delay chain below)
+ input ce_a,
+ input ce_b,
+ input ce_m,
+ input ce_p,
+ input ce_mode,
+ // one mode bit per slice; bit k is placed into OPMODE[5] of slice k
+ input [8 -1:0] mode_z,
+ // four 18-bit A operands (one per slice pair), one shared 17-bit B operand, eight 47-bit results
+ input [4*18-1:0] a,
+ input [1*17-1:0] b,
+ output [8*47-1:0] p
+);
+ // NUM_MULTS is expected to come from the parameter header included below
+ `include "../modexpng_parameters_x8.vh"
+ // cascade buses from the "direct" slice of each pair to its "cascade" slice, one entry per generated pair
+ wire [17:0] casc_a[0:(NUM_MULTS/2)-1];
+ wire [16:0] casc_b[0:(NUM_MULTS/2)-1];
+ // ce_a delayed by one and two clocks, ce_b delayed by one clock
+ wire ce_a0 = ce_a;
+ reg ce_a1 = 1'b0;
+ reg ce_a2 = 1'b0;
+
+ wire ce_b0 = ce_b;
+ reg ce_b1 = 1'b0;
+
+ always @(posedge clk) begin
+ ce_a1 <= ce_a0;
+ ce_a2 <= ce_a1;
+ ce_b1 <= ce_b0;
+ end
+
+
+ genvar z;
+ generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+ // each iteration builds one pair: slice 2*z takes its operands directly, slice 2*z+1 through the cascade
+ begin : DSP48E1
+ //
+ dsp_slice #
+ (
+ .AB_INPUT("DIRECT"),
+ .B_REG(2)
+ )
+ dsp_direct
+ (
+ .clk (clk),
+
+ .ce_a1 (ce_a0),
+ .ce_b1 (ce_b0),
+ .ce_a2 (ce_a1),
+ .ce_b2 (ce_b1),
+ .ce_m (ce_m),
+ .ce_p (ce_p),
+ .ce_mode (ce_mode),
+
+ .a (a[z*18+:18]),
+ .b (b),
+ .p (p[47*2*z+:47]),
+
+ .inmode (5'b00000),
+ .opmode ({1'b0, mode_z[2*z], 1'b0, 2'b01, 2'b01}),
+ .alumode (4'b0000),
+ // cascade inputs of the direct slice are tied off at the full port width (18 bits for A, 17 bits for B)
+ .casc_a_in ({18{1'b0}}),
+ .casc_b_in ({17{1'b0}}),
+
+ .casc_a_out (casc_a[z]),
+ .casc_b_out (casc_b[z])
+ );
+ //
+ dsp_slice #
+ (
+ .AB_INPUT("CASCADE"),
+ .B_REG(1)
+ )
+ dsp_cascade
+ (
+ .clk (clk),
+
+ .ce_a1 (ce_a1),
+ .ce_b1 (1'b0),
+ .ce_a2 (ce_a2),
+ .ce_b2 (ce_b1),
+ .ce_m (ce_m),
+ .ce_p (ce_p),
+ .ce_mode (ce_mode),
+
+ .a (a[z*18+:18]),
+ .b (b),
+ .p (p[47*(2*z+1)+:47]),
+
+ .inmode (5'b00000),
+ .opmode ({1'b0, mode_z[2*z+1], 1'b0, 2'b01, 2'b01}),
+ .alumode (4'b0000),
+ // operands arrive from the direct slice of the same pair
+ .casc_a_in (casc_a[z]),
+ .casc_b_in (casc_b[z]),
+
+ .casc_a_out (),
+ .casc_b_out ()
+ );
+ //
+ end
+ //
+ endgenerate
+
+
+endmodule
diff --git a/rtl/dsp/dsp_slice.v b/rtl/dsp/dsp_slice.v
new file mode 100644
index 0000000..9f1298b
--- /dev/null
+++ b/rtl/dsp/dsp_slice.v
@@ -0,0 +1,125 @@
+module dsp_slice #
+(
+ parameter AB_INPUT = "DIRECT", // forwarded to both A_INPUT and B_INPUT of the primitive
+ parameter B_REG = 2 // forwarded to BREG of the primitive
+)
+(
+ input clk,
+ input ce_a1,
+ input ce_b1,
+ input ce_a2,
+ input ce_b2,
+ input ce_m,
+ input ce_p,
+ input ce_mode,
+ input [17:0] a,
+ input [16:0] b,
+ output [46:0] p,
+ input [ 4:0] inmode,
+ input [ 6:0] opmode,
+ input [ 3:0] alumode,
+ input [17:0] casc_a_in,
+ input [16:0] casc_b_in,
+ output [17:0] casc_a_out,
+ output [16:0] casc_b_out
+);
+ // Wrapper around the DSP48E1 primitive exposing 18-bit A, 17-bit B and 47-bit P; the dummy wires absorb the unused upper output bits.
+ wire [30-18-1:0] casc_a_dummy;
+ wire [18-17-1:0] casc_b_dummy;
+ wire [48-47-1:0] p_dummy;
+
+ DSP48E1 #
+ (
+ .AREG (2),
+ .BREG (B_REG),
+ .CREG (0),
+ .DREG (0),
+ .ADREG (0),
+ .MREG (1),
+ .PREG (1),
+ .ACASCREG (1),
+ .BCASCREG (1),
+ .INMODEREG (0),
+ .OPMODEREG (1),
+ .ALUMODEREG (0),
+ .CARRYINREG (0),
+ .CARRYINSELREG (0),
+
+ .A_INPUT (AB_INPUT),
+ .B_INPUT (AB_INPUT),
+
+ .USE_DPORT ("FALSE"),
+ .USE_MULT ("DYNAMIC"),
+ .USE_SIMD ("ONE48"),
+
+ .MASK (48'h3fffffffffff),
+ .PATTERN (48'h000000000000),
+ .SEL_MASK ("MASK"),
+ .SEL_PATTERN ("PATTERN"),
+
+ .USE_PATTERN_DETECT ("NO_PATDET"),
+ .AUTORESET_PATDET ("NO_RESET")
+ )
+ DSP48E1_inst
+ (
+ .CLK (clk),
+ // clock enables: only the A/B, M, P and control (OPMODE) registers are ever enabled
+ .CEA1 (ce_a1),
+ .CEB1 (ce_b1),
+ .CEA2 (ce_a2),
+ .CEB2 (ce_b2),
+ .CEAD (1'b0),
+ .CEC (1'b0),
+ .CED (1'b0),
+ .CEM (ce_m),
+ .CEP (ce_p),
+ .CEINMODE (1'b0),
+ .CECTRL (ce_mode),
+ .CEALUMODE (1'b0),
+ .CECARRYIN (1'b0),
+ // data ports: a and b are zero-extended to the primitive widths, C and D are tied to zero
+ .A ({{(30-18){1'b0}}, a}),
+ .B ({{(18-17){1'b0}}, b}),
+ .C ({48{1'b0}}),
+ .D ({25{1'b0}}),
+ .P ({p_dummy, p}),
+
+ .INMODE (inmode),
+ .OPMODE (opmode),
+ .ALUMODE (alumode),
+ // cascade ports: only the A and B cascades are used, the P cascade input is tied to zero
+ .ACIN ({{(30-18){1'b0}}, casc_a_in}),
+ .BCIN ({{(18-17){1'b0}}, casc_b_in}),
+ .ACOUT ({casc_a_dummy, casc_a_out}),
+ .BCOUT ({casc_b_dummy, casc_b_out}),
+ .PCIN ({48{1'b0}}),
+ .PCOUT (),
+ .CARRYCASCIN (1'b0),
+ .CARRYCASCOUT (),
+ // all synchronous resets are tied inactive
+ .RSTA (1'b0),
+ .RSTB (1'b0),
+ .RSTC (1'b0),
+ .RSTD (1'b0),
+ .RSTM (1'b0),
+ .RSTP (1'b0),
+ .RSTINMODE (1'b0),
+ .RSTCTRL (1'b0),
+ .RSTALUMODE (1'b0),
+ .RSTALLCARRYIN (1'b0),
+
+ .UNDERFLOW (),
+ .OVERFLOW (),
+ .PATTERNDETECT (),
+ .PATTERNBDETECT (),
+
+ .CARRYIN (1'b0),
+ .CARRYOUT (),
+ .CARRYINSEL (3'b000),
+
+ .MULTSIGNIN (1'b0),
+ .MULTSIGNOUT ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mac.v b/rtl/modexpng_mac.v
new file mode 100644
index 0000000..9105dab
--- /dev/null
+++ b/rtl/modexpng_mac.v
@@ -0,0 +1,54 @@
+module modexpng_mac
+(
+ input clk,
+ input ce,
+ input clr,
+ input casc_a,
+ input [16:0] a_in,
+ input [16:0] b_in,
+ output [46:0] p_out,
+ input [16:0] a_casc_in,
+ output [16:0] a_casc_out
+);
+
+ // Behavioral multiply-accumulate cell: operand registers -> product register -> accumulator.
+
+ // Stage 1: operand registers. The A operand is taken from the cascade input when casc_a is set.
+ reg [16:0] a_reg;
+ reg [16:0] b_reg;
+ always @(posedge clk)
+ if (ce) begin
+ a_reg <= casc_a ? a_casc_in : a_in;
+ b_reg <= b_in;
+ end
+ assign a_casc_out = a_reg;
+
+ // Clock enable delayed by one and two cycles; the delayed copies gate stages 2 and 3.
+ reg ce_dly1;
+ reg ce_dly2;
+ always @(posedge clk) begin
+ ce_dly1 <= ce;
+ ce_dly2 <= ce_dly1;
+ end
+
+ // The clear flag travels alongside the data, advancing only with the matching enable.
+ reg clr_dly1;
+ reg clr_dly2;
+ always @(posedge clk)
+ if (ce) clr_dly1 <= clr;
+ always @(posedge clk)
+ if (ce_dly1) clr_dly2 <= clr_dly1;
+
+ // Stage 2: 17x17 product, zero-extended to the accumulator width.
+ reg [33:0] m_reg;
+ wire [46:0] m_reg_ext;
+ assign m_reg_ext = {13'd0, m_reg};
+ always @(posedge clk)
+ if (ce_dly1) m_reg <= {17'd0, a_reg} * {17'd0, b_reg};
+
+ // Stage 3: accumulator; clr_dly2 loads the product instead of adding it.
+ reg [46:0] p_reg;
+ always @(posedge clk)
+ if (ce_dly2) p_reg <= clr_dly2 ? m_reg_ext : m_reg_ext + p_reg;
+ assign p_out = p_reg;
+endmodule
diff --git a/rtl/modexpng_mac_array.v b/rtl/modexpng_mac_array.v
new file mode 100644
index 0000000..067929e
--- /dev/null
+++ b/rtl/modexpng_mac_array.v
@@ -0,0 +1,116 @@
+module modexpng_mac_array
+(
+ clk,
+ ce, clr,
+ ce_aux, clr_aux,
+ casc_a, casc_a_aux,
+ a_in, b_in, p_out,
+ a_in_aux, p_out_aux
+);
+
+
+ //
+ // Includes
+ // (NUM_MULTS, WORD_WIDTH and MAC_WIDTH are expected to come from these headers)
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+
+
+ //
+ // Ports
+ // (ce and b_in are shared by the NUM_MULTS main MACs; the auxiliary MAC has its own ce/clr/a_in/p_out)
+ input clk;
+ input ce;
+ input [NUM_MULTS -1:0] clr;
+ input ce_aux;
+ input clr_aux;
+ input [NUM_MULTS -2:0] casc_a;
+ input casc_a_aux;
+ input [NUM_MULTS * WORD_WIDTH -1:0] a_in;
+ input [ 1 * WORD_WIDTH -1:0] b_in;
+ output [NUM_MULTS * MAC_WIDTH -1:0] p_out;
+ input [ 1 * WORD_WIDTH -1:0] a_in_aux;
+ output [ 1 * MAC_WIDTH -1:0] p_out_aux;
+
+
+ //
+ // A-Cascade Paths
+ // (a_casc_int[k] carries the A register of MAC k to MAC k+1; a_casc_int_aux links the MSB MAC to the AUX MAC)
+ wire [WORD_WIDTH-1:0] a_casc_int[0:NUM_MULTS-2];
+ wire [WORD_WIDTH-1:0] a_casc_int_aux;
+
+
+ //
+ // LSB
+ // (first MAC of the chain: cascade input disabled and tied to zero)
+ modexpng_mac mac_lsb
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[0]),
+ .casc_a (1'b0),
+ .a_in (a_in[0+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[0+:MAC_WIDTH]),
+ .a_casc_in ({WORD_WIDTH{1'b0}}),
+ .a_casc_out (a_casc_int[0])
+ );
+
+
+ //
+ // INT
+ // (MACs 1 .. NUM_MULTS-2; casc_a[z-1] selects the cascade input of MAC z)
+ genvar z;
+ generate for (z=1; z<(NUM_MULTS-1); z=z+1)
+ begin : gen_modexpng_mac_int
+ modexpng_mac mac_int
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[z]),
+ .casc_a (casc_a[z-1]),
+ .a_in (a_in[z*WORD_WIDTH+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[z*MAC_WIDTH+:MAC_WIDTH]),
+ .a_casc_in (a_casc_int[z-1]),
+ .a_casc_out (a_casc_int[z])
+ );
+ end
+ endgenerate
+
+
+ //
+ // MSB
+ // (last main MAC; its cascade output feeds the AUX MAC)
+ modexpng_mac mac_msb
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[NUM_MULTS-1]),
+ .casc_a (casc_a[NUM_MULTS-2]),
+ .a_in (a_in[(NUM_MULTS-1)*WORD_WIDTH+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[(NUM_MULTS-1)*MAC_WIDTH+:MAC_WIDTH]),
+ .a_casc_in (a_casc_int[NUM_MULTS-2]),
+ .a_casc_out (a_casc_int_aux)
+ );
+
+
+ //
+ // AUX
+ // (extra MAC with separate enable/clear; shares b_in and ends the cascade chain)
+ modexpng_mac mac_aux
+ (
+ .clk (clk),
+ .ce (ce_aux),
+ .clr (clr_aux),
+ .casc_a (casc_a_aux),
+ .a_in (a_in_aux),
+ .b_in (b_in),
+ .p_out (p_out_aux),
+ .a_casc_in (a_casc_int_aux),
+ .a_casc_out ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mem.v b/rtl/modexpng_mem.v
new file mode 100644
index 0000000..ca89214
--- /dev/null
+++ b/rtl/modexpng_mem.v
@@ -0,0 +1,93 @@
+//
+// TODO: Add license text!
+//
+
+module modexpng_mem #
+(
+ parameter MEM_WIDTH = 17,
+ parameter MEM_ADDR_BITS = 6
+)
+(
+ input clk,
+ // port A: write side (a_out carries a constant, see below)
+ input [MEM_ADDR_BITS-1:0] a_addr,
+ input a_en,
+ input a_wr,
+ input [MEM_WIDTH -1:0] a_in,
+ output [MEM_WIDTH -1:0] a_out,
+ // port B: read side with a two-stage output (b_en loads the first stage, b_reg_en the second)
+ input [MEM_ADDR_BITS-1:0] b_addr,
+ input b_en,
+ input b_reg_en,
+ output [MEM_WIDTH -1:0] b_out
+);
+
+
+ //
+ // BRAM
+ // (2**MEM_ADDR_BITS words of MEM_WIDTH bits, with a block RAM style attribute)
+ (* RAM_STYLE="BLOCK" *)
+ reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+ //
+ // Initialization for Simulation
+ // (currently disabled)
+ /*
+ integer c;
+ initial begin
+ for (c=0; c<(2**MEM_ADDR_BITS); c=c+1)
+ bram[c] = {MEM_WIDTH{1'b0}};
+ end
+ */
+
+
+
+ //
+ // Output Registers
+ //
+ reg [MEM_WIDTH-1:0] bram_b;
+ reg [MEM_WIDTH-1:0] bram_b_reg;
+ // NOTE(review): a_out is driven by a 32-bit literal, which is truncated or zero-extended to MEM_WIDTH bits; presumably a placeholder -- confirm
+ assign a_out = 32'hDEADCE11;
+ assign b_out = bram_b_reg;
+
+
+ //
+ // Note, that when both ports are accessing the same location, conflict can
+ // potentially arise. See Xilinx UG473 (pages 19-20, "Conflict
+ // Avoidance") for more information. In our configuration to avoid that the
+ // write port must be coded to operate in READ_FIRST mode. If the write
+ // port is overwriting the same address the read port is accessing, the
+ // write port must read the previously stored data (not the data it is
+ // writing, as that would be WRITE_FIRST mode).
+ //
+
+
+ //
+ // Write-Only Port A
+ //
+ always @(posedge clk)
+ // a write needs both a_en and a_wr
+ if (a_en)
+ //
+ if (a_wr) bram[a_addr] <= a_in;
+
+
+ //
+ // Read-Only Port B
+ //
+ always @(posedge clk)
+ // first stage: memory read into bram_b
+ if (b_en)
+ //
+ bram_b <= bram[b_addr];
+
+ always @(posedge clk)
+ // second stage: bram_b copied into the output register
+ if (b_reg_en)
+ //
+ bram_b_reg <= bram_b;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_col_index.v b/rtl/modexpng_mmm_col_index.v
new file mode 100644
index 0000000..b904795
--- /dev/null
+++ b/rtl/modexpng_mmm_col_index.v
@@ -0,0 +1,90 @@
+module modexpng_mmm_col_index
+(
+ clk,
+ index_last,
+ fsm_state_next,
+ col_index,
+ col_index_done,
+ col_index_zero,
+ col_index_next,
+ col_index_prev
+);
+
+
+ //
+ // Tracks which column of the square multiplication is being processed.
+ //
+ // FSM state encodings (FSM_STATE_*) come from the header below.
+ //
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Width of a word index; a column index is three bits narrower.
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Inputs
+ //
+ input clk;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+
+
+ //
+ // Outputs
+ //
+ output [INDEX_WIDTH-4:0] col_index;
+ output [INDEX_WIDTH-4:0] col_index_prev;
+ output [INDEX_WIDTH-4:0] col_index_zero;
+ output [INDEX_WIDTH-4:0] col_index_next;
+ output col_index_done;
+
+
+ //
+ // State: current column, final column, current column delayed by one clock
+ //
+ reg [INDEX_WIDTH-4:0] col_index_reg;
+ reg [INDEX_WIDTH-4:0] col_index_last;
+ reg [INDEX_WIDTH-4:0] col_index_dly;
+
+
+ //
+ // Registered outputs
+ //
+ assign col_index_prev = col_index_dly;
+ assign col_index = col_index_reg;
+
+
+ //
+ // Derived outputs
+ //
+ assign col_index_zero = {(INDEX_WIDTH-3){1'b0}};
+ assign col_index_next = col_index_reg + 1'b1;
+ assign col_index_done = (col_index_reg == col_index_last);
+
+
+ //
+ // Column counter
+ //
+ always @(posedge clk)
+ // first column: restart the counter and latch the number of the final column
+ if (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG) begin
+ col_index_last <= index_last[INDEX_WIDTH-1:3];
+ col_index_reg <= col_index_zero;
+ end
+ // every following column: advance by one
+ else if (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG)
+ col_index_reg <= col_index_next;
+ // all other states leave both registers unchanged
+
+
+ //
+ // One-clock delay of the column index
+ //
+ always @(posedge clk) col_index_dly <= col_index_reg;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_din_addr.v b/rtl/modexpng_mmm_din_addr.v
new file mode 100644
index 0000000..565c7e0
--- /dev/null
+++ b/rtl/modexpng_mmm_din_addr.v
@@ -0,0 +1,167 @@
+module modexpng_mmm_din_addr
+(
+ clk, rst_n,
+ index_last,
+ fsm_state_next,
+ col_index_zero, col_index_next,
+ din_addr, din_bank, din_ena, din_reg_ena,
+ din_addr_cnt, din_addr_cnt_last,
+ din_addr_cnt_lower_prev, din_addr_cnt_upper_prev
+);
+
+
+ //
+ // Includes
+ // (the BANK_XY_* and FSM_STATE_* constants used below are expected to come from these headers)
+ `include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ input [ INDEX_WIDTH-4:0] col_index_zero;
+ input [ INDEX_WIDTH-4:0] col_index_next;
+ output [ INDEX_WIDTH-4:0] din_addr;
+ output [ 3-1:0] din_bank;
+ output [ 1-1:0] din_ena;
+ output [ 1-1:0] din_reg_ena;
+ output [ INDEX_WIDTH-1:0] din_addr_cnt;
+ output [ INDEX_WIDTH-1:0] din_addr_cnt_last;
+ output [ 3-1:0] din_addr_cnt_lower_prev;
+ output [ INDEX_WIDTH-4:0] din_addr_cnt_upper_prev;
+
+
+ //
+ // Address
+ // (din_addr_reg counts down and wraps from zero to din_addr_last; din_addr_cnt_reg counts up from zero)
+ reg [INDEX_WIDTH-1:0] din_addr_reg;
+ wire [INDEX_WIDTH-1:0] din_addr_zero = {INDEX_WIDTH{1'b0}};
+ reg [INDEX_WIDTH-1:0] din_addr_last;
+ wire [INDEX_WIDTH-1:0] din_addr_prev = (din_addr_reg == din_addr_zero) ? din_addr_last : din_addr_reg - 1'b1;
+
+ reg [INDEX_WIDTH-1:0] din_addr_cnt_reg;
+ wire [INDEX_WIDTH-1:0] din_addr_cnt_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] din_addr_cnt_next = din_addr_cnt_reg + 1'b1;
+ reg [INDEX_WIDTH-1:0] din_addr_cnt_last_reg;
+ wire [ 3-1:0] din_addr_cnt_lower = din_addr_cnt_reg[ 3-1:0];
+ wire [INDEX_WIDTH-4:0] din_addr_cnt_upper = din_addr_cnt_reg[INDEX_WIDTH-1:3];
+ reg [ 3-1:0] din_addr_cnt_lower_dly;
+ reg [INDEX_WIDTH-4:0] din_addr_cnt_upper_dly;
+
+ reg [ 3-1:0] din_bank_reg;
+
+
+ //
+ // Enables
+ //
+ reg din_ena_reg = 1'b0;
+ reg din_reg_ena_reg = 1'b0;
+
+ always @(posedge clk or negedge rst_n)
+ // the enable is set while the next state is one of the TRIG/BUSY states
+ if (!rst_n)
+ din_ena_reg <= 1'b0;
+ else case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ din_ena_reg <= 1'b1;
+ //
+ default:
+ din_ena_reg <= 1'b0;
+ //
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ // the register enable follows the enable one clock later
+ if (!rst_n)
+ din_reg_ena_reg <= 1'b0;
+ else
+ din_reg_ena_reg <= din_ena_reg;
+
+
+ //
+ // Address Mapping
+ // (only the upper bits of the address register are exported as din_addr)
+ assign din_addr = din_addr_reg[INDEX_WIDTH-1:3];
+
+ assign din_addr_cnt = din_addr_cnt_reg;
+ assign din_addr_cnt_last = din_addr_cnt_last_reg;
+ assign din_addr_cnt_lower_prev = din_addr_cnt_lower_dly;
+ assign din_addr_cnt_upper_prev = din_addr_cnt_upper_dly;
+
+ assign din_bank = din_bank_reg;
+
+
+ //
+ // Enable Mapping
+ //
+ assign din_ena = din_ena_reg;
+ assign din_reg_ena = din_reg_ena_reg;
+
+
+ //
+ // Delay
+ // (one-clock delayed copies of the lower and upper counter fields)
+ always @(posedge clk) begin
+ din_addr_cnt_lower_dly <= din_addr_cnt_lower;
+ din_addr_cnt_upper_dly <= din_addr_cnt_upper;
+ end
+
+
+ always @(posedge clk)
+ // address and counter update, keyed on the state about to be entered
+ case (fsm_state_next)
+ // first column: start at column zero and latch the last index
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin
+ din_addr_reg <= {col_index_zero, {3{1'b0}}};
+ din_addr_last <= index_last;
+ din_addr_cnt_reg <= din_addr_cnt_zero;
+ din_addr_cnt_last_reg <= index_last;
+ end
+ // following columns: start at the first word of the next column
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: begin
+ din_addr_reg <= {col_index_next, {3{1'b0}}};
+ din_addr_cnt_reg <= din_addr_cnt_zero;
+ end
+ // busy: step the address backwards (with wrap-around) and the counter forwards
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+ din_addr_reg <= din_addr_prev;
+ din_addr_cnt_reg <= din_addr_cnt_next;
+ end
+ //
+ //default:
+ //
+ endcase
+
+ always @(posedge clk)
+ // bank select; non-blocking assignments, like every other register in this module
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ din_bank_reg <= BANK_XY_T1T2;
+ //
+ default:
+ din_bank_reg <= BANK_XY_ANY;
+ //
+ endcase
+
+endmodule
diff --git a/rtl/modexpng_mmm_dout_addr.v b/rtl/modexpng_mmm_dout_addr.v
new file mode 100644
index 0000000..3749d82
--- /dev/null
+++ b/rtl/modexpng_mmm_dout_addr.v
@@ -0,0 +1,167 @@
+module modexpng_mmm_dout_addr
+(
+ clk, rst_n,
+ //index_last,
+ fsm_state,
+ load_xy_addr,
+ load_addr_zero,
+ load_nn_coeff_addr_done,
+ /*
+
+ col_index_zero, col_index_next,*/
+ x_dout_addr, y_dout_addr,
+ x_dout_ena, y_dout_ena,
+ x_dout_bank, y_dout_bank
+
+);
+
+
+ //
+ // Includes
+ // (the BANK_*, NUM_MULTS and FSM_STATE_* constants used below are expected to come from these headers)
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ //input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+ input [INDEX_WIDTH:0] load_xy_addr; // address
+ input load_addr_zero;
+ input load_nn_coeff_addr_done;
+ //input [ INDEX_WIDTH-4:0] col_index_zero;
+ //input [ INDEX_WIDTH-4:0] col_index_next;
+ output [INDEX_WIDTH-4:0] x_dout_addr;
+ output [INDEX_WIDTH-4:0] y_dout_addr;
+
+ output [NUM_MULTS-1:0] x_dout_ena;
+ output [NUM_MULTS-1:0] y_dout_ena;
+
+ output [3-1:0] x_dout_bank;
+ output [3-1:0] y_dout_bank;
+
+
+ //
+ // Registers
+ // (the *_ena_int registers hold the rotating enable pattern; the *_ena_reg registers are the gated outputs)
+ reg [INDEX_WIDTH-4:0] x_dout_addr_reg; //clog2
+ reg [INDEX_WIDTH-4:0] y_dout_addr_reg; //clog2
+
+ reg [NUM_MULTS-1:0] x_dout_ena_reg = {NUM_MULTS{1'b0}};
+ reg [NUM_MULTS-1:0] y_dout_ena_reg = {NUM_MULTS{1'b0}};
+
+ reg [NUM_MULTS-1:0] x_dout_ena_int;
+ reg [NUM_MULTS-1:0] y_dout_ena_int;
+
+ reg [3-1:0] x_dout_bank_reg;
+ reg [3-1:0] y_dout_bank_reg;
+
+
+ //
+ // Mapping
+ //
+ assign x_dout_addr = x_dout_addr_reg;
+ assign y_dout_addr = y_dout_addr_reg;
+
+ assign x_dout_ena = x_dout_ena_reg;
+ assign y_dout_ena = y_dout_ena_reg;
+
+ assign x_dout_bank = x_dout_bank_reg;
+ assign y_dout_bank = y_dout_bank_reg;
+
+
+ always @(posedge clk)
+ // write address: upper bits of load_xy_addr, or the fixed AUX address once load_nn_coeff_addr_done is set
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ x_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
+ y_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
+ y_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
+ end
+ // all other states drive X (don't care)
+ default: begin
+ x_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
+ y_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
+ end
+ //
+ endcase
+
+ wire [NUM_MULTS-1:0] load_xy_ena_init = {{NUM_MULTS-1{1'b0}}, 1'b1};
+
+ always @(posedge clk)
+ // one-hot enable pattern: restarted at bit 0 when load_addr_zero is set, otherwise rotated left by one
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_2: begin
+ x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1]};
+ y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+ end
+ // NOTE(review): only the X pattern masks the wrapped-around bit with ~load_nn_coeff_addr_done; presumably intentional -- confirm
+ FSM_STATE_LOAD_NN_COEFF_2: begin
+ x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1] & ~load_nn_coeff_addr_done};
+ y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+ end
+ //
+ endcase
+
+
+ always @(posedge clk or negedge rst_n)
+ // output enables: the internal pattern is forwarded only in the *_3 states, otherwise all enables are cleared
+ if (!rst_n) begin
+ x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ end else case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3,
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_ena_reg <= x_dout_ena_int;
+ y_dout_ena_reg <= y_dout_ena_int;
+ end
+ //
+ default: begin
+ x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ end
+ //
+ endcase
+
+
+ always @(posedge clk)
+ // bank select for the X and Y sides
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ x_dout_bank_reg <= BANK_X_T1;
+ y_dout_bank_reg <= BANK_Y_T2;
+ end
+ // both sides switch to the AUX bank once load_nn_coeff_addr_done is set
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_X_N : BANK_XY_AUX;
+ y_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_Y_N_COEFF : BANK_XY_AUX;
+ end
+ //
+ default: begin
+ x_dout_bank_reg <= BANK_XY_ANY;
+ y_dout_bank_reg <= BANK_XY_ANY;
+ end
+ //
+ endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm.vh
new file mode 100644
index 0000000..c237a0b
--- /dev/null
+++ b/rtl/modexpng_mmm_fsm.vh
@@ -0,0 +1,24 @@
+localparam FSM_STATE_WIDTH = 32;
+// State encodings, each FSM_STATE_WIDTH bits wide
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_IDLE = 0;
+// Load states for the T1/T2 banks
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_1 = 1;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_2 = 2;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_3 = 3;
+// Load states for the N / N_COEFF banks
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_1 = 4;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_2 = 5;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_3 = 6;
+// Square multiplication, first column (values 7..10 are not assigned)
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_INIT = 11;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_TRIG = 12;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_BUSY = 13;
+// Square multiplication, following columns
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_INIT = 14;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_TRIG = 15;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_BUSY = 16;
+// Hold-off state after the square multiplication
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_HOLDOFF = 17;
+// NOTE(review): STOP is far outside the contiguous range; presumably a sentinel -- confirm
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_STOP = 999;
+
\ No newline at end of file
diff --git a/rtl/modexpng_mmm_pad.v b/rtl/modexpng_mmm_pad.v
new file mode 100644
index 0000000..a2a21ff
--- /dev/null
+++ b/rtl/modexpng_mmm_pad.v
@@ -0,0 +1,153 @@
+module modexpng_mmm_pad
+(
+ clk, rst_n,
+ fsm_state,
+ load_xy_addr_lsb,
+ pad_x_rd_addr, pad_y_rd_addr,
+ pad_x_rd_ena, pad_y_rd_ena,
+ pad_x_rd_dout, pad_y_rd_dout,
+ load_x_din, load_y_din
+);
+
+
+ //
+ // Includes
+ // (WORD_WIDTH and the FSM_STATE_* constants are expected to come from these headers)
+ `include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+
+ input [INDEX_WIDTH-1:0] load_xy_addr_lsb;
+
+ input [WORD_WIDTH-1:0] load_x_din;
+ input [WORD_WIDTH-1:0] load_y_din;
+
+ input [INDEX_WIDTH-1:0] pad_x_rd_addr;
+ input [INDEX_WIDTH-1:0] pad_y_rd_addr;
+
+ input pad_x_rd_ena;
+ input pad_y_rd_ena;
+
+ output [WORD_WIDTH-1:0] pad_x_rd_dout;
+ output [WORD_WIDTH-1:0] pad_y_rd_dout;
+
+
+ //
+ // Registers
+ // (registered write-side address, enable and data for the two pad memories)
+ reg [INDEX_WIDTH-1:0] pad_x_wr_addr;
+ reg [INDEX_WIDTH-1:0] pad_y_wr_addr;
+ reg pad_x_wr_ena;
+ reg pad_y_wr_ena;
+ reg [ WORD_WIDTH-1:0] pad_x_wr_din;
+ reg [ WORD_WIDTH-1:0] pad_y_wr_din;
+ // X pad memory: port A is written from the registers above, port B is read through the pad_x_rd_* ports
+ bram_1wo_1ro_readfirst_ce #
+ (
+ .MEM_WIDTH (WORD_WIDTH),
+ .MEM_ADDR_BITS (INDEX_WIDTH)
+ )
+ pad_x
+ (
+ .clk (clk),
+
+ .a_addr (pad_x_wr_addr),
+ .a_en (pad_x_wr_ena),
+ .a_wr (pad_x_wr_ena),
+ .a_in (pad_x_wr_din),
+ .a_out (), // unused
+
+ .b_addr (pad_x_rd_addr),
+ .b_en (pad_x_rd_ena),
+ .b_out (pad_x_rd_dout)
+ );
+ // Y pad memory: same wiring as the X pad
+ bram_1wo_1ro_readfirst_ce #
+ (
+ .MEM_WIDTH (WORD_WIDTH),
+ .MEM_ADDR_BITS (INDEX_WIDTH)
+ )
+ pad_y
+ (
+ .clk (clk),
+
+ .a_addr (pad_y_wr_addr),
+ .a_en (pad_y_wr_ena),
+ .a_wr (pad_y_wr_ena),
+ .a_in (pad_y_wr_din),
+ .a_out (), // unused
+
+ .b_addr (pad_y_rd_addr),
+ .b_en (pad_y_rd_ena),
+ .b_out (pad_y_rd_dout)
+ );
+
+
+ always @(posedge clk)
+ // write address: taken from load_xy_addr_lsb in LOAD_T1T2_3, X (don't care) otherwise
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ pad_x_wr_addr <= load_xy_addr_lsb;
+ pad_y_wr_addr <= load_xy_addr_lsb;
+ end
+ //
+ default: begin
+ pad_x_wr_addr <= {INDEX_WIDTH{1'bX}};
+ pad_y_wr_addr <= {INDEX_WIDTH{1'bX}};
+ end
+ //
+ endcase
+
+ always @(posedge clk)
+ // write data; NOTE(review): both branches are identical, so the load data is registered in every state
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ pad_x_wr_din <= load_x_din;
+ pad_y_wr_din <= load_y_din;
+ end
+ //
+ default: begin
+ pad_x_wr_din <= load_x_din;
+ pad_y_wr_din <= load_y_din;
+ end
+ //
+ endcase
+
+
+ always @(posedge clk or negedge rst_n)
+ // write enables: set only by LOAD_T1T2_3, cleared by reset and by every other state
+ if (!rst_n) begin
+ pad_x_wr_ena <= 1'b0;
+ pad_y_wr_ena <= 1'b0;
+ end else case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ pad_x_wr_ena <= 1'b1;
+ pad_y_wr_ena <= 1'b1;
+ end
+ //
+ default: begin
+ pad_x_wr_ena <= 1'b0;
+ pad_y_wr_ena <= 1'b0;
+ end
+ //
+ endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_transporter.v b/rtl/modexpng_mmm_transporter.v
new file mode 100644
index 0000000..a8f309a
--- /dev/null
+++ b/rtl/modexpng_mmm_transporter.v
@@ -0,0 +1,157 @@
+module modexpng_mmm_transporter
+(
+ clk,
+ ena,
+ index_last,
+ fsm_state,
+ fsm_state_next,
+ load_phase,
+ load_xy_addr,
+ load_xy_addr_vld,
+ load_xy_req,
+ load_addr_zero,
+ load_t1t2_addr_done,
+ load_nn_coeff_addr_done
+);
+
+
+ //
+ // Includes
+ //
+ //`include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input ena;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ output load_phase;
+ output [ INDEX_WIDTH:0] load_xy_addr;
+ output load_xy_addr_vld;
+ output load_xy_req;
+ output load_addr_zero;
+ output load_t1t2_addr_done;
+ output load_nn_coeff_addr_done;
+
+
+ //
+ // Load Address Generator
+ //
+ reg load_phase_reg;
+ reg [INDEX_WIDTH:0] load_xy_addr_reg;
+ reg load_xy_addr_vld_reg;
+ reg load_xy_req_reg;
+
+
+ //
+ // Mapping
+ //
+ assign load_phase = load_phase_reg;
+ assign load_xy_addr = load_xy_addr_reg;
+ assign load_xy_addr_vld = load_xy_addr_vld_reg;
+ assign load_xy_req = load_xy_req_reg;
+
+
+ //
+ // Handy Quantities
+ //
+ wire [INDEX_WIDTH:0] load_xy_addr_zero = {{INDEX_WIDTH{1'b0}}, 1'b0};
+ wire [INDEX_WIDTH:0] load_xy_addr_next = load_xy_addr_reg + 1'b1;
+ wire [INDEX_WIDTH:0] load_xy_addr_xxx = {{INDEX_WIDTH{1'bX}}, 1'bX};
+
+
+ //
+ // More Handy Quantities
+ //
+ reg [INDEX_WIDTH:0] load_t1t2_addr_last;
+ reg [INDEX_WIDTH:0] load_nn_coeff_addr_last;
+
+
+ //
+ // Flags
+ //
+ assign load_addr_zero = load_xy_addr_reg == load_xy_addr_zero;
+ assign load_t1t2_addr_done = load_xy_addr_reg == load_t1t2_addr_last;
+ assign load_nn_coeff_addr_done = load_xy_addr_reg == load_nn_coeff_addr_last;
+
+
+ //
+ // Last Index Latch
+ //
+ always @(posedge clk)
+ // while idle and ena is high, capture the final addresses: T1T2 ends at index_last, N/N_COEFF one word later
+ if (ena && (fsm_state == FSM_STATE_IDLE)) begin
+ load_t1t2_addr_last <= {1'b0, index_last};
+ load_nn_coeff_addr_last <= {1'b0, index_last} + 1'b1;
+ end
+
+
+ //
+ // Update Load Phase
+ //
+ always @(posedge clk)
+   // phase tracks the upcoming state: 0 during the T1T2 load, 1 during the N/N_COEFF load, don't care otherwise
+   if ((fsm_state_next == FSM_STATE_LOAD_T1T2_1) ||
+       (fsm_state_next == FSM_STATE_LOAD_T1T2_2) ||
+       (fsm_state_next == FSM_STATE_LOAD_T1T2_3))
+     load_phase_reg <= 1'b0;
+   else if ((fsm_state_next == FSM_STATE_LOAD_NN_COEFF_1) ||
+            (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_2) ||
+            (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_3))
+     load_phase_reg <= 1'b1;
+   else load_phase_reg <= 1'bX;
+
+
+ //
+ // Update Load Address
+ //
+ always @(posedge clk)
+ // entering *_1 from the matching *_3 state advances the address, entering it from anywhere else restarts at zero; *_2 and *_3 hold
+ case (fsm_state_next)
+ FSM_STATE_LOAD_T1T2_1: load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_T1T2_3) ? load_xy_addr_next : load_xy_addr_zero;
+ FSM_STATE_LOAD_T1T2_2,
+ FSM_STATE_LOAD_T1T2_3: load_xy_addr_reg <= load_xy_addr_reg;
+ FSM_STATE_LOAD_NN_COEFF_1: load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_NN_COEFF_3) ? load_xy_addr_next : load_xy_addr_zero;
+ FSM_STATE_LOAD_NN_COEFF_2,
+ FSM_STATE_LOAD_NN_COEFF_3: load_xy_addr_reg <= load_xy_addr_reg;
+ default load_xy_addr_reg <= load_xy_addr_xxx;
+ endcase
+
+
+ //
+ // Update Address Valid Flag
+ //
+ always @(posedge clk)
+   // the address is flagged valid only while the FSM is about to be in one of the two *_1 states
+   if ((fsm_state_next == FSM_STATE_LOAD_T1T2_1) ||
+       (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_1))
+     load_xy_addr_vld_reg <= 1'b1;
+   else
+     load_xy_addr_vld_reg <= 1'b0;
+
+
+ //
+ // Update Load Request Flag
+ //
+ always @(posedge clk)
+   // the data request is raised only while the FSM is about to be in one of the two *_2 states
+   if ((fsm_state_next == FSM_STATE_LOAD_T1T2_2) ||
+       (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_2))
+     load_xy_req_reg <= 1'b1;
+   else
+     load_xy_req_reg <= 1'b0;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_x8_dual.v b/rtl/modexpng_mmm_x8_dual.v
new file mode 100644
index 0000000..99a37fa
--- /dev/null
+++ b/rtl/modexpng_mmm_x8_dual.v
@@ -0,0 +1,550 @@
+module modexpng_mmm_x8_dual
+(
+ clk, rst_n,
+ ena, rdy,
+ mode, transfer,
+ index_last,
+ x_din, y_din, x_dout, y_dout,
+ x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
+ x_din_ena, y_din_ena, x_dout_ena, y_dout_ena, x_din_reg_ena, y_din_reg_ena,
+ x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
+ load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
+ load_x_din, load_y_din
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+
+ input ena;
+ output rdy;
+
+ input mode; // multiply: 0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
+ // load/unload: 0 = load, 1 = unload
+ input transfer; // 0 = multiply, 1 = load/unload
+
+ input [INDEX_WIDTH-1:0] index_last;
+
+ input [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+ input [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+ output [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+ output [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+
+ output [INDEX_WIDTH-4:0] x_din_addr;
+ output [INDEX_WIDTH-4:0] y_din_addr;
+ output [INDEX_WIDTH-4:0] x_dout_addr;
+ output [INDEX_WIDTH-4:0] y_dout_addr;
+
+ output [ 1-1:0] x_din_ena;
+ output [ 1-1:0] y_din_ena;
+ output [NUM_MULTS-1:0] x_dout_ena;
+ output [NUM_MULTS-1:0] y_dout_ena;
+ output [ 1-1:0] x_din_reg_ena;
+ output [ 1-1:0] y_din_reg_ena;
+
+ output [3-1:0] x_din_bank;
+ output [3-1:0] y_din_bank;
+ output [3-1:0] x_dout_bank;
+ output [3-1:0] y_dout_bank;
+
+ output load_phase; // 0 = T1, T2; 1 = N, N_COEFF
+ output [ INDEX_WIDTH:0] load_xy_addr; // address
+ output load_xy_addr_vld; // address valid
+ output load_xy_req; // data request
+
+ input [WORD_WIDTH-1:0] load_x_din; // data input
+ input [WORD_WIDTH-1:0] load_y_din; // data input
+
+
+ //
+ // FSM State and Next States
+ //
+ reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+
+
+ //
+ // FSM Idle Next State
+ //
+ always @*
+ // decode {transfer, mode} into the state the FSM enters when it leaves IDLE
+ case ({transfer, mode})
+ 2'b00,
+ 2'b01: fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
+ 2'b10: fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1;
+ 2'b11: fsm_state_after_idle = FSM_STATE_IDLE; // unload? (currently maps back to IDLE, so no operation is started)
+ endcase
+
+
+ //
+ // Column Counter
+ //
+ wire [ INDEX_WIDTH-4:0] col_index;
+ wire col_index_done;
+ wire [ INDEX_WIDTH-4:0] col_index_zero;
+ wire [ INDEX_WIDTH-4:0] col_index_next;
+ wire [ INDEX_WIDTH-4:0] col_index_prev;
+
+ modexpng_mmm_col_index #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ mmm_col_index
+ (
+ .clk (clk),
+ .index_last (index_last),
+ .fsm_state_next (fsm_state_next),
+ .col_index (col_index),
+ .col_index_done (col_index_done),
+ .col_index_zero (col_index_zero),
+ .col_index_next (col_index_next),
+ .col_index_prev (col_index_prev)
+ );
+
+
+ //
+ // Load Address Generator
+ //
+ wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0];
+ wire load_addr_zero;
+ wire load_t1t2_addr_done;
+ wire load_nn_coeff_addr_done;
+
+ modexpng_mmm_transporter #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ transporter
+ (
+ .clk (clk),
+ .ena (ena),
+ .index_last (index_last),
+ .fsm_state (fsm_state),
+ .fsm_state_next (fsm_state_next),
+ .load_phase (load_phase),
+ .load_xy_addr (load_xy_addr),
+ .load_xy_addr_vld (load_xy_addr_vld),
+ .load_xy_req (load_xy_req),
+ .load_addr_zero (load_addr_zero),
+ .load_t1t2_addr_done (load_t1t2_addr_done),
+ .load_nn_coeff_addr_done (load_nn_coeff_addr_done)
+ );
+
+
+ //
+ // X, Y Address
+ //
+ wire [INDEX_WIDTH-1:0] x_din_addr_cnt;
+ wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last;
+ wire [ 3-1:0] x_din_addr_cnt_lower_prev;
+ wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev;
+
+ modexpng_mmm_din_addr #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ din_addr_x
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .index_last (index_last),
+ .fsm_state_next (fsm_state_next),
+ .col_index_zero (col_index_zero),
+ .col_index_next (col_index_next),
+ .din_addr (x_din_addr),
+ .din_bank (x_din_bank),
+ .din_ena (x_din_ena),
+ .din_reg_ena (x_din_reg_ena),
+ .din_addr_cnt (x_din_addr_cnt),
+ .din_addr_cnt_last (x_din_addr_cnt_last),
+ .din_addr_cnt_lower_prev (x_din_addr_cnt_lower_prev),
+ .din_addr_cnt_upper_prev (x_din_addr_cnt_upper_prev)
+ );
+
+ modexpng_mmm_dout_addr #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ dout_addr_xy
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .fsm_state (fsm_state),
+ .load_xy_addr (load_xy_addr),
+ .load_addr_zero (load_addr_zero),
+ .load_nn_coeff_addr_done (load_nn_coeff_addr_done),
+ .x_dout_addr (x_dout_addr),
+ .y_dout_addr (y_dout_addr),
+ .x_dout_ena (x_dout_ena),
+ .y_dout_ena (y_dout_ena),
+ .x_dout_bank (x_dout_bank),
+ .y_dout_bank (y_dout_bank)
+ );
+
+
+ //
+ // Helper Memories ("Scratchpad")
+ //
+ reg [INDEX_WIDTH-1:0] pad_xy_rd_addr;
+ reg pad_xy_rd_ena = 1'b0;
+ wire [ WORD_WIDTH-1:0] pad_x_rd_dout;
+ wire [ WORD_WIDTH-1:0] pad_y_rd_dout;
+
+ wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1;
+
+ modexpng_mmm_pad pad
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .fsm_state (fsm_state),
+ .load_xy_addr_lsb (load_xy_addr_lsb),
+ .load_x_din (load_x_din),
+ .load_y_din (load_y_din),
+ .pad_x_rd_addr (pad_xy_rd_addr),
+ .pad_y_rd_addr (pad_xy_rd_addr),
+ .pad_x_rd_ena (pad_xy_rd_ena),
+ .pad_y_rd_ena (pad_xy_rd_ena),
+ .pad_x_rd_dout (pad_x_rd_dout),
+ .pad_y_rd_dout (pad_y_rd_dout)
+ );
+
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (!rst_n) begin
+ pad_xy_rd_ena <= 1'b0;
+ end else case (fsm_state_next)
+
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ pad_xy_rd_ena <= 1'b1;
+
+ default:
+ pad_xy_rd_ena <= 1'b0;
+
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG:
+ pad_xy_rd_addr <= pad_xy_rd_addr_zero;
+
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ pad_xy_rd_addr <= pad_xy_rd_addr_next;
+
+ default:
+ pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};
+
+ endcase
+
+
+
+
+ //
+ // Flags
+ //
+
+ wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;
+
+ always @*
+ // when col_index_done is set the square part is finished and the FSM stops (the triangle state is commented out), otherwise the next column is triggered
+ fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;
+
+
+ //
+ // MAC Arrays
+ //
+ reg mac_x_ce = 1'b0;
+ reg mac_x_ce_aux = 1'b0;
+ reg [NUM_MULTS -1:0] mac_x_clr;
+ reg mac_x_clr_aux;
+ reg [NUM_MULTS -2:0] mac_x_casc_a;
+ reg mac_x_casc_a_aux;
+ wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
+ reg [ 1 * WORD_WIDTH -1:0] mac_x_a_aux;
+ //wire [ 1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1];
+ reg [ 1 * WORD_WIDTH -1:0] mac_x_b;
+ wire [NUM_MULTS * MAC_WIDTH -1:0] mac_x_p;
+ wire [ 1 * MAC_WIDTH -1:0] mac_x_p_aux;
+
+ reg mac_y_ce = 1'b0;
+ reg mac_y_ce_aux = 1'b0;
+ reg [NUM_MULTS -1:0] mac_y_clr;
+ reg mac_y_clr_aux;
+ reg [NUM_MULTS -2:0] mac_y_casc_a;
+ reg mac_y_casc_a_aux;
+ wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
+ reg [ 1 * WORD_WIDTH -1:0] mac_y_a_aux;
+ //wire [ 1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1];
+ reg [ 1 * WORD_WIDTH -1:0] mac_y_b;
+ wire [NUM_MULTS * MAC_WIDTH -1:0] mac_y_p;
+ wire [ 1 * MAC_WIDTH -1:0] mac_y_p_aux;
+
+ modexpng_mac_array mac_array_x
+ (
+ .clk (clk),
+ .ce (mac_x_ce),
+ .ce_aux (mac_x_ce_aux),
+ .clr (mac_x_clr),
+ .clr_aux (mac_x_clr_aux),
+ .casc_a (mac_x_casc_a),
+ .casc_a_aux (mac_x_casc_a_aux),
+ .a_in (mac_x_a),
+ .a_in_aux (mac_x_a_aux),
+ .b_in (mac_x_b),
+ .p_out (mac_x_p),
+ .p_out_aux (mac_x_p_aux)
+ );
+
+ modexpng_mac_array mac_array_y
+ (
+ .clk (clk),
+ .ce (mac_y_ce),
+ .ce_aux (mac_y_ce_aux),
+ .clr (mac_y_clr),
+ .clr_aux (mac_y_clr_aux),
+ .casc_a (mac_y_casc_a),
+ .casc_a_aux (mac_y_casc_a_aux),
+ .a_in (mac_y_a),
+ .a_in_aux (mac_y_a_aux),
+ .b_in (mac_y_b),
+ .p_out (mac_y_p),
+ .p_out_aux (mac_y_p_aux)
+ );
+
+ genvar gen_z;
+
+ generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+ begin : gen_xy_din
+ //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+ //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+ //gen_xy_dout
+ assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
+
+ //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+ //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+ end
+ endgenerate
+
+
+ //
+ // MAC Clock Enable Logic
+ //
+ reg mac_xy_ce_adv = 1'b0;
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) mac_xy_ce_adv <= 1'b0;
+ else case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_ce_adv <= 1'b1;
+ default: mac_xy_ce_adv <= 1'b0;
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) {mac_y_ce, mac_x_ce} <= 2'b00;
+ else {mac_y_ce, mac_x_ce} <= {2{mac_xy_ce_adv}};
+
+
+ //
+ // MAC Clear Logic
+ //
+ wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
+ calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev);
+
+ reg [NUM_MULTS-1:0] mac_xy_clr_adv;
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_clr_adv <= calc_mac_x_clear_square_value;
+ default: mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
+ endcase
+
+ always @(posedge clk)
+ //
+ {mac_y_clr, mac_x_clr} <= {2{mac_xy_clr_adv}};
+
+
+ //
+ // MAC Cascade Logic
+ //
+ reg [NUM_MULTS-2:0] mac_xy_casc_a_adv;
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
+ default: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
+ endcase
+
+ always @(posedge clk)
+ //
+ {mac_y_casc_a, mac_x_casc_a} <= {2{mac_xy_casc_a_adv}};
+
+
+
+ //
+ // DOUT (one output word register per multiplier, declared ahead of the mapping that reads them)
+ //
+ reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
+ reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];
+
+
+ //
+ // DOUT Mapping (packs the per-multiplier registers into the flat x_dout / y_dout buses)
+ //
+ generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+ begin : gen_xy_dout
+ assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+ assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+ end
+ endgenerate
+
+
+
+
+ integer int_z;
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3,
+ FSM_STATE_LOAD_NN_COEFF_3:
+ for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
+ x_dout_reg[int_z] <= load_x_din;
+ y_dout_reg[int_z] <= load_y_din;
+ end
+ //
+ default:
+ for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
+ x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+ y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+ end
+ //
+ endcase
+
+
+
+ //
+ // FSM Process
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
+ else fsm_state <= fsm_state_next;
+
+
+ //
+ // FSM Transition Logic
+ //
+ always @* begin
+ //
+ fsm_state_next = FSM_STATE_IDLE;
+ //
+ case (fsm_state)
+ FSM_STATE_IDLE: fsm_state_next = ena ? fsm_state_after_idle : FSM_STATE_IDLE;
+
+ FSM_STATE_LOAD_T1T2_1: fsm_state_next = FSM_STATE_LOAD_T1T2_2 ;
+ FSM_STATE_LOAD_T1T2_2: fsm_state_next = FSM_STATE_LOAD_T1T2_3 ;
+ FSM_STATE_LOAD_T1T2_3: fsm_state_next = load_t1t2_addr_done ? FSM_STATE_LOAD_NN_COEFF_1 : FSM_STATE_LOAD_T1T2_1;
+
+ FSM_STATE_LOAD_NN_COEFF_1: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2 ;
+ FSM_STATE_LOAD_NN_COEFF_2: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3 ;
+ FSM_STATE_LOAD_NN_COEFF_3: fsm_state_next = load_nn_coeff_addr_done ? FSM_STATE_STOP : FSM_STATE_LOAD_NN_COEFF_1;
+
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+
+ /*
+ FSM_STATE_TRIANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_0_BUSY ;
+ FSM_STATE_TRIANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG : FSM_STATE_TRIANGLE_COL_0_BUSY;
+ FSM_STATE_TRIANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_N_BUSY ;
+ FSM_STATE_TRIANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle : FSM_STATE_TRIANGLE_COL_N_BUSY;
+
+ FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_0_BUSY ;
+ FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY;
+ FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_N_BUSY ;
+ FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_rectangle : FSM_STATE_RECTANGLE_COL_N_BUSY;
+ */
+
+ FSM_STATE_STOP: fsm_state_next = FSM_STATE_IDLE ;
+
+ endcase
+ //
+ end
+
+
+ //
+ // Ready Output
+ //
+ reg rdy_reg = 1'b1;
+ assign rdy = rdy_reg;
+
+ always @(posedge clk or negedge rst_n)
+ // rdy is high after reset, drops when ena is seen in IDLE, is raised again in STOP and holds its value in every other state
+ if (rst_n == 1'b0) rdy_reg <= 1'b1;
+ else case (fsm_state)
+ FSM_STATE_IDLE: if (ena) rdy_reg <= 1'b0;
+ FSM_STATE_STOP: rdy_reg <= 1'b1;
+ endcase
+
+ function [ NUM_MULTS-1:0] calc_mac_clear_square; // returns a per-multiplier clear mask (at most one bit set)
+ input [INDEX_WIDTH-4:0] col_index_delayed; // column index to compare against
+ input [ 3-1:0] x_din_addr_cnt_lower_delayed; // lower three bits of the input address counter
+ input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed; // remaining upper bits of the input address counter
+ begin
+ if (x_din_addr_cnt_upper_delayed == col_index_delayed) // upper address bits equal the column index
+ case (x_din_addr_cnt_lower_delayed) // one-hot decode; the 8-bit literals match NUM_MULTS = 8
+ 3'b000: calc_mac_clear_square = 8'b00000001;
+ 3'b001: calc_mac_clear_square = 8'b00000010;
+ 3'b010: calc_mac_clear_square = 8'b00000100;
+ 3'b011: calc_mac_clear_square = 8'b00001000;
+ 3'b100: calc_mac_clear_square = 8'b00010000;
+ 3'b101: calc_mac_clear_square = 8'b00100000;
+ 3'b110: calc_mac_clear_square = 8'b01000000;
+ 3'b111: calc_mac_clear_square = 8'b10000000;
+ endcase
+ else // address lies outside the column: clear nothing
+ calc_mac_clear_square = {NUM_MULTS{1'b0}};
+ end
+ endfunction
+
+
+endmodule
diff --git a/rtl/modexpng_parameters.vh b/rtl/modexpng_parameters.vh
new file mode 100644
index 0000000..f846119
--- /dev/null
+++ b/rtl/modexpng_parameters.vh
@@ -0,0 +1,39 @@
+//localparam WORD_WIDTH = 17;
+//localparam MAC_WIDTH = 47;
+
+//localparam BANK_ADDR_WIDTH = 3; // TODO: Replace everywhere!
+
+localparam [2:0] BANK_FAT_T1T2 = 3'd0;
+localparam [2:0] BANK_FAT_ABL = 3'd1;
+localparam [2:0] BANK_FAT_ABH = 3'd2;
+localparam [2:0] BANK_FAT_Q = 3'd3;
+localparam [2:0] BANK_FAT_Q_EXT = 3'd4;
+localparam [2:0] BANK_FAT_ML = 3'd5;
+localparam [2:0] BANK_FAT_MH = 3'd6;
+localparam [2:0] BANK_FAT_MH_EXT = 3'd7;
+
+localparam [1:0] BANK_SLIM_T1T2 = 2'd0;
+localparam [1:0] BANK_SLIM_N = 2'd1;
+localparam [1:0] BANK_SLIM_N_COEFF = 2'd2;
+localparam [1:0] BANK_SLIM_N_COEFF_EXT = 2'd3;
+
+
+//localparam BANK_Y_T2 = 3'd0;
+//localparam BANK_XY_T1T2 = 3'd0;
+
+//localparam BANK_XY_AB_LSB = 3'd1;
+//localparam BANK_XY_AB_MSB = 3'd2;
+
+//localparam BANK_X_N = 3'd3;
+//localparam BANK_Y_N_COEFF = 3'd3;
+
+//localparam BANK_XY_M = 3'd4;
+
+//localparam BANK_XY_Q_LSB = 3'd5;
+//localparam BANK_XY_Q_MSB = 3'd6;
+
+//localparam BANK_XY_AUX = 3'd7;
+
+//localparam BANK_XY_ANY = 3'bXXX;
+
+//localparam BANK_XY_AUX_ADDR_N_COEFF = 0;
diff --git a/rtl/modexpng_parameters_x8.vh b/rtl/modexpng_parameters_x8.vh
new file mode 100644
index 0000000..8734354
--- /dev/null
+++ b/rtl/modexpng_parameters_x8.vh
@@ -0,0 +1 @@
+localparam NUM_MULTS = 8;
diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v
new file mode 100644
index 0000000..db4774b
--- /dev/null
+++ b/rtl/modexpng_part_recombinator.v
@@ -0,0 +1,623 @@
+module modexpng_part_recombinator
+(
+ clk,
+ rdy,
+ fsm_state_next,
+ index_last,
+ dsp_x_ce_p, dsp_y_ce_p,
+ ena_x, ena_y,
+ dsp_x_p, dsp_y_p,
+ col_index, col_index_last, slim_bram_xy_addr,
+ fat_bram_xy_bank, fat_bram_xy_addr, fat_bram_x_dout, fat_bram_y_dout, fat_bram_xy_dout_valid
+);
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_mmm_fsm.vh"
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+
+
+ input clk;
+ output rdy;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ input [7:0] index_last;
+ input dsp_x_ce_p;
+ input dsp_y_ce_p;
+ input ena_x;
+ input ena_y;
+ input [8*47-1:0] dsp_x_p;
+ input [8*47-1:0] dsp_y_p;
+ input [ 4:0] col_index;
+ input [ 4:0] col_index_last;
+ input [ 7:0] slim_bram_xy_addr;
+
+ output [ 2:0] fat_bram_xy_bank;
+ output [ 7:0] fat_bram_xy_addr;
+ output [ 17:0] fat_bram_x_dout;
+ output [ 17:0] fat_bram_y_dout;
+ output fat_bram_xy_dout_valid;
+
+
+ //
+ // Latches
+ //
+ reg [1*47-1:0] dsp_x_p_latch[0:7];
+ reg [1*47-1:0] dsp_y_p_latch[0:7];
+
+
+ //
+ // Mapping
+ //
+ wire [46:0] dsp_x_p_split[0:7];
+ wire [46:0] dsp_y_p_split[0:7];
+
+ genvar z;
+ generate for (z=0; z<NUM_MULTS; z=z+1)
+ begin : gen_dsp_xy_p_split
+ assign dsp_x_p_split[z] = dsp_x_p[47*z+:47];
+ assign dsp_y_p_split[z] = dsp_y_p[47*z+:47];
+ end
+ endgenerate
+
+
+ //
+ // Delays
+ //
+ reg dsp_y_ce_p_dly1 = 1'b0;
+ reg dsp_x_ce_p_dly1 = 1'b0;
+
+ always @(posedge clk) begin
+ //
+ {dsp_y_ce_p_dly1, dsp_x_ce_p_dly1} <= {dsp_y_ce_p, dsp_x_ce_p};
+ //
+ end
+
+
+ //
+ // Registers
+ //
+
+ // valid
+ reg x_valid_lsb = 1'b0;
+ reg y_valid_lsb = 1'b0;
+ reg x_valid_msb = 1'b0;
+ reg y_valid_msb = 1'b0;
+
+ // bitmap
+ reg [7:0] x_bitmap_lsb = {8{1'b0}};
+ reg [7:0] y_bitmap_lsb = {8{1'b0}};
+ reg [7:0] x_bitmap_msb = {8{1'b0}};
+ reg [7:0] y_bitmap_msb = {8{1'b0}};
+
+ // index
+ reg [2:0] x_index_lsb = 3'dX;
+ reg [2:0] y_index_lsb = 3'dX;
+
+ // purge
+ reg x_purge_lsb = 1'b0;
+ reg y_purge_lsb = 1'b0;
+ reg x_purge_msb = 1'b0;
+ reg y_purge_msb = 1'b0;
+
+ // valid - latch
+ reg x_valid_latch_lsb = 1'b0;
+ reg y_valid_latch_lsb = 1'b0;
+
+ // bitmap - latch
+ reg [7:0] x_bitmap_latch_lsb = {8{1'b0}};
+ reg [7:0] y_bitmap_latch_lsb = {8{1'b0}};
+ reg [7:0] x_bitmap_latch_msb = {8{1'b0}};
+ reg [7:0] y_bitmap_latch_msb = {8{1'b0}};
+
+ // index - latch
+ reg [2:0] x_index_latch_lsb = 3'dX;
+ reg [2:0] y_index_latch_lsb = 3'dX;
+
+ // purge - latch
+ reg x_purge_latch_lsb = 1'b0;
+ reg y_purge_latch_lsb = 1'b0;
+ reg x_purge_latch_msb = 1'b0;
+ reg y_purge_latch_msb = 1'b0;
+
+ //
+ reg xy_valid_lsb_adv[1:6];
+ reg xy_valid_msb_adv[1:6];
+ reg [7:0] xy_bitmap_lsb_adv[1:6];
+ reg [7:0] xy_bitmap_msb_adv[1:6];
+ reg [2:0] xy_index_lsb_adv[1:6];
+ reg [2:0] xy_index_msb_adv[1:6];
+ reg xy_purge_lsb_adv[1:6];
+ reg xy_purge_msb_adv[1:6];
+
+
+ integer i;
+ initial for (i=1; i<=6; i=i+1) begin // cover every stage [1:6]; stage [6] feeds stage [5] on the very first clock
+ xy_valid_lsb_adv[i] = 1'b0;
+ xy_valid_msb_adv[i] = 1'b0;
+ xy_bitmap_lsb_adv[i] = {8{1'b0}};
+ xy_bitmap_msb_adv[i] = {8{1'b0}};
+ xy_index_lsb_adv[i] = 3'dX; // index is a don't care while the matching valid flag is low
+ xy_index_msb_adv[i] = 3'dX;
+ xy_purge_lsb_adv[i] = 1'b0;
+ xy_purge_msb_adv[i] = 1'b0;
+ end
+
+ function [0:0] calc_square_valid_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ calc_square_valid_lsb = 1'b1;
+ else
+ calc_square_valid_lsb = 1'b0;
+ //
+ end
+ endfunction
+
+ function [7:0] calc_square_bitmap_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ //
+ case (slim_bram_xy_addr_value[2:0])
+ 3'b000: calc_square_bitmap_lsb = 8'b00000001;
+ 3'b001: calc_square_bitmap_lsb = 8'b00000010;
+ 3'b010: calc_square_bitmap_lsb = 8'b00000100;
+ 3'b011: calc_square_bitmap_lsb = 8'b00001000;
+ 3'b100: calc_square_bitmap_lsb = 8'b00010000;
+ 3'b101: calc_square_bitmap_lsb = 8'b00100000;
+ 3'b110: calc_square_bitmap_lsb = 8'b01000000;
+ 3'b111: calc_square_bitmap_lsb = 8'b10000000;
+ endcase
+ //
+ else
+ calc_square_bitmap_lsb = {8{1'b0}};
+ //
+ end
+ endfunction
+
+ function [2:0] calc_square_index_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ //
+ case (slim_bram_xy_addr_value[2:0])
+ 3'b000: calc_square_index_lsb = 3'd0;
+ 3'b001: calc_square_index_lsb = 3'd1;
+ 3'b010: calc_square_index_lsb = 3'd2;
+ 3'b011: calc_square_index_lsb = 3'd3;
+ 3'b100: calc_square_index_lsb = 3'd4;
+ 3'b101: calc_square_index_lsb = 3'd5;
+ 3'b110: calc_square_index_lsb = 3'd6;
+ 3'b111: calc_square_index_lsb = 3'd7;
+ endcase
+ //
+ else
+ calc_square_index_lsb = 3'dX;
+ //
+ end
+ endfunction
+
+ function calc_square_purge_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ calc_square_purge_lsb = slim_bram_xy_addr_value[7:3] == col_index_last_value;
+ else
+ calc_square_purge_lsb = 1'b0;
+ //
+ end
+ endfunction
+
+ function calc_square_valid_msb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ input [7:0] index_last_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value == index_last_value)
+ calc_square_valid_msb = 1'b1;
+ else
+ calc_square_valid_msb = 1'b0;
+ //
+ end
+ endfunction
+
+ function [7:0] calc_square_bitmap_msb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ input [7:0] index_last_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value == index_last_value) begin
+ calc_square_bitmap_msb[7] = col_index_value != col_index_last_value;
+ calc_square_bitmap_msb[6:0] = 7'b1111111;
+ end else
+ calc_square_bitmap_msb[7:0] = 8'b00000000;
+ //
+ end
+ endfunction
+
+ function calc_square_purge_msb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ input [7:0] index_last_value;
+ begin
+ //
+ if (slim_bram_xy_addr_value == index_last_value)
+ calc_square_purge_msb = col_index_value == col_index_last_value;
+ else
+ calc_square_purge_msb = 1'b0;
+ //
+ end
+ endfunction
+
+
+ reg recomb_lsb_ce = 1'b0;
+ reg [ 2:0] recomb_lsb_ce_purge = 3'b000;
+ wire recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_purge[0];
+ reg recomb_lsb_clr;
+ reg recomb_lsb_vld = 1'b0;
+
+ reg [46:0] recomb_lsb_din;
+ wire [15:0] recomb_lsb_dout;
+
+ reg recomb_msb_ce = 1'b0;
+ reg [ 1:0] recomb_msb_ce_purge = 2'b00;
+ wire recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0];
+ reg recomb_msb_clr;
+ reg recomb_msb_vld = 1'b0;
+
+ always @(posedge clk)
+ //
+ {recomb_msb_vld, recomb_lsb_vld} <= {recomb_msb_ce_combined, recomb_lsb_ce_combined};
+
+ reg [46:0] recomb_msb_din;
+ wire [15:0] recomb_msb_dout;
+
+ modexpng_recombinator_block recomb_x_lsb
+ (
+ .clk (clk),
+ .ce (recomb_lsb_ce_combined),
+ .clr (recomb_lsb_clr),
+ .din (recomb_lsb_din),
+ .dout (recomb_lsb_dout)
+ );
+
+ modexpng_recombinator_block recomb_x_msb
+ (
+ .clk (clk),
+ .ce (recomb_msb_ce_combined),
+ .clr (recomb_msb_clr),
+ .din (recomb_msb_din),
+ .dout (recomb_msb_dout)
+ );
+
+ always @(posedge clk) begin
+ // clock enables follow the latched LSB valid flag and bit 0 of the latched MSB bitmap by one cycle
+ recomb_lsb_ce <= x_valid_latch_lsb;
+ recomb_msb_ce <= x_bitmap_latch_msb[0];
+ // LSB purge: reload the 3-bit shift register with ones, otherwise shift a zero in from the top
+ if (x_purge_latch_lsb)
+ recomb_lsb_ce_purge <= 3'b111;
+ else
+ recomb_lsb_ce_purge <= {1'b0, recomb_lsb_ce_purge[2:1]};
+ // MSB purge: must be non-blocking, recomb_msb_ce_combined is sampled by other clocked blocks on the same edge
+ if (x_purge_latch_msb && x_bitmap_latch_msb[0] && !x_bitmap_latch_msb[1])
+ recomb_msb_ce_purge <= 2'b11;
+ else
+ recomb_msb_ce_purge <= {1'b0, recomb_msb_ce_purge[1]};
+ //
+ end
+
+
+ always @(posedge clk)
+ //
+ if (ena_x & ena_y) begin
+ recomb_lsb_clr <= 1'b1;
+ recomb_msb_clr <= 1'b1;
+ end else begin
+ if (recomb_lsb_ce) recomb_lsb_clr <= 1'b0;
+ if (recomb_msb_ce) recomb_msb_clr <= 1'b0;
+ end
+
+ always @(posedge clk)
+ //
+ if (x_valid_latch_lsb)
+ recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb];
+ else
+ recomb_lsb_din <= {47{1'b0}};
+
+ always @(posedge clk)
+ //
+ if (x_bitmap_latch_msb[0])
+ recomb_msb_din <= dsp_x_p_latch[0];
+ else
+ recomb_msb_din <= {47{1'b0}};
+
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+ //
+ xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr);
+ xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr);
+ xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr);
+ xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
+ //
+ xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+ xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, index_last);
+ xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+ //
+ end
+ //
+ default: begin
+ //
+ xy_valid_lsb_adv [6] <= 1'b0;
+ xy_bitmap_lsb_adv[6] <= {8{1'b0}};
+ xy_index_lsb_adv [6] <= 3'dX;
+ xy_purge_lsb_adv [6] <= 1'b0;
+ //
+ xy_valid_msb_adv [6] <= 1'b0;
+ xy_bitmap_msb_adv[6] <= {8{1'b0}};
+ xy_purge_msb_adv [6] <= 1'b0;
+ //
+ end
+ //
+ endcase
+
+
+ always @(posedge clk) begin
+ //
+ {y_valid_lsb, x_valid_lsb} <= {2{xy_valid_lsb_adv [1]}};
+ {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}};
+ {y_index_lsb, x_index_lsb} <= {2{xy_index_lsb_adv [1]}};
+ {y_purge_lsb, x_purge_lsb} <= {2{xy_purge_lsb_adv [1]}};
+ //
+ {y_valid_latch_lsb, x_valid_latch_lsb} <= {y_valid_lsb, x_valid_lsb};
+ {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb};
+ {y_index_latch_lsb, x_index_latch_lsb} <= {y_index_lsb, x_index_lsb};
+ {y_purge_latch_lsb, x_purge_latch_lsb} <= {y_purge_lsb, x_purge_lsb};
+ //
+ {y_valid_msb, x_valid_msb} <= {2{xy_valid_msb_adv[1]}};
+ {y_bitmap_msb, x_bitmap_msb} <= {2{xy_bitmap_msb_adv[1]}};
+ {y_purge_msb, x_purge_msb} <= {2{xy_purge_msb_adv[1]}};
+ //
+ if (x_valid_msb) begin
+ x_bitmap_latch_msb <= x_bitmap_msb;
+ x_purge_latch_msb <= x_purge_msb;
+ end else begin
+ x_bitmap_latch_msb <= {1'b0, x_bitmap_latch_msb[7:1]};
+ end
+ //
+ //
+ for (i=1; i<6; i=i+1) begin
+ xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
+ xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
+ xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
+ xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
+ //
+ xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
+ xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
+ xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
+ end
+ //
+ end
+
+ always @(posedge clk)
+ //
+ if (x_bitmap_latch_msb[1]) // only shift 7 times
+ //
+ for (i=0; i<8; i=i+1)
+ if (i < 7)
+ dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
+ else
+ dsp_x_p_latch[i] <= {47{1'bX}};
+ //
+ else if (dsp_x_ce_p_dly1)
+ //
+ for (i=0; i<8; i=i+1)
+ //
+ if (x_bitmap_lsb[i])
+ dsp_x_p_latch[i] <= dsp_x_p_split[i];
+ else if (x_valid_msb && x_bitmap_msb[i])
+ dsp_x_p_latch[i] <= dsp_x_p_split[i];
+
+ reg recomb_x_lsb_dout_valid = 1'b0;
+ reg recomb_x_msb_dout_valid = 1'b0;
+
+ always @(posedge clk) begin
+ recomb_x_lsb_dout_valid <= recomb_lsb_ce_combined;
+ recomb_x_msb_dout_valid <= recomb_msb_ce_combined;
+ end
+
+
+
+ reg [ 2:0] fat_bram_xy_bank_reg;
+ reg [ 7:0] fat_bram_xy_addr_reg;
+ reg [ 7:0] fat_bram_xy_cnt_lsb;
+ reg [ 7:0] fat_bram_xy_cnt_msb;
+ reg [17:0] fat_bram_x_dout_reg;
+ reg [17:0] fat_bram_y_dout_reg;
+ reg fat_bram_xy_dout_valid_reg = 1'b0;
+
+ reg [15:0] recomb_msb_dout_carry_0;
+ reg [15:0] recomb_msb_dout_carry_1;
+
+ reg [15:0] recomb_msb_dout_delay_0;
+ reg [15:0] recomb_msb_dout_delay_1;
+ reg [15:0] recomb_msb_dout_delay_2;
+
+ reg [ 7:0] recomb_msb_cnt_delay_0 = 8'd0;
+ reg [ 7:0] recomb_msb_cnt_delay_1 = 8'd0;
+ reg [ 7:0] recomb_msb_cnt_delay_2 = 8'd0;
+
+ assign fat_bram_xy_bank = fat_bram_xy_bank_reg;
+ assign fat_bram_xy_addr = fat_bram_xy_addr_reg;
+ assign fat_bram_x_dout = fat_bram_x_dout_reg;
+ assign fat_bram_y_dout = fat_bram_y_dout_reg;
+ assign fat_bram_xy_dout_valid = fat_bram_xy_dout_valid_reg;
+
+ reg rdy_reg = 1'b1;
+ reg rdy_adv = 1'b1;
+
+ assign rdy = rdy_reg;
+
+
+ always @(posedge clk)
+ //
+ if (ena_x & ena_y)
+ rdy_reg <= 1'b0;
+ else
+ rdy_reg <= rdy_adv;
+
+ always @(posedge clk)
+ //
+ if (ena_x & ena_y) begin
+ rdy_adv <= 1'b0;
+ fat_bram_xy_cnt_lsb <= 8'd0;
+ fat_bram_xy_cnt_msb <= 8'd0;
+ end else begin
+ //
+ case ({recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid})
+ //
+ 2'b00: begin
+ //
+ if (recomb_msb_cnt_delay_2 > 8'd0) begin
+ //
+ rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0;
+ //
+ recomb_msb_dout_delay_0 <= {18{1'bX}};
+ recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+ recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+ //
+ recomb_msb_cnt_delay_0 <= 8'd0;
+ recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+ recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+ //
+ fat_bram_xy_bank_reg <= BANK_FAT_ABH;
+ fat_bram_xy_addr_reg <= recomb_msb_cnt_delay_2;
+ fat_bram_x_dout_reg <= recomb_msb_dout_delay_2;
+// fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ end else begin
+ //
+ fat_bram_xy_bank_reg <= 3'bXXX;
+ fat_bram_xy_addr_reg <= 8'hXX;
+ fat_bram_x_dout_reg <= {18{1'bX}};
+ fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b0;
+ //
+ end
+ //
+ end
+ //
+ 2'b01: begin
+ //
+ fat_bram_xy_bank_reg <= BANK_FAT_ABL;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb;
+ fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout};
+// fat_bram_y_dout_reg
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+ //
+ end
+ //
+ 2'b10: begin
+ //
+ if (fat_bram_xy_cnt_msb < 8'd2) begin
+ //
+ recomb_msb_dout_carry_0 <= recomb_msb_dout;
+ recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+ //
+ fat_bram_xy_bank_reg <= 3'bXXX;
+ fat_bram_xy_addr_reg <= 8'hXX;
+ fat_bram_x_dout_reg <= {18{1'bX}};
+ // fat_bram_y_dout_reg
+ fat_bram_xy_dout_valid_reg <= 1'b0;
+ //
+ end else begin
+ //
+ fat_bram_xy_bank_reg <= BANK_FAT_ABH;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_msb;
+ fat_bram_x_dout_reg <= {2'b00, recomb_msb_dout};
+ // fat_bram_y_dout_reg
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ end
+ //
+ fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+ //
+ end
+ //
+ 2'b11: begin
+ //
+ if (fat_bram_xy_cnt_lsb == index_last) begin
+ //
+ fat_bram_xy_bank_reg <= BANK_FAT_ABL;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb;
+ fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout};
+// fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ fat_bram_xy_cnt_lsb <= 8'd0;
+ //
+ end else begin
+ //
+ fat_bram_xy_bank_reg <= BANK_FAT_ABH;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb;
+ fat_bram_x_dout_reg <= {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}};
+// fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+ //
+ recomb_msb_dout_carry_0 <= {16{1'bX}};
+ recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+ //
+ end
+ //
+ recomb_msb_dout_delay_0 <= recomb_msb_dout;
+ recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+ recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+ //
+ recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb;
+ recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+ recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+ //
+ fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+ //
+ end
+ //
+ endcase
+ //
+ end
+
+
+
+
+endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
new file mode 100644
index 0000000..efe0ac5
--- /dev/null
+++ b/rtl/modexpng_recombinator_block.v
@@ -0,0 +1,35 @@
+module modexpng_recombinator_block
+(
+ clk,
+ ce, clr,
+ din, dout
+);
+ // Folds a stream of 47-bit words into 16-bit output words: each input is split into three fields that are added one enabled clock apart.
+ input clk;
+ input ce; // registers only update while ce is high
+ input clr; // when high, y and x are loaded from din alone (accumulated history is ignored)
+ input [46:0] din;
+ output [15:0] dout;
+ // z, y, x form a three-stage chain: z feeds y and y feeds x on the next enabled clock
+ reg [14:0] z;
+ reg [16:0] y;
+ reg [17:0] x;
+ //reg [15:0] w;
+
+ //assign dout = w;
+ assign dout = x[15:0]; // low 16 bits of x; bits 17:16 are carried into the next sum
+ // field split of the input word
+ wire [14:0] din_z = din[46:32]; // TODO: maybe determine more precise bound here
+ wire [15:0] din_y = din[31:16];
+ wire [15:0] din_x = din[15: 0];
+
+ always @(posedge clk)
+ // all right-hand sides use the values registered on the previous enabled clock
+ if (ce) begin
+ z <= din_z;
+ y <= clr ? {1'b0, din_y} : {1'b0, din_y} + {2'b00, z};
+ x <= clr ? {2'b00, din_x} : {2'b00, din_x} + {1'b0, y} + {{16{1'b0}}, x[17:16]};
+ //w <= clr ? {16{1'bX}} : x[15:0];
+ end
+
+endmodule
More information about the Commits
mailing list