[Cryptech-Commits] [core/math/modexpa7] 03/06: Added systolic modular multiplier w/ testbench. * works in simulator * may have to change how internal operand buffer is pre-loaded (shift register instead of wide mux?) * code needs some cleanup

git at cryptech.is git at cryptech.is
Tue Jun 27 10:52:09 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

commit 0b873507ad47e3046935dfc8b3f91d36bc21c7b0
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Jun 27 13:44:08 2017 +0300

    Added systolic modular multiplier w/ testbench.
     * works in simulator
     * may have to change how internal operand buffer is pre-loaded
       (shift register instead of wide mux?)
     * code needs some cleanup
---
 src/rtl/modexpa7_systolic_multiplier.v | 876 +++++++++++++++++++++++++++++++++
 src/rtl/util/bram_1rw_1ro_readfirst.v  |  88 ++++
 src/rtl/util/bram_1rw_readfirst.v      |  75 +++
 src/tb/tb_systolic_multiplier.v        | 545 ++++++++++++++++++++
 4 files changed, 1584 insertions(+)

diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
new file mode 100644
index 0000000..0849b61
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -0,0 +1,876 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier #
+	(
+			//
+			// This sets the address widths of memory buffers. Internal data
+			// width is 32 bits, so for e.g. 1024-bit operands buffers must store
+			// 1024 / 32 = 32 words, and these need 5-bit address bus, because
+			// 2 ** 5 = 32.
+			//
+		parameter	OPERAND_ADDR_WIDTH		= 5,
+		
+			// Log2 of the systolic array length: 2 ** SYSTOLIC_ARRAY_POWER processing
+			// elements are generated per multiplier row, and the systolic cycle
+			// counter is OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER bits wide.
+		parameter	SYSTOLIC_ARRAY_POWER		= 3
+	)
+	(
+		input											clk,	// all registers update on the rising edge
+		input											rst_n,	// active-low asynchronous reset of the FSM state and rdy
+
+		input											ena,	// a rising edge starts an operation while the FSM is idle
+		output										rdy,	// high when idle; goes low after the start trigger, high again in STOP
+
+		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,	// word address paired with a_bram_out
+		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,	// word address paired with b_bram_out
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,	// word address paired with n_bram_out
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,	// word address paired with n_coeff_bram_out
+		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,	// word address for the result write
+
+		input		[                32-1:0]	a_bram_out,	// 32-bit word read from the A operand memory
+		input		[                32-1:0]	b_bram_out,	// 32-bit word read from the B operand memory
+		input		[                32-1:0]	n_bram_out,	// 32-bit word read from the N memory
+		input		[                32-1:0]	n_coeff_bram_out,	// 32-bit word read from the N_COEFF memory
+
+		output	[                32-1:0]	r_bram_in,	// 32-bit result word to be written
+		output										r_bram_wr,	// write strobe for the result word
+
+		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words	// index of the last operand word (addresses run 0..n_num_words); keep stable while rdy is low
+	);
+	
+	
+		//
+		// Constants
+		//
+	localparam	SYSTOLIC_CNTR_WIDTH		= OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
+	localparam	SYSTOLIC_ARRAY_LENGTH	= 2 ** SYSTOLIC_ARRAY_POWER;
+	localparam	SYSTOLIC_NUM_CYCLES		= 2 ** SYSTOLIC_CNTR_WIDTH;
+
+	localparam	SYSTOLIC_PE_LATENCY		= 4;
+	
+
+		//
+		// FSM Declaration
+		//
+	localparam	[ 3: 0]	FSM_STATE_IDLE					= 4'd0;
+	localparam	[ 3: 0]	FSM_STATE_INIT_ZERO_ADDR	= 4'd1;
+	localparam	[ 3: 0]	FSM_STATE_INIT_NEXT_ADDR	= 4'd2;
+	localparam	[ 3: 0]	FSM_STATE_INIT_LAST_ADDR	= 4'd3;
+	localparam	[ 3: 0]	FSM_STATE_PIPE_CRUNCH		= 4'd4;
+	localparam	[ 3: 0]	FSM_STATE_PIPE_RELOAD		= 4'd5;
+	localparam	[ 3: 0]	FSM_STATE_SAVE_ZERO_ADDR	= 4'd6;
+	localparam	[ 3: 0]	FSM_STATE_SAVE_NEXT_ADDR	= 4'd7;
+	localparam	[ 3: 0]	FSM_STATE_SAVE_LAST_ADDR	= 4'd8;
+	localparam	[ 3: 0]	FSM_STATE_STOP					= 4'd9;
+	
+	reg	[ 3: 0]	fsm_state = FSM_STATE_IDLE;
+	reg	[ 3: 0]	fsm_next_state;
+
+	
+		//
+		// Enable Trigger: one-clock pulse in the cycle where ena is high and its registered copy is still low
+	reg	ena_dly = 1'b0;
+	wire	ena_trig;
+	always @(posedge clk) ena_dly <= ena;
+	assign ena_trig = ena & ~ena_dly;
+
+		
+		//
+		// Parameters Latch
+		//
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;
+
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR)
+			n_num_words_latch <= n_num_words;
+
+
+		//
+		// Addresses
+		//
+	localparam	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
+	wire			[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last = n_num_words_latch;
+	
+	
+		//
+		// BRAM Addresses
+		//
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr_reg;
+
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr = s_bram_addr_reg;
+	
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_dly;
+	
+	wire	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_next       = b_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_next       = a_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_next       = n_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_next       = s_bram_addr + 1'b1;
+	
+	wire										b_bram_addr_done = 
+		(b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+
+	wire										s_bram_addr_done = 
+		(s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+	
+	assign b_bram_addr = b_bram_addr_reg;
+	assign a_bram_addr = a_bram_addr_reg;
+	assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
+	assign n_bram_addr = n_bram_addr_reg;
+	assign r_bram_addr = r_bram_addr_reg;
+
+	always @(posedge clk) b_bram_addr_dly <= b_bram_addr;
+	always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr;
+	always @(posedge clk) n_bram_addr_dly <= n_bram_addr;
+	always @(posedge clk) s_bram_addr_dly <= s_bram_addr;
+		
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_INIT_ZERO_ADDR:	b_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_INIT_NEXT_ADDR:	b_bram_addr_reg <= b_bram_addr_next;
+		endcase
+
+	always @(posedge clk)
+		case (fsm_next_state)
+			FSM_STATE_SAVE_ZERO_ADDR:	s_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_SAVE_NEXT_ADDR:	s_bram_addr_reg <= s_bram_addr_next;
+		endcase
+
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_INIT_LAST_ADDR:	a_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_PIPE_RELOAD:		a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
+		endcase
+
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_INIT_ZERO_ADDR:	n_coeff_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_INIT_NEXT_ADDR:	n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
+		endcase
+
+
+		
+		
+		// Latency Compensation: one-hot shift registers that count SYSTOLIC_PE_LATENCY clocks after
+		// each (re)start of PIPE_CRUNCH. TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
+		//
+	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};	// only bit 0 set
+		// pe_latency_ab_msb is declared here but is clocked further down, in the "Latency 2" section.
+	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb;
+	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb;
+	
+	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb_next =
+		{pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};	// rotate left by one bit
+
+	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb_next =
+		{pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};	// rotate left by one bit
+		// "done" is the top bit: it is set once the single 1 has been rotated SYSTOLIC_PE_LATENCY times.
+	wire										pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
+	wire										pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];
+
+	always @(posedge clk)
+		// Only updated on clock edges where the FSM enters or stays in PIPE_CRUNCH.
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// The current state tells how PIPE_CRUNCH is being reached.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR,
+				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_lsb <= pe_latency_start;	// new pass: restart the count
+				FSM_STATE_PIPE_CRUNCH:		pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?	// hold once the top bit is set,
+														pe_latency_ab_lsb : pe_latency_ab_lsb_next;	// otherwise keep rotating
+			endcase
+
+		//
+		// Buffers
+		//
+	integer i, j;
+
+	reg	[31: 0]	b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT_ZERO_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						b_buf[i][j] <= 32'd0;
+
+			FSM_STATE_INIT_NEXT_ADDR,
+			FSM_STATE_INIT_LAST_ADDR:
+				b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
+		endcase
+
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT_ZERO_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						n_coeff_buf[i][j] <= 32'd0;
+
+			FSM_STATE_INIT_NEXT_ADDR,
+			FSM_STATE_INIT_LAST_ADDR:
+				n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
+		endcase
+
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT_ZERO_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						n_buf[i][j] <= 32'd0;
+
+			FSM_STATE_INIT_NEXT_ADDR,
+			FSM_STATE_INIT_LAST_ADDR:
+				n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
+		endcase
+
+		
+	
+		
+	
+	
+		//
+		// Cycle Counters
+		//
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab;
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q;
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn;
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s;
+	
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];
+	
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+		// All limits derive from the latched word count (as syst_cnt_last does), so they cannot move if the n_num_words input changes mid-operation.
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_half = {1'b0, n_num_words_latch};	// = index of the last operand word
+		// mult_cnt_last = 2 * (index of the last operand word) + 1
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_last = {n_num_words_latch, 1'b1};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+	wire										mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
+	wire										mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0;
+	wire										mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
+	wire										mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0;
+	
+	wire										syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;
+
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab_next = mult_cnt_ab + 1'b1;
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q_next = mult_cnt_q + 1'b1;
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn_next = mult_cnt_qn + 1'b1;
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s_next = mult_cnt_s + 1'b1;
+	
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
+
+	
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			//
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR,
+				FSM_STATE_PIPE_RELOAD:		syst_cnt <= syst_cnt_zero;
+				FSM_STATE_PIPE_CRUNCH:		syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
+			endcase
+
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			//
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_ab <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
+			endcase
+
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			//
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_q <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
+			endcase
+
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			//
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_qn <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
+			endcase
+		
+	always @(posedge clk)
+		// mult_cnt_s: pass counter of the last pipeline stage; it starts advancing only after mult_cnt_qn has left zero.
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// Updated only on the edges that enter PIPE_CRUNCH from INIT_LAST_ADDR (clear) or PIPE_RELOAD (advance).
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_s <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_s : mult_cnt_s_next;	// saturate on its own value (was mult_cnt_qn: copy-paste slip)
+			endcase
+		
+		
+	always @(posedge clk) begin
+		syst_cnt_dly[0] <= syst_cnt;
+		for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
+			syst_cnt_dly[i] <= syst_cnt_dly[i-1];
+	end
+	
+		//
+		// Systolic Array
+		//
+	wire	[31: 0]	mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+	wire	[31: 0]	mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	wire	[31: 0]	mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+	wire	[31: 0]	mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	wire	[31: 0]	mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+	wire	[31: 0]	mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+	
+	wire	[31: 0]	mul_ab_a	= (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
+	reg	[31: 0]	mul_q_a_int;
+	reg	[31: 0]	mul_q_a;
+	reg	[31: 0]	mul_qn_a_int;
+	reg	[31: 0]	mul_qn_a;
+	
+	reg	[31: 0]	t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	reg	[31: 0]	t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	reg	[31: 0]	t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	genvar syst;
+	generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
+		begin : gen_mul
+	
+			/*modexpa7_*/pe_mul mul_ab_inst
+			(
+				.clk		(clk),
+				.a			(mul_ab_a),
+				.b			(b_buf[syst_cnt][syst]),
+				.t			(t_ab[syst_cnt][syst]),
+				.c_in		(c_ab_in[syst_cnt][syst]),
+				
+				.p			(mul_ab_p[syst]),
+				.c_out	(mul_ab_c_out[syst])
+			);
+			
+			/*modexpa7_*/pe_mul mul_q_inst
+			(
+				.clk		(clk),
+				.a			(mul_q_a),
+				.b			(n_coeff_buf[syst_cnt][syst]),
+				.t			(t_q[syst_cnt][syst]),
+				.c_in		(c_q_in[syst_cnt][syst]),
+				
+				.p			(mul_q_p[syst]),
+				.c_out	(mul_q_c_out[syst])
+			);
+			
+
+			/*modexpa7_*/pe_mul mul_qn_inst
+			(
+				.clk		(clk),
+				.a			(mul_qn_a),
+				.b			(n_buf[syst_cnt][syst]),
+				.t			(t_qn[syst_cnt][syst]),
+				.c_in		(c_qn_in[syst_cnt][syst]),
+				
+				.p			(mul_qn_p[syst]),
+				.c_out	(mul_qn_c_out[syst])
+			);
+			
+		end
+	endgenerate
+	
+		//
+		// c_ab
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_ab_in[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
+		endcase
+	
+		//
+		// c_q
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_q_in[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
+		endcase
+
+		//
+		// c_qn
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_qn_in[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
+		endcase
+		
+		//
+		// t_ab
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+		
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_ab[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done) begin
+					if (syst_cnt_latency > syst_cnt_zero)
+						t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
+				end
+				
+		endcase
+
+
+		//
+		// t_q
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+		
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_q[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
+					if (syst_cnt_latency > syst_cnt_zero)
+						t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
+				end
+				
+		endcase
+
+
+		//
+		// t_qn
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+		
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_qn[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
+					if (syst_cnt_latency > syst_cnt_zero)
+						t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
+				end
+				
+		endcase
+
+		//
+		// Latency 2
+		//
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			//
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR,
+				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_msb <= pe_latency_start;
+				FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)
+					pe_latency_ab_msb <= pe_latency_ab_msb_done ?
+														pe_latency_ab_msb : pe_latency_ab_msb_next;
+			endcase
+
+
+		//
+		// Adder
+		//
+	reg				pe_add_ce;
+	reg	[31: 0]	pe_add_a0;
+	reg	[31: 0]	pe_add_a1;
+	reg	[31: 0]	pe_add_a2;
+	reg	[31: 0]	pe_add_b0;
+
+	reg				pe_add_c_in;
+	wire	[31: 0]	pe_add_s;
+	wire				pe_add_c_out;
+
+	reg				pe_sub_ce;
+	reg	[31: 0]	pe_sub_a0;
+	reg	[31: 0]	pe_sub_b0;
+
+	reg				pe_sub_b_in;
+	wire	[31: 0]	pe_sub_d;
+	wire				pe_sub_b_out;
+	
+	always @(posedge clk)
+		pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
+
+	always @(posedge clk)
+		pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);
+
+	always @(posedge clk)
+		//
+		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
+			pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
+
+	always @(posedge clk)
+		//
+		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
+			pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
+	
+	
+	modexpa7_pe_add pe_add_inst
+	(
+		.clk		(clk),
+		.ce		(pe_add_ce),
+		.a			(pe_add_a2),
+		.b			(pe_add_b0),
+		.c_in		(pe_add_c_in),
+		.s			(pe_add_s),
+		.c_out	(pe_add_c_out)
+	);
+
+	modexpa7_pe_sub pe_sub_inst
+	(
+		.clk		(clk),
+		.ce		(pe_sub_ce),
+		.a			(pe_sub_a0),
+		.b			(pe_sub_b0),
+		.b_in		(pe_sub_b_in),
+		.d			(pe_sub_d),
+		.b_out	(pe_sub_b_out)
+	);
+	
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
+			pe_add_a0 <= mul_ab_p[0];
+			pe_add_a1 <= pe_add_a0;
+			pe_add_a2 <= pe_add_a1;
+		end
+
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			pe_sub_a0 <= pe_add_s;
+
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			pe_add_b0 <= mul_qn_p[0];
+	
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
+	
+	
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_INIT_ZERO_ADDR:	n_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_INIT_NEXT_ADDR:	n_bram_addr_reg <= n_bram_addr_next;
+			FSM_STATE_PIPE_RELOAD: begin
+				if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
+				if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
+			end
+		endcase
+		
+		
+		//
+		// Ready Flag: dropped when a start trigger is seen in IDLE, raised again in STOP
+		//
+	reg	rdy_reg = 1'b1;
+
+	always @(posedge clk or negedge rst_n)
+		if (!rst_n)
+			rdy_reg <= 1'b1;								// asynchronous reset: report ready
+		else case (fsm_state)
+			FSM_STATE_IDLE:	rdy_reg <= ~ena_trig;	// stays high until the start trigger fires
+			FSM_STATE_STOP:	rdy_reg <= 1'b1;			// operation finished
+		endcase
+	assign rdy = rdy_reg;
+	
+
+		//
+		// Operand feed for the q and qn multiplier rows (mul_q_a / mul_qn_a drive the .a inputs of mul_q_inst / mul_qn_inst)
+		//
+	always @(posedge clk)
+		// Capture the word from element 0 of the ab row when the latency timer has expired and the delayed systolic counter is zero.
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			mul_q_a_int <= mul_ab_p[0];
+
+	always @(posedge clk)
+		// Same capture condition, taking the word from element 0 of the q row.
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			mul_qn_a_int <= mul_q_p[0];
+
+	always @(posedge clk)
+		// The captured word becomes the live operand only in PIPE_RELOAD, so it stays constant during a PIPE_CRUNCH pass.
+		if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			mul_q_a <= mul_q_a_int;	// TODO: Add masking! Maybe not needed after all?..
+
+	always @(posedge clk)
+		// As above, but the operand is forced to zero once mult_cnt_qn has reached mult_cnt_half.
+		if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
+	
+		//
+		// Debug
+		//
+	//always @(posedge clk) begin
+		//
+		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			//$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
+		//
+		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			//$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
+		//
+		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			//$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
+		//
+		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			//$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
+		//
+	//end
+		
+		
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_rd;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
+	reg										s_bram_en;
+	
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_rd;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
+	reg										sn_bram_en;
+	
+	assign s_bram_addr_rd = s_bram_addr;
+	assign sn_bram_addr_rd = s_bram_addr;
+	
+	wire	[31: 0]	s_bram_din;
+	wire	[31: 0]	s_bram_dout;
+	
+	wire	[31: 0]	sn_bram_din;
+	wire	[31: 0]	sn_bram_dout;
+	
+	assign s_bram_din = pe_add_s;
+	assign sn_bram_din = pe_sub_d;
+	
+	always @(posedge clk)
+		//
+		s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
+
+	always @(posedge clk)
+		//
+		sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
+	
+	always @(posedge clk) begin
+		//
+		if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
+		if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
+	end
+
+	always @(posedge clk) begin
+		//
+		if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
+		if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
+	end
+	
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_s (.clk(clk),
+		.a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
+		.b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_sn (.clk(clk),
+		.a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
+		.b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
+		
+		
+	reg	r_bram_en;
+	
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_ZERO_ADDR,
+			FSM_STATE_SAVE_NEXT_ADDR:	r_bram_en <= 1'b1;
+			default:							r_bram_en <= 1'b0;
+			
+		endcase
+		
+		
+		
+	reg	r_bram_wr_reg;
+	
+	assign r_bram_wr = r_bram_wr_reg;
+	
+	always @(posedge clk)
+		//
+		r_bram_wr_reg <= r_bram_en;
+		
+		
+	wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;
+		
+		
+	reg	[31: 0]	r_bram_in_reg;
+	
+	assign r_bram_in = r_bram_in_reg;
+
+		always @(posedge clk)
+			//
+			if (r_bram_en)
+				r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
+	
+	always @(posedge clk)
+		//
+		if (r_bram_en)
+			r_bram_addr_reg <= s_bram_addr_dly;
+	
+	
+		//
+		// FSM Transition Logic: state register plus combinational next-state decode
+		//
+	always @(posedge clk or negedge rst_n)
+		// State register; rst_n low asynchronously forces IDLE.
+		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
+		else						fsm_state <= fsm_next_state;
+	
+	always @* begin
+		// Fallback for any state value that is not listed in the case below.
+		fsm_next_state = FSM_STATE_STOP;
+		//
+		case (fsm_state)
+			// IDLE: wait for the one-cycle start trigger.
+			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
+												else							fsm_next_state = FSM_STATE_IDLE;
+			// INIT_*: stay in INIT_NEXT_ADDR until b_bram_addr equals bram_addr_last.
+			FSM_STATE_INIT_ZERO_ADDR:									fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+			
+			FSM_STATE_INIT_NEXT_ADDR:	if (b_bram_addr_done)	fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
+												else							fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+												
+			FSM_STATE_INIT_LAST_ADDR:									fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+			// PIPE_CRUNCH: hold until syst_cnt is at its last value and the msb latency timer has expired.
+			FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)		fsm_next_state = pe_latency_ab_msb_done ?
+																					FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
+												else							fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+			// PIPE_RELOAD: single-cycle state between passes; the loop ends once mult_cnt_s equals mult_cnt_last.
+			FSM_STATE_PIPE_RELOAD:		if (mult_cnt_s_done)		fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
+												else							fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+			// SAVE_*: stay in SAVE_NEXT_ADDR until s_bram_addr equals bram_addr_last.
+			FSM_STATE_SAVE_ZERO_ADDR:									fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+			
+			FSM_STATE_SAVE_NEXT_ADDR:	if (s_bram_addr_done)	fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
+												else							fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+			
+			FSM_STATE_SAVE_LAST_ADDR:									fsm_next_state = FSM_STATE_STOP;
+			// STOP: lasts one cycle, then the FSM returns to IDLE.
+			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
+			
+		endcase
+	end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================
diff --git a/src/rtl/util/bram_1rw_1ro_readfirst.v b/src/rtl/util/bram_1rw_1ro_readfirst.v
new file mode 100644
index 0000000..56cb24e
--- /dev/null
+++ b/src/rtl/util/bram_1rw_1ro_readfirst.v
@@ -0,0 +1,88 @@
+//======================================================================
+//
+// Copyright (c) 2015, 2017 NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_1ro_readfirst
+  #(parameter MEM_WIDTH            = 32,   // bits per stored word
+    parameter MEM_ADDR_BITS        = 8)    // address width; the array holds 2**MEM_ADDR_BITS words
+   (
+    input wire                     clk,    // both ports are clocked on the rising edge
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr, // port A address (used for both the read and the write)
+    input wire                     a_wr,   // port A write enable
+    input wire [MEM_WIDTH-1:0]     a_in,   // port A write data
+    output wire [MEM_WIDTH-1:0]    a_out,  // port A registered read data
+
+    input wire [MEM_ADDR_BITS-1:0] b_addr, // port B address
+    output wire [MEM_WIDTH-1:0]    b_out   // port B registered read data
+    );
+
+   // Two-port memory: port A reads and optionally writes, port B only reads.
+   // Read data on both ports appears one clock after the address is presented.
+   //
+   // BRAM storage array; the attribute below is a synthesis hint requesting block RAM.
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0]             bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+   //
+   // Output Registers
+   //
+   reg [MEM_WIDTH-1:0]             bram_reg_a;
+   reg [MEM_WIDTH-1:0]             bram_reg_b;
+
+   assign a_out = bram_reg_a;
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Write Port A
+   //
+   always @(posedge clk) begin
+      // The read samples the array before the nonblocking write below takes effect ...
+      bram_reg_a <= bram[a_addr];
+      // ... so on a write cycle a_out receives the OLD contents of a_addr ("read first").
+      if (a_wr) bram[a_addr] <= a_in;
+      //
+   end
+
+
+   //
+   // Read-Only Port B
+   //
+   always @(posedge clk)
+     // A port A write on the same edge is not yet visible here; b_out gets the previous contents.
+     bram_reg_b <= bram[b_addr];
+
+
+endmodule
diff --git a/src/rtl/util/bram_1rw_readfirst.v b/src/rtl/util/bram_1rw_readfirst.v
new file mode 100644
index 0000000..30ecae8
--- /dev/null
+++ b/src/rtl/util/bram_1rw_readfirst.v
@@ -0,0 +1,75 @@
+//======================================================================
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_readfirst
+  #(parameter MEM_WIDTH            = 32,
+    parameter MEM_ADDR_BITS        = 8)
+   (
+    input wire                     clk,
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr,
+    input wire                     a_wr,
+    input wire [MEM_WIDTH-1:0]     a_in,
+    output wire [MEM_WIDTH-1:0]    a_out
+    );
+
+   // Single-port synchronous memory with a registered read output; a
+   // write returns the word stored before the write (read-first).
+
+   // Word count implied by the address width.
+   localparam MEM_DEPTH = 1 << MEM_ADDR_BITS;
+
+
+   // Storage array; the attribute asks the tools for block RAM.
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0]             bram[0:MEM_DEPTH-1];
+
+
+   // Registered read data.
+   reg [MEM_WIDTH-1:0]             a_out_reg;
+
+
+   // The non-blocking assignments sample the old array contents, so the
+   // read result never reflects a write made on the same clock edge.
+   always @(posedge clk) begin
+      if (a_wr)
+        bram[a_addr] <= a_in;
+      a_out_reg <= bram[a_addr];
+   end
+
+
+   // Drive the output port from the read register.
+   assign a_out = a_out_reg;
+
+endmodule
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
new file mode 100644
index 0000000..3cbb8d1
--- /dev/null
+++ b/src/tb/tb_systolic_multiplier.v
@@ -0,0 +1,545 @@
+//======================================================================
+//
+// tb_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Testbench for systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module tb_systolic_multiplier;
+
+	
+		//
+		// Test Vectors: expected to supply the M_*, N_*, N_COEFF_*, FACTOR_* and
+		// COEFF_* constants; the include directive takes no trailing semicolon
+	`include "../modexp_fpga_model_vectors.v"
+	
+	
+		//
+		// Parameters: operand sizes expressed in 32-bit words
+		//
+	localparam NUM_WORDS_384 = 384 / 32;
+	localparam NUM_WORDS_512 = 512 / 32;
+	
+	
+		//
+		// Model Settings: number of multiplications run by each test task
+		//
+	localparam NUM_ROUNDS = 10;
+	
+	
+		//
+		// Clock (100 MHz): toggles every 5 ns starting from 0, so rising
+		// edges fall at 5, 15, 25 ns, ...
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+	
+	
+		//
+		// Inputs (driven by the test script and tasks into the core)
+		//
+	reg				rst_n;
+	reg				ena;
+	
+	reg	[ 3: 0]	n_num_words;	// tasks load 11 / 15 here, presumably word count minus one
+
+
+		//
+		// Outputs (driven by the core, polled by the test tasks)
+		//
+	wire	rdy;
+
+
+		//
+		// Integers (w is the word counter shared by the memory tasks)
+		//
+	integer w;
+	
+	
+		//
+		// BRAM Interfaces: core_* signals attach the core to the memories,
+		// tb_* signals are the testbench side of the same memories
+	wire	[ 3: 0]	core_a_addr;
+	wire	[ 3: 0]	core_b_addr;
+	wire	[ 3: 0]	core_n_addr;
+	wire	[ 3: 0]	core_n_coeff_addr;
+	wire	[ 3: 0]	core_r_addr;
+	
+	wire	[31: 0]	core_a_data;
+	wire	[31: 0]	core_b_data;
+	wire	[31: 0]	core_n_data;
+	wire	[31: 0]	core_n_coeff_data;
+	wire	[31: 0]	core_r_data;
+
+	wire				core_r_wren;
+
+	reg	[ 3: 0]	tb_abn_addr;	// one address shared by the A, B, N and N_COEFF write ports
+	reg	[ 3: 0]	tb_r_addr;
+
+	reg	[31:0]	tb_a_data;
+	reg	[31:0]	tb_b_data;
+	reg	[31:0]	tb_n_data;
+	reg	[31:0]	tb_n_coeff_data;
+	wire	[31:0]	tb_r_data;
+	
+	reg				tb_abn_wren;	// one write enable shared by the same four memories
+	
+
+		// BRAMs: the four operand memories are written by the testbench on
+		// port A and read by the core on port B; the result memory is written
+		// by the core on port A and read back by the testbench on port B.
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4))
+	bram_a (.clk(clk),
+		.a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_a_data), .a_out(),
+		.b_addr(core_a_addr), .b_out(core_a_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4))
+	bram_b (.clk(clk),
+		.a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_b_data), .a_out(),
+		.b_addr(core_b_addr), .b_out(core_b_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4))
+	bram_n (.clk(clk),
+		.a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_n_data), .a_out(),
+		.b_addr(core_n_addr), .b_out(core_n_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4))
+	bram_n_coeff (.clk(clk),
+		.a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_n_coeff_data), .a_out(),
+		.b_addr(core_n_coeff_addr), .b_out(core_n_coeff_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4))
+	bram_r (.clk(clk),
+		.a_addr(core_r_addr), .a_wr(core_r_wren), .a_in(core_r_data), .a_out(),
+		.b_addr(tb_r_addr), .b_out(tb_r_data));
+		
+
+		//
+		// UUT: the multiplier core, wired to the five memories of this testbench
+		//
+	modexpa7_systolic_multiplier #
+	(
+		.OPERAND_ADDR_WIDTH		(4),	// 32 * (2**4) = 512-bit operands
+		.SYSTOLIC_ARRAY_POWER	(2)	// 2 ** 2 = 4-tap array
+	)
+	uut
+	(
+		.clk						(clk), 
+		.rst_n					(rst_n), 
+			// start pulse from the test tasks and the flag they poll for completion
+		.ena						(ena), 
+		.rdy						(rdy), 
+			// word addresses the core presents to each memory
+		.a_bram_addr			(core_a_addr), 
+		.b_bram_addr			(core_b_addr), 
+		.n_bram_addr			(core_n_addr), 
+		.n_coeff_bram_addr	(core_n_coeff_addr), 
+		.r_bram_addr			(core_r_addr), 
+			// data words delivered by the operand memories
+		.a_bram_out				(core_a_data), 
+		.b_bram_out				(core_b_data), 
+		.n_bram_out				(core_n_data), 
+		.n_coeff_bram_out		(core_n_coeff_data), 
+			// result word and write strobe going to the result memory
+		.r_bram_in				(core_r_data), 
+		.r_bram_wr				(core_r_wren), 
+			// operand size selector set by the test tasks
+		.n_num_words			(n_num_words)
+	);
+
+
+		//
+		// Script: reset the core, run both test tasks, then end the simulation
+		//
+	initial begin
+
+		rst_n       = 1'b0;		// rst_n is driven low for the first 200 ns
+		ena         = 1'b0;
+		tb_abn_wren = 1'b0;		// keep the operand write enables out of X
+		#200;
+		rst_n = 1'b1;
+		#100;						// settle before the first task starts
+		
+		test_systolic_multiplier_384(M_384, N_384, N_COEFF_384, FACTOR_384, COEFF_384);
+		test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512);
+		$finish;						// the free-running clock would otherwise never let the run end
+	end
+      
+		
+		//
+		// Test Task: 384-bit operands
+		//
+		// Runs NUM_ROUNDS multiplications on the core and checks each result
+		// against a reference computed here with wide Verilog arithmetic.
+		//   m, factor, coeff - combined into the starting value of a and b
+		//   n                - modulus used for every reference reduction
+		//   n_coeff          - loaded into the N_COEFF memory for the core
+	task test_systolic_multiplier_384;
+	
+		input	[383:0] m;
+		input	[383:0] n;
+		input	[383:0] n_coeff;
+		input	[383:0] factor;
+		input	[383:0] coeff;
+		
+		reg	[767:0] m_factor_full;					// double width, holds unreduced products
+		reg	[383:0] m_factor_modulo;
+		reg	[383:0] a;
+		reg	[383:0] b;
+		reg	[383:0] r;
+		reg	[767:0] ab_full;							// double width, holds unreduced products
+		reg	[383:0] ab_modulo;
+		
+		integer			round;
+		integer			num_passed;
+		integer			num_failed;
+		integer			ticks_left;						// watchdog budget, counted in 10 ns steps
+	
+		begin
+			m_factor_full = m * factor;					// m * factor
+			m_factor_modulo = m_factor_full % n;		// m * factor % n
+			m_factor_full = m_factor_modulo * coeff;	// m * factor * coeff
+			m_factor_modulo = m_factor_full % n;		// m * factor * coeff % n
+			
+			a = m_factor_modulo;								// start with a = m_factor...
+			b = m_factor_modulo;								// ... and b = m_factor
+
+			n_num_words = NUM_WORDS_384 - 1;				// same value as before (11), tied to the word count
+			num_passed = 0;									// nothing tested so far
+			num_failed = 0;									//
+		
+			for (round=0; round<NUM_ROUNDS; round=round+1) begin
+					// obtain reference value of product
+				ab_full  			= a * b;						// calculate product
+				ab_modulo			= ab_full % n;				// reduce
+				ab_full				= ab_modulo * coeff;		// take extra coefficient into account
+				ab_modulo			= ab_full % n;				// reduce again
+
+				write_memories_384(a, b, n, n_coeff);		// fill memories
+				ena = 1;												// start operation
+				#10;													//
+				ena = 0;												// clear flag
+			
+					// wait for rdy, but at most 1000000 steps so a stuck core cannot hang the run
+				ticks_left = 1000000;
+				while (!rdy && (ticks_left > 0)) begin
+					#10;
+					ticks_left = ticks_left - 1;
+				end
+				if (!rdy) $display("    ERROR: timed out waiting for rdy");
+
+				read_memory_384(r);								// get result from memory
+				$display("test_systolic_multiplier_384(): round #%0d of %0d", round+1, NUM_ROUNDS);
+				$display("    calculated: %x", r);
+				$display("    expected:   %x", ab_modulo);
+								
+					// check calculated value (=== so that X bits in r count as a mismatch)
+				if (r === ab_modulo) begin
+					$display("        OK");
+					num_passed = num_passed + 1;
+				end else begin
+					$display("        ERROR");
+					num_failed = num_failed + 1;
+				end
+
+				b = ab_modulo;										// prepare for next round
+			end		
+		
+				// final step, display results
+			if (num_passed == NUM_ROUNDS)
+				$display("SUCCESS: All tests passed.");
+			else
+				$display("FAILURE: %0d test(s) not passed.", num_failed);
+		end
+	endtask
+
+
+		//
+		// Test Task: 512-bit operands
+		//
+		// Runs NUM_ROUNDS multiplications on the core and checks each result
+		// against a reference computed here with wide Verilog arithmetic.
+		//   m, factor, coeff - combined into the starting value of a and b
+		//   n                - modulus used for every reference reduction
+		//   n_coeff          - loaded into the N_COEFF memory for the core
+	task test_systolic_multiplier_512;
+	
+		input	[ 511:0] m;
+		input	[ 511:0] n;
+		input	[ 511:0] n_coeff;
+		input	[ 511:0] factor;
+		input	[ 511:0] coeff;
+		
+		reg	[1023:0] m_factor_full;					// double width, holds unreduced products
+		reg	[ 511:0] m_factor_modulo;
+		reg	[ 511:0] a;
+		reg	[ 511:0] b;
+		reg	[ 511:0] r;
+		reg	[1023:0] ab_full;							// double width, holds unreduced products
+		reg	[ 511:0] ab_modulo;
+		
+		integer			round;
+		integer			num_passed;
+		integer			num_failed;
+		integer			ticks_left;						// watchdog budget, counted in 10 ns steps
+	
+		begin
+			m_factor_full = m * factor;					// m * factor
+			m_factor_modulo = m_factor_full % n;		// m * factor % n
+			m_factor_full = m_factor_modulo * coeff;	// m * factor * coeff
+			m_factor_modulo = m_factor_full % n;		// m * factor * coeff % n
+			
+			a = m_factor_modulo;								// start with a = m_factor...
+			b = m_factor_modulo;								// ... and b = m_factor
+
+			n_num_words = NUM_WORDS_512 - 1;				// same value as before (15), tied to the word count
+			num_passed = 0;									// nothing tested so far
+			num_failed = 0;									//
+		
+			for (round=0; round<NUM_ROUNDS; round=round+1) begin
+					// obtain reference value of product
+				ab_full  			= a * b;						// calculate product
+				ab_modulo			= ab_full % n;				// reduce
+				ab_full				= ab_modulo * coeff;		// take extra coefficient into account
+				ab_modulo			= ab_full % n;				// reduce again
+
+				write_memories_512(a, b, n, n_coeff);		// fill memories
+				ena = 1;												// start operation
+				#10;													//
+				ena = 0;												// clear flag
+			
+					// wait for rdy, but at most 1000000 steps so a stuck core cannot hang the run
+				ticks_left = 1000000;
+				while (!rdy && (ticks_left > 0)) begin
+					#10;
+					ticks_left = ticks_left - 1;
+				end
+				if (!rdy) $display("    ERROR: timed out waiting for rdy");
+
+				read_memory_512(r);								// get result from memory
+				$display("test_systolic_multiplier_512(): round #%0d of %0d", round+1, NUM_ROUNDS);
+				$display("    calculated: %x", r);
+				$display("    expected:   %x", ab_modulo);
+								
+					// check calculated value (=== so that X bits in r count as a mismatch)
+				if (r === ab_modulo) begin
+					$display("        OK");
+					num_passed = num_passed + 1;
+				end else begin
+					$display("        ERROR");
+					num_failed = num_failed + 1;
+				end
+
+				b = ab_modulo;										// prepare for next round
+			end		
+		
+				// final step, display results
+			if (num_passed == NUM_ROUNDS)
+				$display("SUCCESS: All tests passed.");
+			else
+				$display("FAILURE: %0d test(s) not passed.", num_failed);
+		end
+	endtask
+	
+	
+		//
+		// BRAM Writer: 384-bit operands
+		//
+		// Presents a, b, n and n_coeff to the operand memories one 32-bit word
+		// at a time, least significant word at address 0. Every word is held
+		// for 10 ns while the shared write enable is high. The module-level
+		// integer w serves as the word counter.
+		//   a, b, n, n_coeff - values for the A, B, N and N_COEFF memories
+		//
+	task write_memories_384;
+
+		input	[383:0] a;
+		input	[383:0] b;
+		input	[383:0] n;
+		input	[383:0] n_coeff;
+		
+		begin
+			
+				// open the write ports of all four memories
+			tb_abn_wren = 1;
+			
+				// one iteration per word; w doubles as the memory address
+			for (w=0; w<NUM_WORDS_384; w=w+1) begin
+				
+				tb_abn_addr = w[3:0];
+				
+					// slice word number w out of every operand
+				tb_a_data       = a[32*w +: 32];
+				tb_b_data       = b[32*w +: 32];
+				tb_n_data       = n[32*w +: 32];
+				tb_n_coeff_data = n_coeff[32*w +: 32];
+				
+					// keep address and data stable for one 10 ns step
+				#10;
+				
+			end
+			
+				// close the write ports again
+			tb_abn_wren = 0;
+			
+				// leave address and data undefined so that stale values
+				// cannot be mistaken for valid ones
+			tb_abn_addr     = 4'hX;
+			
+			tb_a_data       = 32'hX;
+			tb_b_data       = 32'hX;
+			tb_n_data       = 32'hX;
+			tb_n_coeff_data = 32'hX;
+		
+		end
+		
+	endtask
+
+
+		
+		
+		//
+		// BRAM Writer: 512-bit operands
+		//
+		// Presents a, b, n and n_coeff to the operand memories one 32-bit word
+		// at a time, least significant word at address 0. Every word is held
+		// for 10 ns while the shared write enable is high. The module-level
+		// integer w serves as the word counter.
+		//   a, b, n, n_coeff - values for the A, B, N and N_COEFF memories
+		//
+	task write_memories_512;
+
+		input	[511:0] a;
+		input	[511:0] b;
+		input	[511:0] n;
+		input	[511:0] n_coeff;
+		
+		begin
+			
+				// open the write ports of all four memories
+			tb_abn_wren = 1;
+			
+				// one iteration per word; w doubles as the memory address
+			for (w=0; w<NUM_WORDS_512; w=w+1) begin
+				
+				tb_abn_addr = w[3:0];
+				
+					// slice word number w out of every operand
+				tb_a_data       = a[32*w +: 32];
+				tb_b_data       = b[32*w +: 32];
+				tb_n_data       = n[32*w +: 32];
+				tb_n_coeff_data = n_coeff[32*w +: 32];
+				
+					// keep address and data stable for one 10 ns step
+				#10;
+				
+			end
+			
+				// close the write ports again
+			tb_abn_wren = 0;
+			
+				// leave address and data undefined so that stale values
+				// cannot be mistaken for valid ones
+			tb_abn_addr     = 4'hX;
+			
+			tb_a_data       = 32'hX;
+			tb_b_data       = 32'hX;
+			tb_n_data       = 32'hX;
+			tb_n_coeff_data = 32'hX;
+		
+		end
+		
+	endtask
+
+
+	
+
+		//
+		// BRAM Reader: 384-bit result. Steps tb_r_addr through the result
+		// memory and assembles r with the word from address 0 in bits 31:0.
+	task read_memory_384;
+
+		output	[383:0] r;
+		reg		[383:0] r_words;						// result being assembled
+		
+		begin
+			
+			for (w=0; w<NUM_WORDS_384; w=w+1) begin
+				
+				tb_r_addr = w[3:0];						// select word w
+				#10;											// wait for 1 clock tick
+				r_words[32*w +: 32] = tb_r_data;		// place it in slot w
+
+			end				
+		
+			r = r_words;									// hand the result back
+			tb_r_addr = 4'hX;								// wipe address
+
+		end		
+		
+	endtask
+
+
+		//
+		// BRAM Reader: 512-bit result. Steps tb_r_addr through the result
+		// memory and assembles r with the word from address 0 in bits 31:0.
+	task read_memory_512;
+
+		output	[511:0] r;
+		reg		[511:0] r_words;						// result being assembled
+		
+		begin
+			
+			for (w=0; w<NUM_WORDS_512; w=w+1) begin
+				
+				tb_r_addr = w[3:0];						// select word w
+				#10;											// wait for 1 clock tick
+				r_words[32*w +: 32] = tb_r_data;		// place it in slot w
+
+			end				
+		
+			r = r_words;									// hand the result back
+			tb_r_addr = 4'hX;								// wipe address
+
+		end		
+		
+	endtask
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================



More information about the Commits mailing list